Coverage for portality/crosswalks/article_doaj_xml.py: 88%

168 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-07-22 15:59 +0100

1from portality.core import app 

2from lxml import etree 

3import re 

4from portality.bll import exceptions 

5from portality.crosswalks.exceptions import CrosswalkException 

6from portality import models 

7from datetime import datetime 

8 

9from portality.ui.messages import Messages 

10 

11 

class DOAJXWalk(object):
    """
    ~~DOAJArticleXML:Crosswalk->DOAJArticleXML:Feature~~

    Validates DOAJ-format article XML against the configured schema and
    converts each ``<record>`` element into a ``models.Article``.

    The schema location is read once, at class definition time, from
    ``app.config["SCHEMAS"]["doaj"]``.
    """
    format_name = "doaj"
    schema_path = app.config.get("SCHEMAS", {}).get("doaj")

    def __init__(self):
        """
        Load and compile the XML schema so it can be reused across calls.

        :raises exceptions.IngestException: if no schema path is configured, or
            if anything goes wrong while reading/compiling the schema
        """
        # human-readable details of the most recent failed validation
        self.validation_log = ""

        # load the schema into memory for more efficient usage in repeat calls to the crosswalk
        if self.schema_path is None:
            raise exceptions.IngestException(message="Unable to validate for DOAJXWalk, as schema path is not set in config")
        try:
            # ~~->DOAJArticleXML:Schema~~
            with open(self.schema_path) as schema_file:
                schema_doc = etree.parse(schema_file)

            # If we are using a test or dev environment, edit the schema to use local paths
            if app.config.get("DOAJENV") != 'production':
                self._localise_schema(schema_doc)

            self.schema = etree.XMLSchema(schema_doc)
        except Exception as e:
            raise exceptions.IngestException(message="There was an error attempting to load schema from " + self.schema_path, inner=e) from e

    def validate_file(self, file_handle):
        """
        Parse the supplied file and validate it against the schema.

        :param file_handle: anything ``etree.parse`` accepts as a source
        :return: the parsed document, if it is valid
        :raises CrosswalkException: if the file cannot be parsed, or if it
            does not validate against the schema
        """
        # first try to parse the file
        try:
            doc = etree.parse(file_handle)
        except etree.XMLSyntaxError as e:
            # although the treatment is the same as the generic case below, this is pulled
            # out so we remember what the primary kind of exception should be
            raise CrosswalkException(message="Unable to parse XML file", inner=e) from e
        except UnicodeDecodeError as e:
            msg = 'Text decode failed, expected utf-8 encoded XML.'
            raise CrosswalkException(message='Unable to parse XML file', inner=e, inner_message=msg) from e
        except Exception as e:
            raise CrosswalkException(message="Unable to parse XML file", inner=e) from e

        # then pass the doc to the validator
        valid = self.validate(doc)

        if not valid:
            msg = "Validation message from schema '{x}': {y}\n".format(x=DOAJXWalk.format_name, y=self.validation_log)
            raise CrosswalkException(message="Unable to validate document with identified schema", inner_message=msg)

        return doc

    def validate(self, doc):
        """
        Validate an already-parsed document against the schema.

        On failure the schema's error log is stored in ``self.validation_log``;
        on success ``self.validation_log`` is left untouched.

        :param doc: the parsed XML document
        :return: the result of ``self.schema.validate(doc)``
        """
        valid = self.schema.validate(doc)
        if not valid:
            el = repr(self.schema.error_log)
            # strip the filename, as we don't want to leak the path to the UI
            # (raw string so that \d is a regex class rather than an invalid string escape)
            match = re.search(r"[\da-f]{32}.xml:(.*)", el)
            if match is not None:
                el = match.group(1)
            self.validation_log = el
        return valid

    def crosswalk_file(self, file_handle, add_journal_info=True):
        """
        Validate the supplied file and crosswalk every record it contains.

        :param file_handle: passed to ``validate_file``
        :param add_journal_info: passed through to ``crosswalk_doc``
        :return: list of articles, as returned by ``crosswalk_doc``
        :raises CrosswalkException: if parsing or validation fails
        """
        doc = self.validate_file(file_handle)
        return self.crosswalk_doc(doc, add_journal_info=add_journal_info)

    def crosswalk_doc(self, doc, add_journal_info=True):
        """
        Crosswalk each ``record`` element directly under the document root.

        :param doc: the parsed XML document
        :param add_journal_info: passed through to ``crosswalk_article``
        :return: list with one article per record, in document order
        """
        # go through the records in the doc and crosswalk each one individually
        root = doc.getroot()
        return [
            self.crosswalk_article(record, add_journal_info=add_journal_info)
            for record in root.findall("record")
        ]

    def crosswalk_article(self, record, add_journal_info=True):
        """
        Build a ``models.Article`` from a single ``<record>`` element.

        Example record:
        <record>
            <language>eng</language>
            <publisher>Co-Action Publishing</publisher>
            <journalTitle>Tellus A</journalTitle>
            <issn>0280-6495</issn>
            <eissn>1600-0870</eissn>
            <publicationDate>2014-02-05</publicationDate>
            <volume>66</volume>
            <issue>0</issue>
            <startPage>1</startPage>
            <endPage>18</endPage>
            <doi>10.3402/tellusa.v66.21390</doi>
            <publisherRecordId>21390</publisherRecordId>
            <documentType>Original</documentType>
            <title language="eng">LakeMIP Kivu...</title>
            <authors>
                <author>
                    <name>WIM Thiery</name>
                    <affiliationId>1</affiliationId>
                    <orcid_id>https://orcid.org/0001-1234-1234</orcid_id>
                </author>
            </authors>
            <affiliationsList>
                <affiliationName affiliationId="1">
                    Department of Earth and Environmental Sciences...</affiliationName>
            </affiliationsList>

            <abstract language="eng">The African great...</abstract>
            <fullTextUrl format="pdf">http://www.tellusa.net/index.php/tellusa/article/download/21390/pdf_1</fullTextUrl>
            <keywords language="eng">
                <keyword>lake modelling</keyword>
            </keywords>
        </record>

        :param record: the ``<record>`` element to convert
        :param add_journal_info: if truthy, ``article.add_journal_metadata()``
            is called before the article is returned
        :return: the populated article
        :raises CrosswalkException: if both issn and eissn are present and are
            equal ignoring case
        """
        article = models.Article()  # ~~->Article:Model~~
        bibjson = article.bibjson()

        # language
        lang = _element(record, "language")
        if lang is not None:
            bibjson.journal_language = lang

        # publisher
        pub = _element(record, "publisher")
        if pub is not None:
            bibjson.publisher = pub

        # journal title
        jt = _element(record, "journalTitle")
        if jt is not None:
            bibjson.journal_title = jt

        # p-issn
        pissn = _element(record, "issn")
        if pissn is not None:
            bibjson.add_identifier(bibjson.P_ISSN, pissn.upper())

        # e-issn
        eissn = _element(record, "eissn")
        if eissn is not None:
            bibjson.add_identifier(bibjson.E_ISSN, eissn.upper())

        # the two ISSNs must not be the same value
        if pissn is not None and eissn is not None:
            if pissn.upper() == eissn.upper():
                raise CrosswalkException(
                    message=Messages.EXCEPTION_IDENTICAL_PISSN_AND_EISSN.format(value=pissn))

        # publication date
        pd = _element(record, "publicationDate")
        if pd is not None:
            y, m = _year_month(pd)
            if y is not None:
                bibjson.year = y
            if m is not None:
                bibjson.month = m

        # volume
        vol = _element(record, "volume")
        if vol is not None:
            bibjson.volume = vol

        # issue
        iss = _element(record, "issue")
        if iss is not None:
            bibjson.number = iss

        # start page
        sp = _element(record, "startPage")
        if sp is not None:
            bibjson.start_page = sp

        # end page
        ep = _element(record, "endPage")
        if ep is not None:
            bibjson.end_page = ep

        # doi
        doi = _element(record, "doi")
        if doi is not None:
            bibjson.add_identifier(bibjson.DOI, doi)

        # publisher record id
        pri = _element(record, "publisherRecordId")
        if pri is not None:
            article.set_publisher_record_id(pri)

        # document type
        # FIXME: outstanding question as to what to do with this; the
        # documentType element is currently not crosswalked at all

        # title
        title = _element(record, "title")
        if title is not None:
            bibjson.title = title

        # authors
        ## first we need to extract the affiliations, keyed by their affiliationId attribute
        affiliations = {}
        affel = record.find("affiliationsList")
        if affel is not None:
            for ael in affel:
                affiliations[ael.get("affiliationId")] = ael.text

        ## now crosswalk each author and dereference their affiliation from the table
        authorsel = record.find("authors")
        if authorsel is not None:
            for ael in authorsel:
                name = _element(ael, "name")
                affid = _element(ael, "affiliationId")
                # authors with no (or an unknown) affiliationId get affiliation None
                aff = affiliations.get(affid)
                orcid = _element(ael, "orcid_id")
                bibjson.add_author(name, affiliation=aff, orcid_id=orcid)

        # abstract
        abstract = _element(record, "abstract")
        if abstract is not None:
            # avoids Elasticsearch exceptions about .exact analyser not being able
            # to handle more than 32766 UTF8 characters
            # NOTE(review): this slice counts characters, not encoded bytes - confirm
            # the limit being avoided is not byte-based
            bibjson.abstract = abstract[:30000]

        # fulltext
        ftel = record.find("fullTextUrl")
        if ftel is not None and ftel.text is not None and ftel.text != "":
            ct = ftel.get("format")
            url = ftel.text
            bibjson.add_url(url, "fulltext", ct)

        # keywords
        keyel = record.find("keywords")
        if keyel is not None:
            for kel in keyel:
                # an empty <keyword/> has text None; skip that (and blank text)
                # rather than recording a null/blank keyword
                if kel.text is not None and kel.text.strip() != "":
                    bibjson.add_keyword(kel.text)

        # add the journal info if requested
        if add_journal_info:
            article.add_journal_metadata()

        return article

    @staticmethod
    def _localise_schema(schema_doc):
        """
        Edit the DOAJ Article schema in-memory to use local paths

        Points the ``xs:import`` whose schemaLocation contains
        'iso_639-2b.xsd' at './iso_639-2b.xsd'.

        :param schema_doc: the parsed schema document; modified in place
        :return: the same ``schema_doc``
        :raises IndexError: if the schema has no matching ``xs:import``
        """
        matches = schema_doc.xpath("xs:import[contains(@schemaLocation, 'iso_639-2b.xsd')]",
                                   namespaces=schema_doc.getroot().nsmap)
        language_list_import = matches.pop()
        language_list_import.attrib['schemaLocation'] = './iso_639-2b.xsd'
        return schema_doc

258 

259############################################################################### 

260# some convenient utilities 

261############################################################################### 

262 

263 

def _year_month(date):
    """
    Extract the year and month from a date string.

    The formats ``%Y-%m-%d``, ``%Y-%m`` and ``%Y`` are tried in that order and
    the first one that parses wins.

    :param date: the date string to interpret
    :return: ``(year, month)`` as ints; ``(year, None)`` when only a year was
        supplied; ``(None, None)`` when no format matches or ``date`` is not a
        string
    """
    # (format, whether that format carries a month)
    candidates = (("%Y-%m-%d", True), ("%Y-%m", True), ("%Y", False))
    for fmt, has_month in candidates:
        try:
            stamp = datetime.strptime(date, fmt)
        except (ValueError, TypeError):
            # ValueError: the text does not match this format
            # TypeError: date is not a string at all (e.g. None)
            continue
        return stamp.year, (stamp.month if has_month else None)
    return None, None

281 

282 

def _element(xml, field):
    """
    Return the stripped text of the first ``field`` element found under ``xml``.

    :param xml: the element to search, via its ``find`` method
    :param field: the path/tag handed to ``xml.find``
    :return: the text with surrounding whitespace removed, or None when the
        element is absent, has no text, or contains only whitespace
    """
    el = xml.find(field)
    if el is None or el.text is None:
        return None
    # strip before testing for emptiness, so whitespace-only content is
    # reported as missing rather than as an empty string
    text = el.text.strip()
    return text if text != "" else None