Coverage for portality/crosswalks/article_crossref_xml.py: 96%

223 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-09-05 21:15 +0100

1from portality.core import app 

2from lxml import etree 

3import re 

4from portality.bll import exceptions 

5from portality.crosswalks.exceptions import CrosswalkException 

6from portality import models 

7from portality.ui.messages import Messages 

8 

9 

10 

class CrossrefXWalk442(object):
    """
    Crosswalk from Crossref 4.4.2 deposit XML to Article model objects.

    ~~Crossref442XML:Crosswalk->Crossref442:Feature~~
    """
    format_name = "crossref442"
    NS = {'x': 'http://www.crossref.org/schema/4.4.2', 'j': 'http://www.ncbi.nlm.nih.gov/JATS1'}

    """
    Example record:
    <doi_batch version="4.4.2" xmlns="http://www.crossref.org/schema/4.4.2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.crossref.org/schema/4.3.7 http://www.crossref.org/schema/deposit/crossref4.3.7.xsd">
        <head>
            <doi_batch_id>1dbb27d1030c6c9d9d-7ff0</doi_batch_id>
            <timestamp>200504260247</timestamp>
            <depositor>
                <depositor_name>your name</depositor_name>
                <email_address>your@email.com</email_address>
            </depositor>
            <registrant>WEB-FORM</registrant>
        </head>
        <body>
            <journal>
                <journal_metadata>
                    <full_title>Test Publication</full_title>
                    <abbrev_title>TP</abbrev_title>
                    <issn media_type="print">2073-9813</issn>
                </journal_metadata>
                <journal_issue>
                    <publication_date media_type="print">
                        <month>12</month>
                        <day>1</day>
                        <year>2005</year>
                    </publication_date>
                    <journal_volume>
                        <volume>12</volume>
                    </journal_volume>
                    <issue>1</issue>
                </journal_issue>
                <!-- ====== This is the article's metadata ======== -->
                <journal_article publication_type="full_text">
                    <titles>
                        <title>First Article</title>
                    </titles>
                    <contributors>
                        <person_name sequence="first" contributor_role="author">
                            <given_name>Bob</given_name>
                            <surname>Surname</surname>
                            <ORCID>http://orcid.org/0000-0002-4011-3590</ORCID>
                        </person_name>
                    </contributors>
                    <publication_date media_type="print">
                        <month>12</month>
                        <day>1</day>
                        <year>2004</year>
                    </publication_date>
                    <pages>
                        <first_page>100</first_page>
                        <last_page>200</last_page>
                    </pages>
                    <doi_data>
                        <doi>10.50505/test_20051229930</doi>
                        <resource>http://www.crossref.org/</resource>
                    </doi_data>
                    <!-- ========= Here is the list of references cited in the above article -->
                    <citation_list>
                        <citation key="ref1">
                            <journal_title>Current Opinion in Oncology</journal_title>
                            <author>Chauncey</author>
                            <volume>13</volume>
                            <first_page>21</first_page>
                            <cYear>2001</cYear>
                        </citation>
                        <citation key="ref2">
                            <doi>10.5555/small_md_0001</doi>
                        </citation>
                        <citation key="ref=3">
                            <unstructured_citation>Clow GD, McKay CP, Simmons Jr. GM, and Wharton RA, Jr. 1988. Climatological observations and predicted sublimation rates at Lake Hoare, Antarctica. Journal of Climate 1:715-728.</unstructured_citation>
                        </citation>
                    </citation_list>
                </journal_article>
            </journal>
        </body>
    </doi_batch>
    """

    def __init__(self):
        """
        Look up the schema path for this format and pick up the schema object from the app config.

        :raises exceptions.IngestException: if no schema path is configured under
            SCHEMAS[format_name]
        """
        # local import: only the wait loop below needs it
        import time

        self.validation_log = ""
        self.schema_path = app.config.get("SCHEMAS", {}).get(self.format_name)

        # load the schema into memory for more efficient usage in repeat calls to the crosswalk
        if self.schema_path is None:
            raise exceptions.IngestException(
                message="Unable to validate for " + self.format_name + ", as schema path is not set in config")

        # Wait until the schema object has been placed in the config.  Sleep between polls
        # rather than spinning, so the wait does not monopolise the CPU.
        # NOTE(review): presumably the schema is populated by start-up code elsewhere - confirm
        while app.config["CROSSREF442_SCHEMA"] is None:
            time.sleep(0.1)

        # ~~->CrossrefXML:Schema~~
        self.schema = app.config["CROSSREF442_SCHEMA"]

    def validate_file(self, file_handle):
        """
        Parse the supplied file as XML and validate it against this crosswalk's schema.

        :param file_handle: source passed straight to etree.parse
        :return: the parsed document
        :raises CrosswalkException: if the file cannot be parsed or does not validate
        """
        # first try to parse the file
        try:
            doc = etree.parse(file_handle)
        except etree.XMLSyntaxError as e:  # although the treatment is the same, pulling this out so we remember what the primary kind of exception should be
            raise CrosswalkException(message="Unable to parse XML file", inner=e)
        except UnicodeDecodeError as e:
            msg = 'Text decode failed, expected utf-8 encoded XML.'
            raise CrosswalkException(message='Unable to parse XML file', inner=e, inner_message=msg)
        except Exception as e:
            raise CrosswalkException(message="Unable to parse XML file", inner=e)

        # then pass the doc to the validator
        valid = self.validate(doc)

        if not valid:
            msg = "Validation message from schema '{x}': {y}\n".format(x=self.format_name,
                                                                       y=self.validation_log)
            raise CrosswalkException(message="Unable to validate document with identified schema", inner_message=msg)

        return doc

    def validate(self, doc):
        """
        Validate a parsed document against self.schema.

        On failure the schema's error log is stored in self.validation_log.

        :param doc: parsed document to validate
        :return: the result of self.schema.validate(doc)
        """
        valid = self.schema.validate(doc)
        if not valid:
            el = repr(self.schema.error_log)
            # strip the filename, as we don't want to leak the path to the UI
            # (raw string so that \d is a regex class rather than an invalid string escape)
            rx = r"[\da-f]{32}\.xml:(.*)"
            match = re.search(rx, el)
            if match is not None:
                el = match.group(1)
            self.validation_log = el
        return valid

    def crosswalk_file(self, file_handle, add_journal_info):
        """
        Validate the file and crosswalk every article record in it.

        :param file_handle: source passed to validate_file
        :param add_journal_info: accepted but not used by this implementation
        :return: list of articles as produced by crosswalk_doc
        """
        doc = self.validate_file(file_handle)
        return self.crosswalk_doc(doc)

    def crosswalk_doc(self, doc):
        """
        Crosswalk each journal_article of each journal in the document body.

        :param doc: parsed document
        :return: list of crosswalked articles (empty if the document has no body element)
        """
        # go through the records in the doc and crosswalk each one individually
        articles = []
        body = doc.getroot().find("x:body", self.NS)
        if body is None:
            return articles

        for journal in body.findall("x:journal", self.NS):
            for record in journal.findall("x:journal_article", self.NS):
                articles.append(self.crosswalk_article(record, journal))

        return articles

    def crosswalk_article(self, record, journal):
        """
        Build one Article from a journal_article element and its enclosing journal element.

        :param record: the journal_article element
        :param journal: the journal element that contains the record
        :return: the populated article
        """
        article = models.Article()  # ~~->Article:Model~~
        bibjson = article.bibjson()

        self.extract_journal_title(journal, bibjson)
        self.extract_issns(journal, bibjson)
        self.extract_publication_date(record, journal, bibjson)
        self.extract_volume(journal, bibjson)
        self.extract_issue(journal, bibjson)
        self.extract_pages(record, journal, bibjson)
        self.extract_doi(record, journal, bibjson)
        self.extract_fulltext(record, journal, bibjson)
        self.extract_article_title(record, journal, bibjson)
        self.extract_authors(record, journal, bibjson)
        self.extract_abstract(record, journal, bibjson)

        return article

    ###############################################################################
    ## extractors
    ###############################################################################

    def extract_journal_title(self, journal, bibjson):
        """Copy journal_metadata/full_title to bibjson.journal_title, if present."""
        jm = journal.find("x:journal_metadata", self.NS)
        if jm is not None:
            jt = _element(jm, "x:full_title", self.NS)
            if jt is not None:
                bibjson.journal_title = jt

    def extract_issns(self, journal, bibjson):
        """
        Add the journal's ISSNs (upper-cased) to bibjson as print / electronic identifiers.

        An ISSN without a media_type is treated as electronic when it is the only one; when
        there are two, a missing type is taken to be the opposite of the other ISSN's type
        (electronic then print if neither has one).

        :raises CrosswalkException: for more than two ISSNs, two ISSNs of the same declared
            type, or two ISSNs with the same value
        """
        md = journal.find("x:journal_metadata", self.NS)
        if md is None:
            return

        issns = md.findall("x:issn", self.NS)

        # if more than 2 issns raise the exception
        if len(issns) > 2:
            raise CrosswalkException(message=Messages.EXCEPTION_TOO_MANY_ISSNS)

        if len(issns) == 1:
            media_type = issns[0].get("media_type")
            if media_type is None or media_type == 'electronic':
                bibjson.add_identifier(bibjson.E_ISSN, issns[0].text.upper())
            elif media_type == 'print':
                bibjson.add_identifier(bibjson.P_ISSN, issns[0].text.upper())

        elif len(issns) == 2:
            values = [issn.text.upper() for issn in issns]
            # None marks "no media_type declared"
            types = [issn.get("media_type") for issn in issns]

            # if both issns have the same type - raise the exception
            if types[0] is not None and types[0] == types[1]:
                raise CrosswalkException(
                    message=Messages.EXCEPTION_ISSNS_OF_THE_SAME_TYPE.format(type=types[1]))

            # if both issns have the same value - raise the exception
            if values[0] == values[1]:
                raise CrosswalkException(
                    message=Messages.EXCEPTION_IDENTICAL_PISSN_AND_EISSN.format(value=values[0]))

            # fill in whichever types were not declared
            if not types[0] and not types[1]:
                types = ["electronic", "print"]
            elif not types[1]:
                types[1] = "print" if types[0] == "electronic" else "electronic"
            elif not types[0]:
                types[0] = "print" if types[1] == "electronic" else "electronic"

            for media_type, value in zip(types, values):
                bibjson.add_identifier(bibjson.P_ISSN if media_type == "print" else bibjson.E_ISSN, value)

    def extract_publication_date(self, record, journal, bibjson):
        """
        Copy year and month from the record's first publication_date to bibjson.

        Values that are absent are left unset, in line with the other extractors.
        The journal argument is not used.
        """
        pd = record.find("x:publication_date", self.NS)
        if pd is None:
            return

        year = _element(pd, "x:year", self.NS)
        if year is not None:
            bibjson.year = year

        month = _element(pd, "x:month", self.NS)
        if month is not None:
            bibjson.month = month

    def extract_volume(self, journal, bibjson):
        """Copy journal_issue/journal_volume/volume to bibjson.volume, if present."""
        issue = journal.find("x:journal_issue", self.NS)
        if issue is None:
            return

        vol = issue.find("x:journal_volume", self.NS)
        if vol is not None:
            volume = _element(vol, "x:volume", self.NS)
            if volume is not None:
                bibjson.volume = volume

    def extract_issue(self, journal, bibjson):
        """Copy journal_issue/issue to bibjson.number, if present."""
        issue = journal.find("x:journal_issue", self.NS)
        if issue is not None:
            number = _element(issue, "x:issue", self.NS)
            if number is not None:
                bibjson.number = number

    def extract_pages(self, record, journal, bibjson):
        """
        Copy pages/first_page and pages/last_page to bibjson.start_page / end_page, if present.

        The journal argument is not used.
        """
        pages = record.find('x:pages', self.NS)
        if pages is None:
            return

        # start page
        sp = _element(pages, "x:first_page", self.NS)
        if sp is not None:
            bibjson.start_page = sp

        # end page
        ep = _element(pages, "x:last_page", self.NS)
        if ep is not None:
            bibjson.end_page = ep

    def extract_doi(self, record, journal, bibjson):
        """Add doi_data/doi to bibjson as a DOI identifier, if present. journal is not used."""
        d = record.find("x:doi_data", self.NS)
        if d is not None:
            # doi
            doi = _element(d, "x:doi", self.NS)
            if doi is not None:
                bibjson.add_identifier(bibjson.DOI, doi)

    def extract_fulltext(self, record, journal, bibjson):
        """
        Add doi_data/resource to bibjson as the "fulltext" url, if present.

        A record without a doi_data element is skipped rather than failing, matching
        extract_doi.  The journal argument is not used.
        """
        d = record.find("x:doi_data", self.NS)
        if d is None:
            return

        ftel = _element(d, "x:resource", self.NS)
        if ftel is not None:
            # NOTE(review): the namespace map is passed as add_url's third argument; add_url is
            # not visible here - confirm that this is what it expects
            bibjson.add_url(ftel, "fulltext", self.NS)

    def extract_article_title(self, record, journal, bibjson):
        """Copy titles/title to bibjson.title, if present. journal is not used."""
        titles = record.find('x:titles', self.NS)
        if titles is not None:
            title = _element(titles, "x:title", self.NS)
            if title is not None:
                bibjson.title = title

    def extract_authors(self, record, journal, bibjson):
        """
        Add every person_name whose contributor_role is 'author' to bibjson.

        The name is "<given_name> <surname>" (whichever parts are present); affiliation and
        ORCID are passed as None when absent.  The journal argument is not used.

        :raises CrosswalkException: if the record has no contributors element
        """
        contributors = record.find("x:contributors", self.NS)
        if contributors is None:
            raise CrosswalkException(message=Messages.EXCEPTION_NO_CONTRIBUTORS_FOUND,
                                     inner_message=Messages.EXCEPTION_NO_CONTRIBUTORS_EXPLANATION)

        for ctb in contributors.findall("x:person_name", self.NS):
            if ctb.get("contributor_role") != 'author':
                continue

            surname = _element(ctb, "x:surname", self.NS)
            given_name = _element(ctb, "x:given_name", self.NS)
            # join only the parts we actually have, so a missing surname cannot raise
            name = ' '.join(part for part in (given_name, surname) if part) or None

            affiliation = _element(ctb, "x:affiliation", self.NS) or None
            orcid = _element(ctb, "x:ORCID", self.NS) or None
            bibjson.add_author(name, affiliation, orcid)

    def extract_abstract(self, record, journal, bibjson):
        """
        Set bibjson.abstract to the text content of the record's JATS abstract, if present.

        All text inside the abstract is concatenated in document order, including text that
        follows a nested element.  The journal argument is not used.
        """
        abstract_par = record.find("j:abstract", self.NS)
        if abstract_par is not None:
            # itertext() also yields the text after inline child elements, which reading
            # only each element's .text would drop
            text = "".join(abstract_par.itertext())
            bibjson.abstract = text[:30000]  # avoids Elasticsearch
            # exceptions about .exact analyser not being able to handle
            # more than 32766 UTF8 characters

330 

331############################################################################### 

332## Crossref 5.3.1 Xwalk 

333############################################################################### 

334 

class CrossrefXWalk531(CrossrefXWalk442):
    """
    Crosswalk for Crossref 5.3.1 deposit XML; everything except schema selection and author
    extraction is inherited from the 4.4.2 crosswalk.

    ~~Crossref531XML:Crosswalk->Crossref531:Feature~~
    ~~->Crossref422XML:Crosswalk~~
    """
    format_name = "crossref531"
    NS = {'x': 'http://www.crossref.org/schema/5.3.1', 'j': 'http://www.ncbi.nlm.nih.gov/JATS1'}

    def __init__(self):
        """Run the parent set-up, then replace self.schema with the 5.3.1 schema from the config."""
        # local import: only the wait loop below needs it
        import time

        super(CrossrefXWalk531, self).__init__()

        # Wait until the schema object has been placed in the config.  Sleep between polls
        # rather than spinning, so the wait does not monopolise the CPU.
        while app.config["CROSSREF531_SCHEMA"] is None:
            time.sleep(0.1)

        self.schema = app.config["CROSSREF531_SCHEMA"]

    def extract_authors(self, record, journal, bibjson):
        """
        Add every person_name whose contributor_role is 'author' to bibjson.

        The name is "<given_name> <surname>" (whichever parts are present).  The affiliation
        is read from affiliations/institution/institution_name; affiliation and ORCID are
        passed as None when absent.  The journal argument is not used.

        :raises CrosswalkException: if the record has no contributors element
        """
        contributors = record.find("x:contributors", self.NS)
        if contributors is None:
            raise CrosswalkException(message=Messages.EXCEPTION_NO_CONTRIBUTORS_FOUND,
                                     inner_message=Messages.EXCEPTION_NO_CONTRIBUTORS_EXPLANATION)

        for ctb in contributors.findall("x:person_name", self.NS):
            if ctb.get("contributor_role") != 'author':
                continue

            surname = _element(ctb, "x:surname", self.NS)
            given_name = _element(ctb, "x:given_name", self.NS)
            # join only the parts we actually have, so a missing surname cannot raise
            name = ' '.join(part for part in (given_name, surname) if part) or None

            # only first affiliation is supported even if multiple are provided
            affiliation = None
            affs = ctb.find("x:affiliations", self.NS)
            if affs is not None:
                institution = affs.find("x:institution", self.NS)
                if institution is not None:
                    affiliation = _element(institution, "x:institution_name", self.NS) or None

            orcid = _element(ctb, "x:ORCID", self.NS) or None
            bibjson.add_author(name, affiliation, orcid)

376 

377############################################################################### 

378## some convenient utilities 

379############################################################################### 

380 

def _element(xml, field, namespace):
    """
    Return the inner markup of the first child of ``xml`` matching ``field``, as a string.

    The result is the serialised content between the element's opening and closing tags, so
    any embedded tags are kept as markup.

    :param xml: element to search in
    :param field: path expression passed to ``xml.find``
    :param namespace: prefix-to-URI map passed to ``xml.find``
    :return: the inner content, or None if the element is missing, self-closing or empty
    """
    el = xml.find(field, namespace)
    if el is None:
        return None

    # this converts the entire element to a string, so that we can handle the possibility of
    # embedded html tags, etc.
    # etree.tostring doesn't actually produce a string, but a byte array, so we must specify
    # the encoding and THEN also decode it using that same encoding to get an actual string
    string = etree.tostring(el, encoding="utf-8").decode("utf-8")
    start = string.index(">") + 1
    end = string.rfind('</')
    if end < start:
        # a self-closing element (<tag/>) has no closing tag, and therefore no content
        return None

    text = string[start:end]
    return text if text else None