Coverage for portality/crosswalks/article_crossref_xml.py: 96%
223 statements
« prev ^ index » next coverage.py v6.4.2, created at 2022-09-05 21:15 +0100
« prev ^ index » next coverage.py v6.4.2, created at 2022-09-05 21:15 +0100
1from portality.core import app
2from lxml import etree
3import re
4from portality.bll import exceptions
5from portality.crosswalks.exceptions import CrosswalkException
6from portality import models
7from portality.ui.messages import Messages
class CrossrefXWalk442(object):
    """
    Crosswalk from Crossref 4.4.2 deposit XML to article model objects.

    A file is parsed and validated against the schema held in the application
    config, then every ``journal_article`` inside every ``journal`` of the
    batch body is converted into a ``models.Article``.

    ~~Crossref442XML:Crosswalk->Crossref442:Feature~~
    """
    # key used to look up this crosswalk's schema path in the SCHEMAS config
    format_name = "crossref442"
    # prefix -> namespace map handed to every find()/findall() call:
    # 'x' is the Crossref deposit schema, 'j' is JATS (used for abstracts)
    NS = {'x': 'http://www.crossref.org/schema/4.4.2', 'j': 'http://www.ncbi.nlm.nih.gov/JATS1'}

    """
    Example record:
    <doi_batch version="4.4.2" xmlns="http://www.crossref.org/schema/4.4.2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.crossref.org/schema/4.3.7 http://www.crossref.org/schema/deposit/crossref4.3.7.xsd">
        <head>
            <doi_batch_id>1dbb27d1030c6c9d9d-7ff0</doi_batch_id>
            <timestamp>200504260247</timestamp>
            <depositor>
                <depositor_name>your name</depositor_name>
                <email_address>your@email.com</email_address>
            </depositor>
            <registrant>WEB-FORM</registrant>
        </head>
        <body>
            <journal>
                <journal_metadata>
                    <full_title>Test Publication</full_title>
                    <abbrev_title>TP</abbrev_title>
                    <issn media_type="print">2073-9813</issn>
                </journal_metadata>
                <journal_issue>
                    <publication_date media_type="print">
                        <month>12</month>
                        <day>1</day>
                        <year>2005</year>
                    </publication_date>
                    <journal_volume>
                        <volume>12</volume>
                    </journal_volume>
                    <issue>1</issue>
                </journal_issue>
                <!-- ====== This is the article's metadata ======== -->
                <journal_article publication_type="full_text">
                    <titles>
                        <title>First Article</title>
                    </titles>
                    <contributors>
                        <person_name sequence="first" contributor_role="author">
                            <given_name>Bob</given_name>
                            <surname>Surname</surname>
                            <ORCID>http://orcid.org/0000-0002-4011-3590</ORCID>
                        </person_name>
                    </contributors>
                    <publication_date media_type="print">
                        <month>12</month>
                        <day>1</day>
                        <year>2004</year>
                    </publication_date>
                    <pages>
                        <first_page>100</first_page>
                        <last_page>200</last_page>
                    </pages>
                    <doi_data>
                        <doi>10.50505/test_20051229930</doi>
                        <resource>http://www.crossref.org/</resource>
                    </doi_data>
                    <!-- ========= Here is the list of references cited in the above article -->
                    <citation_list>
                        <citation key="ref1">
                            <journal_title>Current Opinion in Oncology</journal_title>
                            <author>Chauncey</author>
                            <volume>13</volume>
                            <first_page>21</first_page>
                            <cYear>2001</cYear>
                        </citation>
                        <citation key="ref2">
                            <doi>10.5555/small_md_0001</doi>
                        </citation>
                        <citation key="ref=3">
                            <unstructured_citation>Clow GD, McKay CP, Simmons Jr. GM, and Wharton RA, Jr. 1988. Climatological observations and predicted sublimation rates at Lake Hoare, Antarctica. Journal of Climate 1:715-728.</unstructured_citation>
                        </citation>
                    </citation_list>
                </journal_article>
            </journal>
        </body>
    </doi_batch>
    """

    def __init__(self):
        """
        Bind the schema for this crosswalk from the application config.

        Blocks until ``app.config["CROSSREF442_SCHEMA"]`` is no longer None.

        :raises exceptions.IngestException: if ``SCHEMAS`` in the app config has no
            entry for this crosswalk's ``format_name``
        """
        import time  # local import: only needed for the polling loop below

        self.validation_log = ""
        self.schema_path = app.config.get("SCHEMAS", {}).get(self.format_name)

        # the schema is held in memory for more efficient usage in repeat calls to the crosswalk
        if self.schema_path is None:
            raise exceptions.IngestException(
                message="Unable to validate for " + self.format_name + ", as schema path is not set in config")

        # Wait for the schema object to appear in the config (presumably it is
        # loaded by another thread at startup - confirm against the app setup).
        # Sleep between polls rather than spinning, so the waiting thread does
        # not burn a CPU while it waits.
        while app.config["CROSSREF442_SCHEMA"] is None:
            time.sleep(0.05)

        # ~~->CrossrefXML:Schema~~
        self.schema = app.config["CROSSREF442_SCHEMA"]

    def validate_file(self, file_handle):
        """
        Parse the supplied file as XML and validate it against the schema.

        :param file_handle: anything accepted by ``etree.parse``
        :return: the parsed document
        :raises CrosswalkException: if the file cannot be parsed, or if it does not
            validate (the schema's message is passed as ``inner_message``)
        """
        # first try to parse the file
        try:
            doc = etree.parse(file_handle)
        except etree.XMLSyntaxError as e:
            # although the treatment is the same as the generic case, pulling this
            # out so we remember what the primary kind of exception should be
            raise CrosswalkException(message="Unable to parse XML file", inner=e)
        except UnicodeDecodeError as e:
            msg = 'Text decode failed, expected utf-8 encoded XML.'
            raise CrosswalkException(message='Unable to parse XML file', inner=e, inner_message=msg)
        except Exception as e:
            raise CrosswalkException(message="Unable to parse XML file", inner=e)

        # then pass the doc to the validator
        if not self.validate(doc):
            msg = "Validation message from schema '{x}': {y}\n".format(x=self.format_name,
                                                                       y=self.validation_log)
            raise CrosswalkException(message="Unable to validate document with identified schema", inner_message=msg)

        return doc

    def validate(self, doc):
        """
        Validate a parsed document against ``self.schema``.

        On failure the schema's error log is stored in ``self.validation_log``.

        :param doc: parsed XML document
        :return: the result of ``self.schema.validate(doc)``
        """
        valid = self.schema.validate(doc)
        if not valid:
            log = repr(self.schema.error_log)
            # strip the filename, as we don't want to leak the path to the UI: if a
            # 32 hex character name followed by ".xml:" is present, keep only what
            # follows it on that line
            match = re.search(r"[\da-f]{32}.xml:(.*)", log)
            if match is not None:
                log = match.group(1)
            self.validation_log = log
        return valid

    def crosswalk_file(self, file_handle, add_journal_info):
        """
        Validate the file and crosswalk every article record it contains.

        :param file_handle: anything accepted by ``etree.parse``
        :param add_journal_info: not used by this implementation
        :return: list of articles, as returned by ``crosswalk_doc``
        :raises CrosswalkException: if parsing, validation or extraction fails
        """
        doc = self.validate_file(file_handle)
        return self.crosswalk_doc(doc)

    def crosswalk_doc(self, doc):
        """
        Crosswalk each ``journal_article`` record in the document individually.

        :param doc: parsed XML document
        :return: list of articles in document order; empty if the root has no
            ``body`` element or no journal articles
        """
        articles = []
        body = doc.getroot().find("x:body", self.NS)
        if body is None:
            # nothing to crosswalk (only reachable for documents which were not validated)
            return articles

        for journal in body.findall("x:journal", self.NS):
            for record in journal.findall("x:journal_article", self.NS):
                articles.append(self.crosswalk_article(record, journal))

        return articles

    def crosswalk_article(self, record, journal):
        """
        Build one article from a ``journal_article`` element and its parent ``journal``.

        :param record: the ``journal_article`` element
        :param journal: the ``journal`` element which contains the record
        :return: a new ``models.Article`` with its bibjson populated by the extractors
        """
        article = models.Article()  # ~~->Article:Model~~
        bibjson = article.bibjson()

        self.extract_journal_title(journal, bibjson)
        self.extract_issns(journal, bibjson)
        self.extract_publication_date(record, journal, bibjson)
        self.extract_volume(journal, bibjson)
        self.extract_issue(journal, bibjson)
        self.extract_pages(record, journal, bibjson)
        self.extract_doi(record, journal, bibjson)
        self.extract_fulltext(record, journal, bibjson)
        self.extract_article_title(record, journal, bibjson)
        self.extract_authors(record, journal, bibjson)
        self.extract_abstract(record, journal, bibjson)

        return article

    ###############################################################################
    ## extractors
    ###############################################################################

    def extract_journal_title(self, journal, bibjson):
        """Copy ``journal_metadata/full_title`` to ``bibjson.journal_title`` when present."""
        jm = journal.find("x:journal_metadata", self.NS)
        if jm is None:
            return
        title = _element(jm, "x:full_title", self.NS)
        if title is not None:
            bibjson.journal_title = title

    def extract_issns(self, journal, bibjson):
        """
        Add the journal's ISSNs to the bibjson as print / electronic identifiers.

        With one ISSN, a missing ``media_type`` is treated as electronic.  With two,
        a missing ``media_type`` is inferred as the opposite of the other one, and if
        both are missing the first is taken as electronic and the second as print.
        Values are upper-cased before being stored.

        :raises CrosswalkException: if there are more than two ISSNs, if both declare
            the same ``media_type``, or if both carry the same value
        """
        md = journal.find("x:journal_metadata", self.NS)
        if md is None:
            return

        issns = md.findall("x:issn", self.NS)

        # if more than 2 issns raise the exception
        if len(issns) > 2:
            raise CrosswalkException(message=Messages.EXCEPTION_TOO_MANY_ISSNS)

        if len(issns) == 1:
            # .get() rather than attrib[...] so that an issn carrying some other
            # attribute but no media_type is treated as unspecified, not a KeyError
            media_type = issns[0].get("media_type")
            if media_type is None or media_type == 'electronic':
                bibjson.add_identifier(bibjson.E_ISSN, issns[0].text.upper())
            elif media_type == 'print':
                bibjson.add_identifier(bibjson.P_ISSN, issns[0].text.upper())

        elif len(issns) == 2:
            types = [issns[0].get("media_type"), issns[1].get("media_type")]

            # if both issns have the same type - raise the exception
            if types[0] is not None and types[0] == types[1]:
                raise CrosswalkException(
                    message=Messages.EXCEPTION_ISSNS_OF_THE_SAME_TYPE.format(type=types[1]))

            values = [issns[0].text.upper(), issns[1].text.upper()]

            # if both issns have the same value - raise the exception
            if values[0] == values[1]:
                raise CrosswalkException(
                    message=Messages.EXCEPTION_IDENTICAL_PISSN_AND_EISSN.format(value=values[0]))

            # fill in whichever types were not declared
            if types[0] is None and types[1] is None:
                types = ["electronic", "print"]
            elif types[1] is None:
                types[1] = "print" if types[0] == "electronic" else "electronic"
            elif types[0] is None:
                types[0] = "print" if types[1] == "electronic" else "electronic"

            for media_type, value in zip(types, values):
                bibjson.add_identifier(bibjson.P_ISSN if media_type == "print" else bibjson.E_ISSN, value)

    def extract_publication_date(self, record, journal, bibjson):
        """
        Set ``bibjson.year`` and ``bibjson.month`` from the record's first
        ``publication_date``; either is set to None if its element is missing.
        """
        pd = record.find("x:publication_date", self.NS)
        if pd is not None:
            bibjson.year = _element(pd, "x:year", self.NS)
            bibjson.month = _element(pd, "x:month", self.NS)

    def extract_volume(self, journal, bibjson):
        """Copy ``journal_issue/journal_volume/volume`` to ``bibjson.volume`` when present."""
        issue = journal.find("x:journal_issue", self.NS)
        if issue is None:
            return
        vol = issue.find("x:journal_volume", self.NS)
        if vol is None:
            return
        volume = _element(vol, "x:volume", self.NS)
        if volume is not None:
            bibjson.volume = volume

    def extract_issue(self, journal, bibjson):
        """Copy ``journal_issue/issue`` to ``bibjson.number`` when present."""
        issue = journal.find("x:journal_issue", self.NS)
        if issue is None:
            return
        number = _element(issue, "x:issue", self.NS)
        if number is not None:
            bibjson.number = number

    def extract_pages(self, record, journal, bibjson):
        """Copy ``pages/first_page`` and ``pages/last_page`` to the bibjson when present."""
        pages = record.find('x:pages', self.NS)
        if pages is None:
            return

        # start page
        sp = _element(pages, "x:first_page", self.NS)
        if sp is not None:
            bibjson.start_page = sp

        # end page
        ep = _element(pages, "x:last_page", self.NS)
        if ep is not None:
            bibjson.end_page = ep

    def extract_doi(self, record, journal, bibjson):
        """Add ``doi_data/doi`` to the bibjson as a DOI identifier when present."""
        d = record.find("x:doi_data", self.NS)
        if d is None:
            return
        doi = _element(d, "x:doi", self.NS)
        if doi is not None:
            bibjson.add_identifier(bibjson.DOI, doi)

    def extract_fulltext(self, record, journal, bibjson):
        """Add ``doi_data/resource`` to the bibjson as the fulltext URL when present."""
        d = record.find("x:doi_data", self.NS)
        if d is None:
            # no doi_data at all: nothing to extract (same guard as extract_doi)
            return
        ftel = _element(d, "x:resource", self.NS)
        if ftel is not None:
            # NOTE(review): the namespace map is passed as the third argument here;
            # that looks unintended - confirm against the signature of add_url
            bibjson.add_url(ftel, "fulltext", self.NS)

    def extract_article_title(self, record, journal, bibjson):
        """Copy ``titles/title`` to ``bibjson.title`` when present."""
        titles = record.find('x:titles', self.NS)
        if titles is None:
            return
        title = _element(titles, "x:title", self.NS)
        if title is not None:
            bibjson.title = title

    def extract_authors(self, record, journal, bibjson):
        """
        Add every ``person_name`` whose ``contributor_role`` is "author" to the bibjson.

        The name is "<given_name> <surname>", or whichever of the two is present.
        Affiliation and ORCID are passed as None when missing or empty.

        :raises CrosswalkException: if the record has no ``contributors`` element
        """
        contributors = record.find("x:contributors", self.NS)
        if contributors is None:
            raise CrosswalkException(message=Messages.EXCEPTION_NO_CONTRIBUTORS_FOUND,
                                     inner_message=Messages.EXCEPTION_NO_CONTRIBUTORS_EXPLANATION)

        for ctb in contributors.findall("x:person_name", self.NS):
            if ctb.get("contributor_role") != 'author':
                continue

            surname = _element(ctb, "x:surname", self.NS)
            given = _element(ctb, "x:given_name", self.NS)
            # join only the parts which exist, so a missing surname cannot cause
            # a str + None TypeError
            name = " ".join(part for part in (given, surname) if part) or None

            affiliation = _element(ctb, "x:affiliation", self.NS) or None
            orcid = _element(ctb, "x:ORCID", self.NS) or None
            bibjson.add_author(name, affiliation, orcid)

    def extract_abstract(self, record, journal, bibjson):
        """
        Set ``bibjson.abstract`` to the text content of the record's JATS abstract,
        truncated to 30000 characters.
        """
        abstract_par = record.find("j:abstract", self.NS)
        if abstract_par is None:
            return

        # itertext() walks the whole subtree and also yields the text which follows
        # inline child elements (their tails), so mixed content such as
        # "<p>Hello <italic>big</italic> world</p>" keeps its trailing " world"
        text = "".join(abstract_par.itertext())

        # truncation avoids Elasticsearch exceptions about .exact analyser not being
        # able to handle more than 32766 UTF8 characters
        bibjson.abstract = text[:30000]
331###############################################################################
332## Crossref 5.3.1 Xwalk
333###############################################################################
class CrossrefXWalk531(CrossrefXWalk442):
    """
    Crosswalk for Crossref 5.3.1 deposit XML.

    Reuses the 4.4.2 crosswalk, swapping in the 5.3.1 namespace and schema and
    overriding author extraction to read affiliations from
    ``affiliations/institution/institution_name``.

    ~~Crossref531XML:Crosswalk->Crossref531:Feature~~
    ~~->Crossref442XML:Crosswalk~~
    """
    # key used to look up this crosswalk's schema path in the SCHEMAS config
    format_name = "crossref531"
    # prefix -> namespace map: 'x' is the Crossref 5.3.1 schema, 'j' is JATS
    NS = {'x': 'http://www.crossref.org/schema/5.3.1', 'j': 'http://www.ncbi.nlm.nih.gov/JATS1'}

    def __init__(self):
        """
        Run the parent constructor, then bind the 5.3.1 schema from the app config.

        Blocks until ``app.config["CROSSREF531_SCHEMA"]`` is no longer None.
        """
        import time  # local import: only needed for the polling loop below

        super(CrossrefXWalk531, self).__init__()

        # Wait for the schema object to appear in the config (presumably it is
        # loaded by another thread at startup - confirm against the app setup).
        # Sleep between polls rather than spinning, so the waiting thread does
        # not burn a CPU while it waits.
        while app.config["CROSSREF531_SCHEMA"] is None:
            time.sleep(0.05)

        self.schema = app.config["CROSSREF531_SCHEMA"]

    def extract_authors(self, record, journal, bibjson):
        """
        Add every ``person_name`` whose ``contributor_role`` is "author" to the bibjson.

        The name is "<given_name> <surname>", or whichever of the two is present.
        Affiliation and ORCID are passed as None when missing or empty.

        :raises CrosswalkException: if the record has no ``contributors`` element
        """
        contributors = record.find("x:contributors", self.NS)
        if contributors is None:
            raise CrosswalkException(message=Messages.EXCEPTION_NO_CONTRIBUTORS_FOUND,
                                     inner_message=Messages.EXCEPTION_NO_CONTRIBUTORS_EXPLANATION)

        for ctb in contributors.findall("x:person_name", self.NS):
            # .get() so that a person_name without the attribute is skipped
            # instead of raising KeyError
            if ctb.get("contributor_role") != 'author':
                continue

            surname = _element(ctb, "x:surname", self.NS)
            given = _element(ctb, "x:given_name", self.NS)
            # join only the parts which exist, so a missing surname cannot cause
            # a str + None TypeError
            name = " ".join(part for part in (given, surname) if part) or None

            # only first affiliation is supported even if multiple are provided
            affiliation = None
            affs = ctb.find("x:affiliations", self.NS)
            if affs is not None:
                institution = affs.find("x:institution", self.NS)
                if institution is not None:
                    affiliation = _element(institution, "x:institution_name", self.NS) or None

            orcid = _element(ctb, "x:ORCID", self.NS) or None
            bibjson.add_author(name, affiliation, orcid)
377###############################################################################
378## some convenient utilities
379###############################################################################
381def _element(xml, field, namespace):
382 el = xml.find(field, namespace)
383 if el is not None:
384 # self converts the entire element to a string, so that we can handle the possibility of
385 # embedded html tags, etc.
386 # etree.tostring doesn't actually produce a string, but a byte array, so we must specify
387 # the encoding and THEN also decode it using that same encoding to get an actual string
388 string = etree.tostring(el, encoding="utf-8").decode("utf-8")
389 start = string.index(">") + 1
390 end = string.rindex('</')
391 text = string[start:end]
392 return text if text else None
393 else:
394 return None