Coverage for portality/tasks/harvester_helpers/epmc/models.py: 45%
342 statements
« prev ^ index » next coverage.py v6.4.2, created at 2022-07-19 18:38 +0100
« prev ^ index » next coverage.py v6.4.2, created at 2022-07-19 18:38 +0100
1from portality.lib import dataobj
2from portality.lib import xmlutil as xutil
3from lxml import etree
class JATSException(Exception):
    """Exception that carries the raw string it was raised about.

    :param message: human-readable description, forwarded to ``Exception``
    :param rawstring: the raw source text, stored on ``self.raw``
    """

    def __init__(self, message, rawstring, *args, **kwargs):
        super().__init__(message, *args, **kwargs)
        # keep the offending payload available to whoever handles the error
        self.raw = rawstring
class EPMCFullTextException(JATSException):
    """Subclass of JATSException kept only for backwards compatibility.

    It adds no behaviour of its own.
    """
class EPMCMetadataException(Exception):
    """Exception that also records the raw metadata string it relates to.

    :param message: human-readable description, forwarded to ``Exception``
    :param rawstring: the raw source text, stored on ``self.raw``
    """

    def __init__(self, message, rawstring, *args, **kwargs):
        super().__init__(message, *args, **kwargs)
        # expose the raw payload to handlers via ``.raw``
        self.raw = rawstring
class EPMCMetadataXML(object):
    """Accessors over an XML metadata record.

    The record can be supplied either as a raw string (which is parsed with
    ``etree.fromstring``) or as an already-parsed element.  Element names used
    by the accessors (``pmid``, ``pmcid``, ``journalInfo`` ...) presumably
    follow the Europe PMC search-result schema -- confirm against the caller.

    :param raw: XML source to parse; takes precedence over ``xml``
    :param xml: already-parsed element, used only when ``raw`` is None
    :raises JATSException: if ``raw`` is given and cannot be parsed
    """

    # child elements copied verbatim (tag name == dict key) by ``authors``
    _AUTHOR_FIELDS = ("fullName", "firstName", "lastName", "initials", "affiliation")
    # child elements copied verbatim (tag name == dict key) by ``grants``
    _GRANT_FIELDS = ("grantId", "agency")

    def __init__(self, raw=None, xml=None):
        self.raw = None
        self.xml = None
        if raw is not None:
            self.raw = raw
            try:
                self.xml = etree.fromstring(self.raw)
            except Exception as err:
                # Only real errors are converted (not KeyboardInterrupt /
                # SystemExit), and the parser's own error is kept as the cause.
                raise JATSException("Unable to parse XML", self.raw) from err
        elif xml is not None:
            self.xml = xml

    def tostring(self):
        """Return the raw source if there is one, else the serialised element.

        Returns None when the object holds neither.
        """
        if self.raw is not None:
            return self.raw
        if self.xml is not None:
            return etree.tostring(self.xml)
        return None

    @staticmethod
    def _child_texts(element, tags):
        """Map each tag in ``tags`` that is a direct child of ``element`` to its text."""
        found = {}
        for tag in tags:
            child = element.find(tag)
            if child is not None:
                found[tag] = child.text
        return found

    def _collect(self, xpath, tags):
        """Build one dict per element matched by ``xpath``, dropping empty dicts."""
        records = (self._child_texts(el, tags) for el in self.xml.xpath(xpath))
        return [record for record in records if record]

    @property
    def title(self):
        """Text of the first ``title`` match."""
        return xutil.xp_first_text(self.xml, "title")

    @property
    def publication_type(self):
        """Text of the first ``//pubTypeList/pubType`` match."""
        return xutil.xp_first_text(self.xml, "//pubTypeList/pubType")

    @property
    def language(self):
        """Text of the first ``language`` match."""
        return xutil.xp_first_text(self.xml, "language")

    @property
    def publication_date(self):
        """First available publication date.

        Tries ``firstPublicationDate``, then ``electronicPublicationDate``,
        and finally falls back to ``//journalInfo/printPublicationDate``
        (whose result is returned as-is, even when it is None).
        """
        for path in ("firstPublicationDate", "electronicPublicationDate"):
            value = xutil.xp_first_text(self.xml, path)
            if value is not None:
                return value
        return xutil.xp_first_text(self.xml, "//journalInfo/printPublicationDate")

    @property
    def pmid(self):
        """Text of the first ``pmid`` match."""
        return xutil.xp_first_text(self.xml, "pmid")

    @property
    def pmcid(self):
        """Text of the first ``pmcid`` match."""
        return xutil.xp_first_text(self.xml, "pmcid")

    @property
    def doi(self):
        """Text of the first ``DOI`` match."""
        return xutil.xp_first_text(self.xml, "DOI")

    @property
    def issns(self):
        """List of the journal's ISSN and ESSN (in that order), omitting absent ones."""
        candidates = (
            xutil.xp_first_text(self.xml, "//journalInfo/journal/ISSN"),
            xutil.xp_first_text(self.xml, "//journalInfo/journal/ESSN"),
        )
        return [value for value in candidates if value is not None]

    @property
    def keywords(self):
        """Texts of all ``//keywordList/keyword`` matches."""
        return xutil.xp_texts(self.xml, "//keywordList/keyword")

    @property
    def author_string(self):
        """Text of the first ``//authorString`` match."""
        return xutil.xp_first_text(self.xml, "//authorString")

    @property
    def authors(self):
        """
        List of dicts, one per ``//authorList/author`` element, holding the
        text of whichever of these child elements are present:

        <fullName>Cerasoli E</fullName>
        <firstName>Eleonora</firstName>
        <lastName>Cerasoli</lastName>
        <initials>E</initials>
        <affiliation>Biotechnology Department, National Physical Laboratory Teddington, UK.</affiliation>

        Authors with none of these children are left out.
        """
        return self._collect("//authorList/author", self._AUTHOR_FIELDS)

    @property
    def grants(self):
        """List of dicts with ``grantId`` / ``agency`` for each ``//grantsList/grant``.

        Keys are only present when the matching child element exists; grants
        with neither child are left out.
        """
        return self._collect("//grantsList/grant", self._GRANT_FIELDS)

    @property
    def mesh_descriptors(self):
        """Texts of all ``//meshHeadingList/meshHeading/descriptorName`` matches."""
        return xutil.xp_texts(self.xml, "//meshHeadingList/meshHeading/descriptorName")
class EPMCMetadata(dataobj.DataObj):
    """Property-style view over a metadata record held by ``dataobj.DataObj``.

    Each property reads one path from the underlying data through the
    inherited ``_get_single`` / ``_get_list`` helpers.
    """

    def __init__(self, raw):
        super().__init__(raw)

    # -- identifiers and flags (read with allow_coerce_failure=False) --------

    @property
    def pmcid(self):
        """Value at ``pmcid``."""
        return self._get_single("pmcid", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def pmid(self):
        """Value at ``pmid``."""
        return self._get_single("pmid", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def doi(self):
        """Value at ``doi``."""
        return self._get_single("doi", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def in_epmc(self):
        """Value at ``inEPMC``."""
        return self._get_single("inEPMC", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def is_oa(self):
        """Value at ``isOpenAccess``."""
        return self._get_single("isOpenAccess", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def issn(self):
        """Value at ``journalInfo.journal.issn``."""
        return self._get_single("journalInfo.journal.issn", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def journal(self):
        """Value at ``journalInfo.journal.title``."""
        return self._get_single("journalInfo.journal.title", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def essn(self):
        """Value at ``journalInfo.journal.essn``."""
        return self._get_single("journalInfo.journal.essn", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def title(self):
        """Value at ``title``."""
        return self._get_single("title", self._utf8_unicode(), allow_coerce_failure=False)

    # -- bibliographic details -----------------------------------------------

    @property
    def journal_volume(self):
        """Value at ``journalInfo.volume``."""
        return self._get_single("journalInfo.volume", self._utf8_unicode())

    @property
    def journal_issue(self):
        """Value at ``journalInfo.issue``."""
        return self._get_single("journalInfo.issue", self._utf8_unicode())

    @property
    def language(self):
        """Value at ``language``."""
        return self._get_single("language", self._utf8_unicode())

    @property
    def month_of_publication(self):
        """Value at ``journalInfo.monthOfPublication``, coerced with ``dataobj.to_int()``."""
        return self._get_single("journalInfo.monthOfPublication", dataobj.to_int())

    @property
    def year_of_publication(self):
        """Value at ``journalInfo.yearOfPublication``, coerced with ``dataobj.to_int()``."""
        return self._get_single("journalInfo.yearOfPublication", dataobj.to_int())

    @property
    def page_info(self):
        """Value at ``pageInfo``."""
        return self._get_single("pageInfo", self._utf8_unicode())

    @property
    def start_page(self):
        """Part of ``page_info`` before the first ``-``; None without page info."""
        pages = self.page_info
        if pages is None:
            return None
        # str.split always yields at least one piece, so index 0 is safe
        return pages.split("-")[0]

    @property
    def end_page(self):
        """Second ``-``-separated part of ``page_info``; None if there isn't one."""
        pages = self.page_info
        if pages is None:
            return None
        pieces = pages.split("-")
        return pieces[1] if len(pieces) > 1 else None

    # -- full text -----------------------------------------------------------

    @property
    def fulltext_urls(self):
        """List at ``fullTextUrlList.fullTextUrl``."""
        return self._get_list("fullTextUrlList.fullTextUrl")

    def get_first_fulltext_url(self, availability=None, document_style=None, site=None):
        """Return the ``url`` of the first full-text entry matching every given filter.

        A filter left as None is not applied.  ``availability``,
        ``document_style`` and ``site`` are compared against the entry's
        ``availability``, ``documentStyle`` and ``site`` values respectively.
        Returns None when no entry matches.
        """
        filters = (
            ("availability", availability),
            ("documentStyle", document_style),
            ("site", site),
        )
        for entry in self.fulltext_urls:
            if all(wanted is None or not (wanted != entry.get(key)) for key, wanted in filters):
                return entry.get("url")
        return None

    # -- abstract and authors ------------------------------------------------

    @property
    def abstract(self):
        """Value at ``abstractText``."""
        return self._get_single("abstractText", self._utf8_unicode())

    @property
    def authors(self):
        """List at ``authorList.author``."""
        return self._get_list("authorList.author")

    @property
    def author_string(self):
        """Value at ``authorString`` (read without a coerce function)."""
        return self._get_single("authorString")
class JATS(object):
    """Accessors over a JATS article XML document.

    The document can be supplied either as a raw string (which is parsed with
    ``etree.fromstring``) or as an already-parsed element.

    :param raw: XML source to parse; takes precedence over ``xml``
    :param xml: already-parsed element, used only when ``raw`` is None
    :raises JATSException: if ``raw`` is given and cannot be parsed
    """

    def __init__(self, raw=None, xml=None):
        self.raw = None
        self.xml = None
        if raw is not None:
            self.raw = raw
            try:
                self.xml = etree.fromstring(self.raw)
            except Exception as err:
                # Only real errors are converted (not KeyboardInterrupt /
                # SystemExit), and the parser's own error is kept as the cause.
                raise JATSException("Unable to parse XML", self.raw) from err
        elif xml is not None:
            self.xml = xml

    @property
    def title(self):
        """Text of the first ``//title-group/article-title`` match."""
        return xutil.xp_first_text(self.xml, "//title-group/article-title")

    @property
    def is_aam(self):
        """True when the document has an ``article-id`` with pub-id-type ``manuscript``."""
        manuscripts = self.xml.xpath("//article-id[@pub-id-type='manuscript']")
        return len(manuscripts) > 0

    def get_licence_details(self):
        """Return ``(licence type, licence url, licence text)``.

        The type and url are the ``license-type`` and ``xlink:href``
        attributes of the first ``//license`` element.  The text is the
        serialised markup of every ``//license/license-p`` element,
        concatenated.  Returns ``(None, None, None)`` when the document has no
        ``license`` element.
        """
        licences = self.xml.xpath("//license")
        if not licences:
            return None, None, None
        licence = licences[0]
        licence_type = licence.get("license-type")
        url = licence.get("{http://www.w3.org/1999/xlink}href")

        # Serialise the paragraph(s) describing the licence.  encoding="unicode"
        # makes lxml return str; the default returns bytes, which cannot be
        # concatenated onto a str under Python 3.
        paragraphs = self.xml.xpath("//license/license-p")
        text = "".join(etree.tostring(p, encoding="unicode") for p in paragraphs)

        return licence_type, url, text

    @property
    def copyright_statement(self):
        """Text of the first ``//copyright-statement`` match."""
        return xutil.xp_first_text(self.xml, "//copyright-statement")

    @property
    def categories(self):
        """Texts of all ``//article-categories/subj-group/subject`` matches."""
        return xutil.xp_texts(self.xml, "//article-categories/subj-group/subject")

    @property
    def authors(self):
        """Contributor dicts (see ``_make_contribs``) for contribs of type ``author``."""
        author_elements = self.xml.xpath("//contrib-group/contrib[@contrib-type='author']")
        return self._make_contribs(author_elements)

    @property
    def contribs(self):
        """Contributor dicts (see ``_make_contribs``) for every ``contrib`` element."""
        contrib_elements = self.xml.xpath("//contrib-group/contrib")
        return self._make_contribs(contrib_elements)

    @property
    def emails(self):
        """Texts of all ``//email`` matches."""
        return xutil.xp_texts(self.xml, "//email")

    @property
    def keywords(self):
        """Texts of all ``//kwd-group/kwd`` matches."""
        return xutil.xp_texts(self.xml, "//kwd-group/kwd")

    @property
    def publisher(self):
        """Text of the first ``//publisher/publisher-name`` match."""
        return xutil.xp_first_text(self.xml, "//publisher/publisher-name")

    @property
    def publication_date(self):
        """Publication date as built by ``_make_date``, or None.

        Prefers a ``pub-date`` explicitly typed ``pub``; otherwise uses the
        ``pub-date`` only if the article has exactly one.
        """
        # first look for an explicit publication date
        explicit = self.xml.xpath("//article-meta/pub-date[@date-type='pub']")
        if explicit:
            return self._make_date(explicit[0])

        # if not, look for exactly one pub-date and use that
        candidates = self.xml.xpath("//article-meta/pub-date")
        if len(candidates) == 1:
            return self._make_date(candidates[0])

        # otherwise, insufficient information
        return None

    @property
    def date_accepted(self):
        """Date built from the first ``history/date`` typed ``accepted``, or None."""
        accepted = self.xml.xpath("//history/date[@date-type='accepted']")
        if accepted:
            return self._make_date(accepted[0])
        return None

    @property
    def date_submitted(self):
        """Date built from the first ``history/date`` typed ``received``, or None."""
        received = self.xml.xpath("//history/date[@date-type='received']")
        if received:
            return self._make_date(received[0])
        return None

    @property
    def issn(self):
        """Texts of all ``//journal-meta/issn`` matches."""
        return xutil.xp_texts(self.xml, "//journal-meta/issn")

    @property
    def pmcid(self):
        """PMC identifier from ``article-meta``, prefixed with ``PMC`` if it lacks it."""
        identifier = xutil.xp_first_text(self.xml, "//article-meta/article-id[@pub-id-type='pmcid']")
        if identifier is not None and not identifier.startswith("PMC"):
            identifier = "PMC" + identifier
        return identifier

    @property
    def doi(self):
        """Text of the first ``article-id`` typed ``doi`` under ``article-meta``."""
        return xutil.xp_first_text(self.xml, "//article-meta/article-id[@pub-id-type='doi']")

    def _make_date(self, element):
        """Build a ``YYYY-MM-DD`` string from a date element, or None.

        Missing month/day default to ``01`` and single-character values are
        left-padded with ``0``.  Returns None unless the year is exactly four
        characters long.

        NOTE(review): assumes ``xutil.objectify`` returns a dict-like mapping
        of child tag name to text -- confirm against xmlutil.
        """
        ob = xutil.objectify(element)
        year = ob.get("year")
        month = ob.get("month", "01")
        day = ob.get("day", "01")
        if len(month) < 2:
            month = "0" + month
        if len(day) < 2:
            day = "0" + day
        if year is None or len(year) != 4:
            return None
        return year + "-" + month + "-" + day

    @staticmethod
    def _normalised_text(element):
        """Return the element's full string value with whitespace runs collapsed."""
        return " ".join(element.xpath("string()").split())

    def _make_contribs(self, elements):
        """Convert ``contrib`` elements into a list of dicts.

        Each dict may contain ``surname``, ``given-names``, ``email`` and
        ``affiliations`` (a list of whitespace-normalised strings); keys are
        only present when the matching data exists.  Affiliations come from an
        inline ``aff`` child and from ``xref`` children of ref-type ``aff``,
        which are resolved against ``aff`` elements anywhere in the document
        by id.  Contributors that yield no data at all are left out.
        """
        contribs = []

        for contrib in elements:
            record = {}

            # first see if there is a name we can pull out
            name = contrib.find("name")
            if name is not None:
                surname = name.find("surname")
                if surname is not None:
                    record["surname"] = surname.text

                given = name.find("given-names")
                if given is not None:
                    record["given-names"] = given.text

            # see if there's an email address
            email = contrib.find("email")
            if email is not None:
                record["email"] = email.text

            # now do the affiliations (by value and by (x)reference)
            affiliations = []

            inline_aff = contrib.find("aff")
            if inline_aff is not None:
                affiliations.append(self._normalised_text(inline_aff))

            for xref in contrib.findall("xref"):
                if xref.get("ref-type") != "aff":
                    continue
                rid = xref.get("rid")
                if rid is None:
                    # an xref without a target cannot be resolved
                    continue
                # Pass the id as an XPath variable rather than splicing it into
                # the expression, so ids containing quotes cannot break it.
                for target in self.xml.xpath("//aff[@id=$rid]", rid=rid):
                    affiliations.append(self._normalised_text(target))

            if affiliations:
                record["affiliations"] = affiliations

            if record:
                contribs.append(record)

        return contribs

    def tostring(self):
        """Return the raw source if there is one, else the serialised element.

        Returns None when the object holds neither.
        """
        if self.raw is not None:
            return self.raw
        if self.xml is not None:
            return etree.tostring(self.xml)
        return None