Coverage for portality/tasks/harvester_helpers/epmc/models.py: 45%

342 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-07-22 15:59 +0100

1from portality.lib import dataobj 

2from portality.lib import xmlutil as xutil 

3from lxml import etree 

4 

5 

class JATSException(Exception):
    """Exception that carries the raw document string it relates to.

    :param message: human-readable description, forwarded to ``Exception``
    :param rawstring: the raw source that triggered the error; exposed to
        handlers as the ``raw`` attribute
    """

    def __init__(self, message, rawstring, *args, **kwargs):
        # Keep hold of the offending source so callers can log/inspect it.
        self.raw = rawstring
        # Everything else is handled by the standard Exception initialiser.
        Exception.__init__(self, message, *args, **kwargs)

10 

11 

class EPMCFullTextException(JATSException):
    """Subclass of JATSException with no behaviour of its own.

    Retained for backwards compatibility.
    """

17 

18 

class EPMCMetadataException(Exception):
    """Exception for EPMC metadata problems, carrying the raw source.

    :param message: human-readable description, forwarded to ``Exception``
    :param rawstring: the raw metadata that triggered the error; exposed to
        handlers as the ``raw`` attribute
    """

    def __init__(self, message, rawstring, *args, **kwargs):
        # Keep hold of the offending source so callers can log/inspect it.
        self.raw = rawstring
        # Everything else is handled by the standard Exception initialiser.
        Exception.__init__(self, message, *args, **kwargs)

23 

24 

class EPMCMetadataXML(object):
    """Read-only accessors over an EPMC metadata record held as an XML tree.

    Construct with either ``raw`` (a serialised XML document, which is parsed
    with ``etree.fromstring``) or ``xml`` (an already-parsed element).  If
    both are supplied, ``raw`` wins and ``xml`` is ignored.

    NOTE(review): the accessors rely on ``xutil.xp_first_text`` /
    ``xutil.xp_texts``; they presumably return the text of the first match
    (or None) and a list of texts respectively - confirm against
    portality.lib.xmlutil.
    """

    # Child element names copied verbatim into each author / grant dict.
    _AUTHOR_FIELDS = ("fullName", "firstName", "lastName", "initials", "affiliation")
    _GRANT_FIELDS = ("grantId", "agency")

    def __init__(self, raw=None, xml=None):
        """
        :param raw: serialised XML to parse; kept on ``self.raw``
        :param xml: pre-parsed element, used only when ``raw`` is None
        :raises JATSException: if ``raw`` cannot be parsed
        """
        self.raw = None
        self.xml = None
        if raw is not None:
            self.raw = raw
            try:
                self.xml = etree.fromstring(self.raw)
            except Exception as e:
                # Deliberately not a bare ``except``: KeyboardInterrupt and
                # SystemExit must propagate untouched.  The original parser
                # error is chained for debugging.
                raise JATSException("Unable to parse XML", self.raw) from e
        elif xml is not None:
            self.xml = xml

    def tostring(self):
        """Return the raw source if there is one, otherwise the serialised tree.

        Returns None when the object holds neither.
        """
        if self.raw is not None:
            return self.raw
        if self.xml is not None:
            return etree.tostring(self.xml)
        return None

    @property
    def title(self):
        return xutil.xp_first_text(self.xml, "title")

    @property
    def publication_type(self):
        return xutil.xp_first_text(self.xml, "//pubTypeList/pubType")

    @property
    def language(self):
        return xutil.xp_first_text(self.xml, "language")

    @property
    def publication_date(self):
        """Best available publication date.

        Tries firstPublicationDate, then electronicPublicationDate, and
        finally falls back to journalInfo/printPublicationDate.
        """
        for path in ("firstPublicationDate", "electronicPublicationDate"):
            found = xutil.xp_first_text(self.xml, path)
            if found is not None:
                return found
        return xutil.xp_first_text(self.xml, "//journalInfo/printPublicationDate")

    @property
    def pmid(self):
        return xutil.xp_first_text(self.xml, "pmid")

    @property
    def pmcid(self):
        return xutil.xp_first_text(self.xml, "pmcid")

    @property
    def doi(self):
        return xutil.xp_first_text(self.xml, "DOI")

    @property
    def issns(self):
        """List of the journal's ISSN and ESSN (in that order), omitting any that are absent."""
        candidates = (
            xutil.xp_first_text(self.xml, "//journalInfo/journal/ISSN"),
            xutil.xp_first_text(self.xml, "//journalInfo/journal/ESSN"),
        )
        return [value for value in candidates if value is not None]

    @property
    def keywords(self):
        return xutil.xp_texts(self.xml, "//keywordList/keyword")

    @property
    def author_string(self):
        return xutil.xp_first_text(self.xml, "//authorString")

    @property
    def authors(self):
        """
        List of dicts, one per authorList/author element that has at least one
        of the recognised children.  Example source fragment:

        <fullName>Cerasoli E</fullName>
        <firstName>Eleonora</firstName>
        <lastName>Cerasoli</lastName>
        <initials>E</initials>
        <affiliation>Biotechnology Department, National Physical Laboratory Teddington, UK.</affiliation>
        """
        return self._records("//authorList/author", self._AUTHOR_FIELDS)

    @property
    def grants(self):
        """List of dicts (keys ``grantId`` / ``agency``), one per grantsList/grant element."""
        return self._records("//grantsList/grant", self._GRANT_FIELDS)

    @property
    def mesh_descriptors(self):
        return xutil.xp_texts(self.xml, "//meshHeadingList/meshHeading/descriptorName")

    @staticmethod
    def _child_texts(element, names):
        """Map each name in ``names`` to the text of that direct child, skipping absent children."""
        found = {}
        for name in names:
            child = element.find(name)
            if child is not None:
                # An empty child is still recorded (its text is None).
                found[name] = child.text
        return found

    def _records(self, xpath, names):
        """Build one dict per element matched by ``xpath``, dropping elements that yield nothing."""
        records = []
        for element in self.xml.xpath(xpath):
            record = self._child_texts(element, names)
            if record:
                records.append(record)
        return records

160 

161 

class EPMCMetadata(dataobj.DataObj):
    """Property-style view over an EPMC metadata record wrapped in a DataObj.

    Every accessor delegates to the inherited ``_get_single`` / ``_get_list``
    using a dotted path into the underlying record.
    NOTE(review): the exact coercion / failure semantics live in
    portality.lib.dataobj - confirm there before relying on them.
    """

    def __init__(self, raw):
        """
        :param raw: the record to wrap; handed straight to ``DataObj.__init__``
        """
        super(EPMCMetadata, self).__init__(raw)

    # -- identifiers and flags (coerce failures are not allowed) -----------

    @property
    def pmcid(self):
        return self._get_single("pmcid", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def pmid(self):
        return self._get_single("pmid", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def doi(self):
        return self._get_single("doi", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def in_epmc(self):
        return self._get_single("inEPMC", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def is_oa(self):
        return self._get_single("isOpenAccess", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def issn(self):
        return self._get_single("journalInfo.journal.issn", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def journal(self):
        return self._get_single("journalInfo.journal.title", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def essn(self):
        return self._get_single("journalInfo.journal.essn", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def title(self):
        return self._get_single("title", self._utf8_unicode(), allow_coerce_failure=False)

    # -- bibliographic details ---------------------------------------------

    @property
    def journal_volume(self):
        return self._get_single("journalInfo.volume", self._utf8_unicode())

    @property
    def journal_issue(self):
        return self._get_single("journalInfo.issue", self._utf8_unicode())

    @property
    def language(self):
        return self._get_single("language", self._utf8_unicode())

    @property
    def month_of_publication(self):
        return self._get_single("journalInfo.monthOfPublication", dataobj.to_int())

    @property
    def year_of_publication(self):
        return self._get_single("journalInfo.yearOfPublication", dataobj.to_int())

    @property
    def page_info(self):
        return self._get_single("pageInfo", self._utf8_unicode())

    @property
    def start_page(self):
        """The part of ``page_info`` before the first "-", or None if there is no page info."""
        info = self.page_info
        if info is None:
            return None
        # Splitting always yields at least one piece, so index 0 is safe.
        return info.split("-")[0]

    @property
    def end_page(self):
        """The part of ``page_info`` between the first and second "-", or None if absent."""
        info = self.page_info
        if info is None:
            return None
        pieces = info.split("-")
        return pieces[1] if len(pieces) > 1 else None

    # -- full text ---------------------------------------------------------

    @property
    def fulltext_urls(self):
        return self._get_list("fullTextUrlList.fullTextUrl")

    def get_first_fulltext_url(self, availability=None, document_style=None, site=None):
        """Return the ``url`` of the first full-text entry matching every supplied filter.

        A filter left as None is not applied.  Returns None when nothing matches.

        :param availability: required value of the entry's ``availability``
        :param document_style: required value of the entry's ``documentStyle``
        :param site: required value of the entry's ``site``
        """
        # Only the filters the caller actually supplied take part in matching.
        required = [
            (key, wanted)
            for key, wanted in (
                ("availability", availability),
                ("documentStyle", document_style),
                ("site", site),
            )
            if wanted is not None
        ]
        for entry in self.fulltext_urls:
            if all(wanted == entry.get(key) for key, wanted in required):
                return entry.get("url")
        return None

    # -- abstract and authorship -------------------------------------------

    @property
    def abstract(self):
        return self._get_single("abstractText", self._utf8_unicode())

    @property
    def authors(self):
        return self._get_list("authorList.author")

    @property
    def author_string(self):
        return self._get_single("authorString")

272 

273 

class JATS(object):
    """Read-only accessors over a JATS article held as an XML tree.

    Construct with either ``raw`` (a serialised XML document, which is parsed
    with ``etree.fromstring``) or ``xml`` (an already-parsed element).  If
    both are supplied, ``raw`` wins and ``xml`` is ignored.

    NOTE(review): the accessors rely on ``xutil.xp_first_text`` /
    ``xutil.xp_texts``; they presumably return the text of the first match
    (or None) and a list of texts respectively - confirm against
    portality.lib.xmlutil.
    """

    def __init__(self, raw=None, xml=None):
        """
        :param raw: serialised XML to parse; kept on ``self.raw``
        :param xml: pre-parsed element, used only when ``raw`` is None
        :raises JATSException: if ``raw`` cannot be parsed
        """
        self.raw = None
        self.xml = None
        if raw is not None:
            self.raw = raw
            try:
                self.xml = etree.fromstring(self.raw)
            except Exception as e:
                # Deliberately not a bare ``except``: KeyboardInterrupt and
                # SystemExit must propagate untouched.  The original parser
                # error is chained for debugging.
                raise JATSException("Unable to parse XML", self.raw) from e
        elif xml is not None:
            self.xml = xml

    @property
    def title(self):
        return xutil.xp_first_text(self.xml, "//title-group/article-title")

    @property
    def is_aam(self):
        """True if the document has an article-id with pub-id-type 'manuscript'."""
        manuscripts = self.xml.xpath("//article-id[@pub-id-type='manuscript']")
        return len(manuscripts) > 0

    def get_licence_details(self):
        """Describe the first <license> element in the document.

        :return: tuple ``(licence_type, url, text)`` where ``licence_type`` is
            the ``license-type`` attribute, ``url`` the xlink ``href``
            attribute and ``text`` the concatenated serialisation of the
            license-p elements (a str, "" if there are none); or
            ``(None, None, None)`` when the document has no <license>.
        """
        licences = self.xml.xpath("//license")
        if not licences:
            return None, None, None
        licence = licences[0]
        licence_type = licence.get("license-type")
        url = licence.get("{http://www.w3.org/1999/xlink}href")

        # get the paragraph(s) describing the licence.  encoding="unicode"
        # makes lxml return str; the default returns bytes, which cannot be
        # concatenated onto a str on Python 3.
        paragraphs = self.xml.xpath("//license/license-p")
        text = "".join(etree.tostring(p, encoding="unicode") for p in paragraphs)

        return licence_type, url, text

    @property
    def copyright_statement(self):
        return xutil.xp_first_text(self.xml, "//copyright-statement")

    @property
    def categories(self):
        return xutil.xp_texts(self.xml, "//article-categories/subj-group/subject")

    @property
    def authors(self):
        """Contributor dicts (see ``_make_contribs``) for contribs of type 'author'."""
        aels = self.xml.xpath("//contrib-group/contrib[@contrib-type='author']")
        return self._make_contribs(aels)

    @property
    def contribs(self):
        """Contributor dicts (see ``_make_contribs``) for every contrib, whatever its type."""
        cs = self.xml.xpath("//contrib-group/contrib")
        return self._make_contribs(cs)

    @property
    def emails(self):
        return xutil.xp_texts(self.xml, "//email")

    @property
    def keywords(self):
        return xutil.xp_texts(self.xml, "//kwd-group/kwd")

    @property
    def publisher(self):
        return xutil.xp_first_text(self.xml, "//publisher/publisher-name")

    @property
    def publication_date(self):
        """Publication date as produced by ``_make_date``, or None if it cannot be determined."""
        # first look for an explicit publication date
        pds = self.xml.xpath("//article-meta/pub-date[@date-type='pub']")
        if len(pds) > 0:
            return self._make_date(pds[0])

        # if not, look for exactly one pub-date and use that
        pds = self.xml.xpath("//article-meta/pub-date")
        if len(pds) == 1:
            return self._make_date(pds[0])

        # otherwise, insufficient information
        return None

    @property
    def date_accepted(self):
        """Date from the first history/date of type 'accepted', or None if there is none."""
        das = self.xml.xpath("//history/date[@date-type='accepted']")
        if len(das) > 0:
            return self._make_date(das[0])
        return None

    @property
    def date_submitted(self):
        """Date from the first history/date of type 'received', or None if there is none."""
        rcs = self.xml.xpath("//history/date[@date-type='received']")
        if len(rcs) > 0:
            return self._make_date(rcs[0])
        return None

    @property
    def issn(self):
        return xutil.xp_texts(self.xml, "//journal-meta/issn")

    @property
    def pmcid(self):
        """The article's pmcid, with a "PMC" prefix added when the source omits it."""
        pmcid = xutil.xp_first_text(self.xml, "//article-meta/article-id[@pub-id-type='pmcid']")
        if pmcid is not None and not pmcid.startswith("PMC"):
            pmcid = "PMC" + pmcid
        return pmcid

    @property
    def doi(self):
        return xutil.xp_first_text(self.xml, "//article-meta/article-id[@pub-id-type='doi']")

    def _make_date(self, element):
        """Build a "year-month-day" string from a date-like element.

        Missing month/day default to "01" and one-character values are
        left-padded with "0".  Returns None unless the year is exactly four
        characters long.

        NOTE(review): assumes ``xutil.objectify`` returns a dict-like mapping
        of child names to string values - confirm against portality.lib.xmlutil.
        """
        ob = xutil.objectify(element)
        year = ob.get("year")
        month = ob.get("month", "01")
        day = ob.get("day", "01")
        if len(month) < 2:
            month = "0" + month
        if len(day) < 2:
            day = "0" + day
        if year is None or len(year) != 4:
            return None
        return year + "-" + month + "-" + day

    @staticmethod
    def _normalised_text(element):
        """Return all text beneath ``element`` with whitespace runs collapsed to single spaces."""
        return " ".join(element.xpath("string()").split())

    def _make_contribs(self, elements):
        """Convert contrib elements into plain dicts.

        Each dict may contain ``surname``, ``given-names``, ``email`` and
        ``affiliations`` (a list of strings).  Elements that yield none of
        these are left out of the result.

        :param elements: iterable of contrib elements
        :return: list of dicts, in the order of ``elements``
        """
        obs = []

        for c in elements:
            con = {}

            # first see if there is a name we can pull out
            name = c.find("name")
            if name is not None:
                sn = name.find("surname")
                if sn is not None:
                    con["surname"] = sn.text

                gn = name.find("given-names")
                if gn is not None:
                    con["given-names"] = gn.text

            # see if there's an email address
            email = c.find("email")
            if email is not None:
                con["email"] = email.text

            # now do the affiliations (by value and by (x)reference)
            affs = []

            aff = c.find("aff")
            if aff is not None:
                affs.append(self._normalised_text(aff))

            for x in c.findall("xref"):
                if x.get("ref-type") != "aff":
                    continue
                affid = x.get("rid")
                if affid is None:
                    # An xref with no target cannot be resolved; skip it
                    # rather than fail the whole document.
                    continue
                # Bind the id as an XPath variable instead of splicing it into
                # the expression, so ids containing quotes cannot break it.
                for ae in self.xml.xpath("//aff[@id=$affid]", affid=affid):
                    affs.append(self._normalised_text(ae))

            if affs:
                con["affiliations"] = affs

            if con:
                obs.append(con)

        return obs

    def tostring(self):
        """Return the raw source if there is one, otherwise the serialised tree.

        Returns None when the object holds neither.
        """
        if self.raw is not None:
            return self.raw
        if self.xml is not None:
            return etree.tostring(self.xml)
        return None