Coverage for portality/tasks/harvester_helpers/epmc/models.py: 45%

1from portality.lib import dataobj

2from portality.lib import xmlutil as xutil

3from lxml import etree

class JATSException(Exception):
    """Error raised when an XML document cannot be processed.

    Besides the usual exception message, the instance keeps the source that
    triggered the failure on its ``raw`` attribute so callers can log or
    inspect it.
    """

    def __init__(self, message, rawstring, *args, **kwargs):
        """
        :param message: human-readable description of the failure
        :param rawstring: the source text the failure relates to
        """
        # Record the offending source first, then let Exception handle the
        # message and any extra arguments.
        self.raw = rawstring
        super(JATSException, self).__init__(message, *args, **kwargs)

class EPMCFullTextException(JATSException):
    """Legacy name for ``JATSException``.

    Retained for backwards compatibility only; it adds no behaviour of its
    own on top of the parent class.
    """

class EPMCMetadataException(Exception):
    """Error relating to an EPMC metadata record.

    The source the error relates to is kept on the ``raw`` attribute.
    """

    def __init__(self, message, rawstring, *args, **kwargs):
        """
        :param message: human-readable description of the failure
        :param rawstring: the source text the failure relates to
        """
        # Keep hold of the source before delegating to Exception.
        self.raw = rawstring
        super(EPMCMetadataException, self).__init__(message, *args, **kwargs)

class EPMCMetadataXML(object):
    """Read-only accessors over an EPMC metadata record held as XML.

    An instance is built either from serialised XML (``raw``), which is
    parsed with lxml, or from an already-parsed element (``xml``).  The
    properties pull individual fields out of the tree with XPath.
    """

    def __init__(self, raw=None, xml=None):
        """
        :param raw: serialised XML to parse; takes precedence over ``xml``
        :param xml: already-parsed element, used as-is when ``raw`` is None
        :raises JATSException: if ``raw`` cannot be parsed; the parser error
            is attached as the exception's cause
        """
        self.raw = None
        self.xml = None
        if raw is not None:
            self.raw = raw
            try:
                self.xml = etree.fromstring(self.raw)
            except Exception as e:
                # Deliberately not a bare ``except``: KeyboardInterrupt and
                # SystemExit must propagate untouched.  The exception type
                # raised is unchanged so existing callers keep working.
                raise JATSException("Unable to parse XML", self.raw) from e
        elif xml is not None:
            self.xml = xml

    def tostring(self):
        """Return the record as serialised XML.

        The original ``raw`` input is returned when there is one; otherwise
        the parsed tree is serialised with ``etree.tostring``.  Returns None
        when the object holds neither.
        """
        if self.raw is not None:
            return self.raw
        if self.xml is not None:
            return etree.tostring(self.xml)
        return None

    @staticmethod
    def _child_texts(element, tags):
        """Map each tag in ``tags`` that is a direct child of ``element`` to its text.

        Tags with no matching child are left out; a matching child with no
        text is recorded with the value None.
        """
        found = {}
        for tag in tags:
            child = element.find(tag)
            if child is not None:
                found[tag] = child.text
        return found

    @property
    def title(self):
        """First text found for the ``title`` path."""
        return xutil.xp_first_text(self.xml, "title")

    @property
    def publication_type(self):
        """First text found for ``//pubTypeList/pubType``."""
        return xutil.xp_first_text(self.xml, "//pubTypeList/pubType")

    @property
    def language(self):
        """First text found for the ``language`` path."""
        return xutil.xp_first_text(self.xml, "language")

    @property
    def publication_date(self):
        """Best available publication date.

        Tries ``firstPublicationDate``, then ``electronicPublicationDate``,
        and finally ``//journalInfo/printPublicationDate``, returning the
        first one that yields a value (or whatever the last lookup gives).
        """
        for path in ("firstPublicationDate", "electronicPublicationDate"):
            pd = xutil.xp_first_text(self.xml, path)
            if pd is not None:
                return pd
        return xutil.xp_first_text(self.xml, "//journalInfo/printPublicationDate")

    @property
    def pmid(self):
        """First text found for the ``pmid`` path."""
        return xutil.xp_first_text(self.xml, "pmid")

    @property
    def pmcid(self):
        """First text found for the ``pmcid`` path."""
        return xutil.xp_first_text(self.xml, "pmcid")

    @property
    def doi(self):
        """First text found for the ``DOI`` path."""
        return xutil.xp_first_text(self.xml, "DOI")

    @property
    def issns(self):
        """List of the journal's ISSN then ESSN, omitting any that are absent."""
        candidates = (
            xutil.xp_first_text(self.xml, "//journalInfo/journal/ISSN"),
            xutil.xp_first_text(self.xml, "//journalInfo/journal/ESSN"),
        )
        return [value for value in candidates if value is not None]

    @property
    def keywords(self):
        """Texts found for ``//keywordList/keyword``."""
        return xutil.xp_texts(self.xml, "//keywordList/keyword")

    @property
    def author_string(self):
        """First text found for ``//authorString``."""
        return xutil.xp_first_text(self.xml, "//authorString")

    @property
    def authors(self):
        """
        List of dicts, one per ``//authorList/author`` element, keyed by the
        child tags that are present.  Authors with none of the known children
        are skipped.  Example source for one author:

        <fullName>Cerasoli E</fullName>
        <firstName>Eleonora</firstName>
        <lastName>Cerasoli</lastName>
        <initials>E</initials>
        <affiliation>Biotechnology Department, National Physical Laboratory Teddington, UK.</affiliation>
        """
        fields = ("fullName", "firstName", "lastName", "initials", "affiliation")
        records = (
            self._child_texts(ael, fields)
            for ael in self.xml.xpath("//authorList/author")
        )
        return [record for record in records if record]

    @property
    def grants(self):
        """List of dicts with ``grantId`` and/or ``agency`` for each
        ``//grantsList/grant`` element; grants with neither are skipped."""
        fields = ("grantId", "agency")
        records = (
            self._child_texts(gel, fields)
            for gel in self.xml.xpath("//grantsList/grant")
        )
        return [record for record in records if record]

    @property
    def mesh_descriptors(self):
        """Texts found for ``//meshHeadingList/meshHeading/descriptorName``."""
        return xutil.xp_texts(self.xml, "//meshHeadingList/meshHeading/descriptorName")

160

161

class EPMCMetadata(dataobj.DataObj):
    """Accessors over an EPMC metadata record wrapped as a ``DataObj``.

    Each property reads one path from the underlying record through the
    ``_get_single`` / ``_get_list`` helpers inherited from ``DataObj``.
    """

    def __init__(self, raw):
        """
        :param raw: the record to wrap; handed straight to ``DataObj``
        """
        super(EPMCMetadata, self).__init__(raw)

    # -- identifiers -------------------------------------------------------

    @property
    def pmcid(self):
        """Value at ``pmcid``."""
        return self._get_single("pmcid", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def pmid(self):
        """Value at ``pmid``."""
        return self._get_single("pmid", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def doi(self):
        """Value at ``doi``."""
        return self._get_single("doi", self._utf8_unicode(), allow_coerce_failure=False)

    # -- flags -------------------------------------------------------------

    @property
    def in_epmc(self):
        """Value at ``inEPMC``."""
        return self._get_single("inEPMC", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def is_oa(self):
        """Value at ``isOpenAccess``."""
        return self._get_single("isOpenAccess", self._utf8_unicode(), allow_coerce_failure=False)

    # -- journal / bibliographic details -----------------------------------

    @property
    def issn(self):
        """Value at ``journalInfo.journal.issn``."""
        return self._get_single("journalInfo.journal.issn", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def journal(self):
        """Value at ``journalInfo.journal.title``."""
        return self._get_single("journalInfo.journal.title", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def essn(self):
        """Value at ``journalInfo.journal.essn``."""
        return self._get_single("journalInfo.journal.essn", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def title(self):
        """Value at ``title``."""
        return self._get_single("title", self._utf8_unicode(), allow_coerce_failure=False)

    @property
    def journal_volume(self):
        """Value at ``journalInfo.volume``."""
        return self._get_single("journalInfo.volume", self._utf8_unicode())

    @property
    def journal_issue(self):
        """Value at ``journalInfo.issue``."""
        return self._get_single("journalInfo.issue", self._utf8_unicode())

    @property
    def language(self):
        """Value at ``language``."""
        return self._get_single("language", self._utf8_unicode())

    @property
    def month_of_publication(self):
        """Value at ``journalInfo.monthOfPublication``, coerced with ``dataobj.to_int()``."""
        return self._get_single("journalInfo.monthOfPublication", dataobj.to_int())

    @property
    def year_of_publication(self):
        """Value at ``journalInfo.yearOfPublication``, coerced with ``dataobj.to_int()``."""
        return self._get_single("journalInfo.yearOfPublication", dataobj.to_int())

    # -- pagination --------------------------------------------------------

    @property
    def page_info(self):
        """Value at ``pageInfo``."""
        return self._get_single("pageInfo", self._utf8_unicode())

    @property
    def start_page(self):
        """Part of ``page_info`` before the first ``-``; None if there is no page info."""
        pages = self.page_info
        if pages is None:
            return None
        # str.split always yields at least one element, so [0] is safe.
        return pages.split("-")[0]

    @property
    def end_page(self):
        """Second ``-``-separated part of ``page_info``; None if absent."""
        pages = self.page_info
        if pages is None:
            return None
        parts = pages.split("-")
        return parts[1] if len(parts) > 1 else None

    # -- full text ---------------------------------------------------------

    @property
    def fulltext_urls(self):
        """List at ``fullTextUrlList.fullTextUrl``."""
        return self._get_list("fullTextUrlList.fullTextUrl")

    def get_first_fulltext_url(self, availability=None, document_style=None, site=None):
        """Return the ``url`` of the first full-text entry matching the filters.

        :param availability: required ``availability`` value, or None for any
        :param document_style: required ``documentStyle`` value, or None for any
        :param site: required ``site`` value, or None for any
        :return: the matching entry's ``url``, or None when nothing matches
        """
        criteria = (
            ("availability", availability),
            ("documentStyle", document_style),
            ("site", site),
        )
        for entry in self.fulltext_urls:
            # A criterion left as None places no constraint on the entry.
            if all(wanted is None or wanted == entry.get(key) for key, wanted in criteria):
                return entry.get("url")
        return None

    # -- content / contributors --------------------------------------------

    @property
    def abstract(self):
        """Value at ``abstractText``."""
        return self._get_single("abstractText", self._utf8_unicode())

    @property
    def authors(self):
        """List at ``authorList.author``."""
        return self._get_list("authorList.author")

    @property
    def author_string(self):
        """Value at ``authorString`` (read without a coercion function)."""
        return self._get_single("authorString")

272

273

class JATS(object):
    """Accessors over an article held as JATS-style XML.

    An instance is built either from serialised XML (``raw``), which is
    parsed with lxml, or from an already-parsed element (``xml``).  The
    properties and helpers pull fields out of the tree with XPath.
    """

    def __init__(self, raw=None, xml=None):
        """
        :param raw: serialised XML to parse; takes precedence over ``xml``
        :param xml: already-parsed element, used as-is when ``raw`` is None
        :raises JATSException: if ``raw`` cannot be parsed; the parser error
            is attached as the exception's cause
        """
        self.raw = None
        self.xml = None
        if raw is not None:
            self.raw = raw
            try:
                self.xml = etree.fromstring(self.raw)
            except Exception as e:
                # Deliberately not a bare ``except``: KeyboardInterrupt and
                # SystemExit must propagate untouched.
                raise JATSException("Unable to parse XML", self.raw) from e
        elif xml is not None:
            self.xml = xml

    @property
    def title(self):
        """First text found for ``//title-group/article-title``."""
        return xutil.xp_first_text(self.xml, "//title-group/article-title")

    @property
    def is_aam(self):
        """True when the document has an ``article-id`` of type ``manuscript``."""
        manuscripts = self.xml.xpath("//article-id[@pub-id-type='manuscript']")
        return len(manuscripts) > 0

    def get_licence_details(self):
        """Return ``(licence_type, url, text)`` for the document's licence.

        ``licence_type`` and ``url`` come from the attributes of the first
        ``<license>`` element; ``text`` is the concatenated serialisation of
        every ``//license/license-p`` element.  Returns ``(None, None, None)``
        when the document has no ``<license>`` element.
        """
        licences = self.xml.xpath("//license")
        if len(licences) == 0:
            return None, None, None
        licence = licences[0]
        licence_type = licence.get("license-type")
        url = licence.get("{http://www.w3.org/1999/xlink}href")

        # get the paragraph(s) describing the licence.  encoding="unicode"
        # makes lxml return str; the default is bytes, which cannot be
        # concatenated onto a str.
        paragraphs = self.xml.xpath("//license/license-p")
        text = "".join(etree.tostring(p, encoding="unicode") for p in paragraphs)

        return licence_type, url, text

    @property
    def copyright_statement(self):
        """First text found for ``//copyright-statement``."""
        return xutil.xp_first_text(self.xml, "//copyright-statement")

    @property
    def categories(self):
        """Texts found for ``//article-categories/subj-group/subject``."""
        return xutil.xp_texts(self.xml, "//article-categories/subj-group/subject")

    @property
    def authors(self):
        """Contributor dicts for ``contrib`` elements of type ``author``."""
        aels = self.xml.xpath("//contrib-group/contrib[@contrib-type='author']")
        return self._make_contribs(aels)

    @property
    def contribs(self):
        """Contributor dicts for every ``//contrib-group/contrib`` element."""
        cs = self.xml.xpath("//contrib-group/contrib")
        return self._make_contribs(cs)

    @property
    def emails(self):
        """Texts found for ``//email``."""
        return xutil.xp_texts(self.xml, "//email")

    @property
    def keywords(self):
        """Texts found for ``//kwd-group/kwd``."""
        return xutil.xp_texts(self.xml, "//kwd-group/kwd")

    @property
    def publisher(self):
        """First text found for ``//publisher/publisher-name``."""
        return xutil.xp_first_text(self.xml, "//publisher/publisher-name")

    @property
    def publication_date(self):
        """Publication date as ``YYYY-MM-DD``, or None if it cannot be determined."""
        # first look for an explicit publication date
        pds = self.xml.xpath("//article-meta/pub-date[@date-type='pub']")
        if len(pds) > 0:
            return self._make_date(pds[0])

        # if not, look for exactly one pub-date and use that
        pds = self.xml.xpath("//article-meta/pub-date")
        if len(pds) == 1:
            return self._make_date(pds[0])

        # otherwise, insufficient information
        return None

    @property
    def date_accepted(self):
        """Date from the first ``accepted`` history entry, or None if absent."""
        das = self.xml.xpath("//history/date[@date-type='accepted']")
        if len(das) > 0:
            return self._make_date(das[0])
        return None

    @property
    def date_submitted(self):
        """Date from the first ``received`` history entry, or None if absent."""
        rcs = self.xml.xpath("//history/date[@date-type='received']")
        if len(rcs) > 0:
            return self._make_date(rcs[0])
        return None

    @property
    def issn(self):
        """Texts found for ``//journal-meta/issn``."""
        return xutil.xp_texts(self.xml, "//journal-meta/issn")

    @property
    def pmcid(self):
        """The ``pmcid`` article id, prefixed with ``PMC`` if not already; None if absent."""
        pmcid = xutil.xp_first_text(self.xml, "//article-meta/article-id[@pub-id-type='pmcid']")
        if pmcid is not None and not pmcid.startswith("PMC"):
            pmcid = "PMC" + pmcid
        return pmcid

    @property
    def doi(self):
        """First text found for the ``doi`` article id."""
        return xutil.xp_first_text(self.xml, "//article-meta/article-id[@pub-id-type='doi']")

    def _make_date(self, element):
        """Build a ``YYYY-MM-DD`` string from a date element.

        Month and day default to ``01`` when missing and are left-padded
        with a single ``0`` when shorter than two characters.  Returns None
        unless a four-character year is present.
        """
        # NOTE(review): assumes xutil.objectify returns a dict-like mapping of
        # child tag -> text for the element -- confirm against xmlutil.
        ob = xutil.objectify(element)

        year = ob.get("year")
        if year is None or len(year) != 4:
            # no usable year means no usable date, whatever else is present
            return None

        month = ob.get("month", "01")
        day = ob.get("day", "01")
        if len(month) < 2:
            month = "0" + month
        if len(day) < 2:
            day = "0" + day
        return year + "-" + month + "-" + day

    def _make_contribs(self, elements):
        """Convert ``contrib`` elements into a list of plain dicts.

        Each dict may carry ``surname``, ``given-names``, ``email`` and
        ``affiliations`` (a list of whitespace-normalised strings), depending
        on what the element provides.  Elements yielding nothing are skipped.
        """
        obs = []

        for c in elements:
            con = {}

            # first see if there is a name we can pull out
            name = c.find("name")
            if name is not None:
                sn = name.find("surname")
                if sn is not None:
                    con["surname"] = sn.text

                gn = name.find("given-names")
                if gn is not None:
                    con["given-names"] = gn.text

            # see if there's an email address
            email = c.find("email")
            if email is not None:
                con["email"] = email.text

            # now do the affiliations (by value and by (x)reference)
            affs = []

            aff = c.find("aff")
            if aff is not None:
                affs.append(" ".join(aff.xpath("string()").split()))

            for x in c.findall("xref"):
                if x.get("ref-type") != "aff":
                    continue
                affid = x.get("rid")
                if affid is None:
                    # an xref with no target cannot be resolved
                    continue
                # Pass the id as an XPath variable rather than splicing it
                # into the expression, so quotes in the id cannot break it.
                for ae in self.xml.xpath("//aff[@id=$affid]", affid=affid):
                    affs.append(" ".join(ae.xpath("string()").split()))

            if affs:
                con["affiliations"] = affs

            if con:
                obs.append(con)

        return obs

    def tostring(self):
        """Return the document as serialised XML.

        The original ``raw`` input is returned when there is one; otherwise
        the parsed tree is serialised with ``etree.tostring``.  Returns None
        when the object holds neither.
        """
        if self.raw is not None:
            return self.raw
        if self.xml is not None:
            return etree.tostring(self.xml)
        return None