Coverage for portality/crosswalks/article_crossref_xml.py

1from portality.core import app

2from lxml import etree

3import re

4from portality.bll import exceptions

5from portality.crosswalks.exceptions import CrosswalkException

6from portality import models

7from portality.ui.messages import Messages

class CrossrefXWalk442(object):
    """
    Crosswalk from Crossref 4.4.2 deposit XML to ``models.Article`` objects.

    ~~Crossref442XML:Crosswalk->Crossref442:Feature~~

    Example record:

    <doi_batch version="4.4.2" xmlns="http://www.crossref.org/schema/4.4.2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.crossref.org/schema/4.3.7 http://www.crossref.org/schema/deposit/crossref4.3.7.xsd">
        <head>
            <doi_batch_id>1dbb27d1030c6c9d9d-7ff0</doi_batch_id>
            <timestamp>200504260247</timestamp>
            <depositor>
                <depositor_name>your name</depositor_name>
                <email_address>your@email.com</email_address>
            </depositor>
            <registrant>WEB-FORM</registrant>
        </head>
        <body>
            <journal>
                <journal_metadata>
                    <full_title>Test Publication</full_title>
                    <abbrev_title>TP</abbrev_title>
                    <issn media_type="print">2073-9813</issn>
                </journal_metadata>
                <journal_issue>
                    <publication_date media_type="print">
                        <month>12</month>
                        <day>1</day>
                        <year>2005</year>
                    </publication_date>
                    <journal_volume>
                        <volume>12</volume>
                    </journal_volume>
                    <issue>1</issue>
                </journal_issue>

                <journal_article publication_type="full_text">
                    <titles>
                        <title>First Article</title>
                    </titles>
                    <contributors>
                        <person_name sequence="first" contributor_role="author">
                            <given_name>Bob</given_name>
                            <surname>Surname</surname>
                            <ORCID>http://orcid.org/0000-0002-4011-3590</ORCID>
                        </person_name>
                    </contributors>
                    <publication_date media_type="print">
                        <month>12</month>
                        <day>1</day>
                        <year>2004</year>
                    </publication_date>
                    <pages>
                        <first_page>100</first_page>
                        <last_page>200</last_page>
                    </pages>
                    <doi_data>
                        <doi>10.50505/test_20051229930</doi>
                        <resource>http://www.crossref.org/</resource>
                    </doi_data>

                    <citation_list>
                        <citation key="ref1">
                            <journal_title>Current Opinion in Oncology</journal_title>
                            <author>Chauncey</author>
                            <volume>13</volume>
                            <first_page>21</first_page>
                            <cYear>2001</cYear>
                        </citation>
                        <citation key="ref2">
                            <doi>10.5555/small_md_0001</doi>
                        </citation>
                        <citation key="ref=3">
                            <unstructured_citation>Clow GD, McKay CP, Simmons Jr. GM, and Wharton RA, Jr. 1988. Climatological observations and predicted sublimation rates at Lake Hoare, Antarctica. Journal of Climate 1:715-728.</unstructured_citation>
                        </citation>
                    </citation_list>
                </journal_article>
            </journal>
        </body>
    </doi_batch>
    """
    # key into app.config["SCHEMAS"] for this crosswalk's schema path
    format_name = "crossref442"
    # prefix map used by every find/findall call: x = Crossref 4.4.2, j = JATS
    NS = {'x': 'http://www.crossref.org/schema/4.4.2', 'j': 'http://www.ncbi.nlm.nih.gov/JATS1'}

    def __init__(self):
        """
        Set up the crosswalk and bind the schema held in the application config.

        :raises exceptions.IngestException: if no schema path is configured for ``format_name``
        """
        import time  # function-scope import: only needed by the polling loop below

        self.validation_log = ""
        self.schema_path = app.config.get("SCHEMAS", {}).get(self.format_name)

        # the schema is kept in memory for more efficient usage in repeat calls to the crosswalk
        if self.schema_path is None:
            raise exceptions.IngestException(
                message="Unable to validate for " + self.format_name + ", as schema path is not set in config")

        # NOTE(review): presumably the schema object is placed into app.config by some other
        # thread at startup - confirm.  Sleep between polls rather than spinning at full CPU.
        while app.config["CROSSREF442_SCHEMA"] is None:
            time.sleep(0.1)

        # ~~->CrossrefXML:Schema~~
        self.schema = app.config["CROSSREF442_SCHEMA"]

    def validate_file(self, file_handle):
        """
        Parse the supplied file as XML and validate it against the schema.

        :param file_handle: file-like object (or path) accepted by ``etree.parse``
        :return: the parsed document
        :raises CrosswalkException: if the file cannot be parsed or does not validate
        """
        # first try to parse the file
        try:
            doc = etree.parse(file_handle)
        except etree.XMLSyntaxError as e:
            # although the treatment is the same as the generic case, this is pulled out so we
            # remember what the primary kind of exception should be
            raise CrosswalkException(message="Unable to parse XML file", inner=e)
        except UnicodeDecodeError as e:
            msg = 'Text decode failed, expected utf-8 encoded XML.'
            raise CrosswalkException(message='Unable to parse XML file', inner=e, inner_message=msg)
        except Exception as e:
            raise CrosswalkException(message="Unable to parse XML file", inner=e)

        # then pass the doc to the validator
        valid = self.validate(doc)

        if not valid:
            msg = "Validation message from schema '{x}': {y}\n".format(x=self.format_name,
                                                                       y=self.validation_log)
            raise CrosswalkException(message="Unable to validate document with identified schema", inner_message=msg)

        return doc

    def validate(self, doc):
        """
        Validate a parsed document against ``self.schema``.

        On failure the schema's error log is stored in ``self.validation_log``.

        :param doc: parsed XML document
        :return: the result of ``self.schema.validate(doc)``
        """
        valid = self.schema.validate(doc)
        if not valid:
            el = repr(self.schema.error_log)
            # strip the filename, as we don't want to leak the path to the UI
            # (raw string: "\d" in a plain string literal is an invalid escape sequence)
            rx = r"[\da-f]{32}.xml:(.*)"
            match = re.search(rx, el)
            if match is not None:
                el = match.group(1)
            self.validation_log = el
        return valid

    def crosswalk_file(self, file_handle, add_journal_info):
        """
        Validate the file and crosswalk every article it contains.

        :param file_handle: file-like object (or path) accepted by ``etree.parse``
        :param add_journal_info: not used by this implementation; kept so the call signature is unchanged
        :return: list of crosswalked articles
        :raises CrosswalkException: if the file cannot be parsed or does not validate
        """
        doc = self.validate_file(file_handle)
        return self.crosswalk_doc(doc)

    def crosswalk_doc(self, doc):
        """
        Crosswalk each ``journal_article`` of each ``journal`` in the document body.

        :param doc: parsed XML document
        :return: list of articles, empty if the document has no ``body`` element
        """
        articles = []
        body = doc.getroot().find("x:body", self.NS)
        if body is None:
            # nothing to crosswalk; avoids an AttributeError on documents without a body
            return articles

        # go through the records in the doc and crosswalk each one individually
        for journal in body.findall("x:journal", self.NS):
            for record in journal.findall("x:journal_article", self.NS):
                articles.append(self.crosswalk_article(record, journal))

        return articles

    def crosswalk_article(self, record, journal):
        """
        Build one article from a ``journal_article`` element and its parent ``journal`` element.

        :param record: the ``journal_article`` element
        :param journal: the ``journal`` element containing the record
        :return: the populated article
        """
        article = models.Article()  # ~~->Article:Model~~
        bibjson = article.bibjson()

        self.extract_journal_title(journal, bibjson)
        self.extract_issns(journal, bibjson)
        self.extract_publication_date(record, journal, bibjson)
        self.extract_volume(journal, bibjson)
        self.extract_issue(journal, bibjson)
        self.extract_pages(record, journal, bibjson)
        self.extract_doi(record, journal, bibjson)
        self.extract_fulltext(record, journal, bibjson)
        self.extract_article_title(record, journal, bibjson)
        self.extract_authors(record, journal, bibjson)
        self.extract_abstract(record, journal, bibjson)

        return article

    ###############################################################################
    ## extractors
    ###############################################################################

    def extract_journal_title(self, journal, bibjson):
        """Copy ``journal_metadata/full_title`` to ``bibjson.journal_title`` when present."""
        jm = journal.find("x:journal_metadata", self.NS)
        if jm is not None:
            jt = _element(jm, "x:full_title", self.NS)
            if jt is not None:
                bibjson.journal_title = jt

    def extract_issns(self, journal, bibjson):
        """
        Add the journal's ISSNs (upper-cased) to ``bibjson`` as print / electronic identifiers.

        A lone ISSN without a ``media_type`` is treated as electronic.  With two ISSNs, a
        missing ``media_type`` is inferred as the opposite of the other one; if neither has
        a type, the first is treated as electronic and the second as print.

        :raises CrosswalkException: on more than two ISSNs, two ISSNs of the same declared
            type, or two ISSNs with the same value
        """
        md = journal.find("x:journal_metadata", self.NS)
        if md is None:
            return

        issns = md.findall("x:issn", self.NS)

        # if more than 2 issns raise the exception
        if len(issns) > 2:
            raise CrosswalkException(message=Messages.EXCEPTION_TOO_MANY_ISSNS)

        if len(issns) == 1:
            # .get() so that an element carrying other attributes but no media_type
            # is handled like an untyped one instead of raising KeyError
            media_type = issns[0].attrib.get("media_type")
            if media_type is None or media_type == 'electronic':
                bibjson.add_identifier(bibjson.E_ISSN, issns[0].text.upper())
            elif media_type == 'print':
                bibjson.add_identifier(bibjson.P_ISSN, issns[0].text.upper())

        elif len(issns) == 2:
            # None marks "no media_type declared"
            attrs = [issns[0].attrib.get("media_type"), issns[1].attrib.get("media_type")]

            # if both issns have the same type - raise the exception
            if attrs[0] is not None and attrs[0] == attrs[1]:
                raise CrosswalkException(
                    message=Messages.EXCEPTION_ISSNS_OF_THE_SAME_TYPE.format(type=attrs[1]))

            # if both issns have the same value - raise the exception
            if issns[0].text.upper() == issns[1].text.upper():
                raise CrosswalkException(
                    message=Messages.EXCEPTION_IDENTICAL_PISSN_AND_EISSN.format(value=issns[0].text.upper()))

            if bool(attrs[0]) != bool(attrs[1]):
                # exactly one type is known: the other ISSN gets the opposite type
                if attrs[0] is not None:
                    attrs[1] = "print" if attrs[0] == "electronic" else "electronic"
                else:
                    attrs[0] = "print" if attrs[1] == "electronic" else "electronic"
            elif attrs[0] is None:
                # neither type is known: fall back to electronic first, print second
                attrs[0] = "electronic"
                attrs[1] = "print"

            bibjson.add_identifier(bibjson.P_ISSN if attrs[0] == "print" else bibjson.E_ISSN, issns[0].text.upper())
            bibjson.add_identifier(bibjson.P_ISSN if attrs[1] == "print" else bibjson.E_ISSN, issns[1].text.upper())

    def extract_publication_date(self, record, journal, bibjson):
        """Set ``bibjson.year`` and ``bibjson.month`` from the article's first ``publication_date``."""
        pd = record.find("x:publication_date", self.NS)

        if pd is not None:
            # either value may be None if the corresponding child element is absent
            bibjson.year = _element(pd, "x:year", self.NS)
            bibjson.month = _element(pd, "x:month", self.NS)

    def extract_volume(self, journal, bibjson):
        """Copy ``journal_issue/journal_volume/volume`` to ``bibjson.volume`` when present."""
        issue = journal.find("x:journal_issue", self.NS)

        if issue is not None:
            vol = issue.find("x:journal_volume", self.NS)
            if vol is not None:
                volume = _element(vol, "x:volume", self.NS)
                if volume is not None:
                    bibjson.volume = volume

    def extract_issue(self, journal, bibjson):
        """Copy ``journal_issue/issue`` to ``bibjson.number`` when present."""
        issue = journal.find("x:journal_issue", self.NS)
        if issue is not None:
            number = _element(issue, "x:issue", self.NS)
            if number is not None:
                bibjson.number = number

    def extract_pages(self, record, journal, bibjson):
        """Copy ``pages/first_page`` and ``pages/last_page`` to the bibjson start/end page."""
        pages = record.find('x:pages', self.NS)
        if pages is not None:
            # start page
            sp = _element(pages, "x:first_page", self.NS)
            if sp is not None:
                bibjson.start_page = sp

            # end page
            ep = _element(pages, "x:last_page", self.NS)
            if ep is not None:
                bibjson.end_page = ep

    def extract_doi(self, record, journal, bibjson):
        """Add ``doi_data/doi`` to ``bibjson`` as a DOI identifier when present."""
        d = record.find("x:doi_data", self.NS)
        if d is not None:
            # doi
            doi = _element(d, "x:doi", self.NS)
            if doi is not None:
                bibjson.add_identifier(bibjson.DOI, doi)

    def extract_fulltext(self, record, journal, bibjson):
        """Add ``doi_data/resource`` to ``bibjson`` as the fulltext URL when present."""
        d = record.find("x:doi_data", self.NS)
        if d is None:
            # same guard as extract_doi: a record without doi_data has no fulltext URL
            return
        ftel = _element(d, "x:resource", self.NS)
        if ftel is not None:
            # NOTE(review): the namespace map is passed as the third argument to add_url,
            # which looks unintended - confirm against add_url's signature before changing
            bibjson.add_url(ftel, "fulltext", self.NS)

    def extract_article_title(self, record, journal, bibjson):
        """Copy ``titles/title`` to ``bibjson.title`` when present."""
        titles = record.find('x:titles', self.NS)
        if titles is not None:
            title = _element(titles, "x:title", self.NS)
            if title is not None:
                bibjson.title = title

    def extract_authors(self, record, journal, bibjson):
        """
        Add every ``person_name`` with ``contributor_role="author"`` to ``bibjson``.

        The name is "given_name surname" (or whichever of the two is present), with the
        ``affiliation`` and ``ORCID`` child elements passed alongside when present.

        :raises CrosswalkException: if the record has no ``contributors`` element
        """
        contributors = record.find("x:contributors", self.NS)
        if contributors is None:
            raise CrosswalkException(message=Messages.EXCEPTION_NO_CONTRIBUTORS_FOUND,
                                     inner_message=Messages.EXCEPTION_NO_CONTRIBUTORS_EXPLANATION)

        for ctb in contributors.findall("x:person_name", self.NS):
            # .get() so a missing contributor_role skips the person instead of raising KeyError
            if ctb.attrib.get("contributor_role") != 'author':
                continue

            surname = _element(ctb, "x:surname", self.NS)
            given = _element(ctb, "x:given_name", self.NS)
            # join only the parts that exist, so a given name without a surname
            # no longer raises TypeError on str + None
            parts = [p for p in (given, surname) if p]
            name = ' '.join(parts) if parts else None

            affiliation = _element(ctb, "x:affiliation", self.NS)
            orcid = _element(ctb, "x:ORCID", self.NS)
            bibjson.add_author(name, affiliation, orcid)

    def extract_abstract(self, record, journal, bibjson):
        """
        Set ``bibjson.abstract`` to the text content of the record's JATS ``abstract`` element.

        Markup inside the abstract is dropped; only its text is kept, truncated to
        30000 characters.
        """
        abstract_par = record.find("j:abstract", self.NS)
        if abstract_par is not None:
            # itertext() yields the text of every element AND the text following each nested
            # element (its tail), so "Hello <i>world</i> again" keeps " again"
            text = "".join(abstract_par.itertext())
            bibjson.abstract = text[:30000]  # avoids Elasticsearch
            # exceptions about .exact analyser not being able to handle
            # more than 32766 UTF8 characters

330

###############################################################################
## Crossref 5.3.1 Xwalk
###############################################################################

334

class CrossrefXWalk531(CrossrefXWalk442):
    """
    Crosswalk for Crossref 5.3.1 deposit XML.

    Reuses the 4.4.2 crosswalk, swapping in the 5.3.1 namespace and schema and
    overriding author extraction for the 5.3.1 affiliation structure.

    ~~Crossref531XML:Crosswalk->Crossref531:Feature~~
    ~~->Crossref422XML:Crosswalk~~
    """
    # key into app.config["SCHEMAS"] for this crosswalk's schema path
    format_name = "crossref531"
    # prefix map used by every find/findall call: x = Crossref 5.3.1, j = JATS
    NS = {'x': 'http://www.crossref.org/schema/5.3.1', 'j': 'http://www.ncbi.nlm.nih.gov/JATS1'}

    def __init__(self):
        """
        Run the parent set-up, then bind the 5.3.1 schema from the application config.

        :raises exceptions.IngestException: (from the parent) if no schema path is
            configured for ``format_name``
        """
        import time  # function-scope import: only needed by the polling loop below

        super(CrossrefXWalk531, self).__init__()

        # NOTE(review): presumably the schema object is placed into app.config by some other
        # thread at startup - confirm.  Sleep between polls rather than spinning at full CPU.
        while app.config["CROSSREF531_SCHEMA"] is None:
            time.sleep(0.1)

        # replaces the schema bound by the parent constructor
        self.schema = app.config["CROSSREF531_SCHEMA"]

    def extract_authors(self, record, journal, bibjson):
        """
        Add every ``person_name`` with ``contributor_role="author"`` to ``bibjson``.

        The name is "given_name surname" (or whichever of the two is present).  The
        affiliation is taken from ``affiliations/institution/institution_name``.

        :raises CrosswalkException: if the record has no ``contributors`` element
        """
        contributors = record.find("x:contributors", self.NS)
        if contributors is None:
            raise CrosswalkException(message=Messages.EXCEPTION_NO_CONTRIBUTORS_FOUND,
                                     inner_message=Messages.EXCEPTION_NO_CONTRIBUTORS_EXPLANATION)

        for ctb in contributors.findall("x:person_name", self.NS):
            # .get() so a missing contributor_role skips the person instead of raising KeyError
            if ctb.attrib.get("contributor_role") != 'author':
                continue

            surname = _element(ctb, "x:surname", self.NS)
            given = _element(ctb, "x:given_name", self.NS)
            # join only the parts that exist, so a given name without a surname
            # no longer raises TypeError on str + None
            parts = [p for p in (given, surname) if p]
            name = ' '.join(parts) if parts else None

            # only first affiliation is supported even if multiple are provided
            affiliation = None
            affs = ctb.find("x:affiliations", self.NS)
            if affs is not None:
                institution = affs.find("x:institution", self.NS)
                if institution is not None:
                    affiliation = _element(institution, "x:institution_name", self.NS)

            orcid = _element(ctb, "x:ORCID", self.NS)
            bibjson.add_author(name, affiliation, orcid)

376

###############################################################################
## some convenient utilities
###############################################################################

380

def _element(xml, field, namespace):
    """
    Return the inner content of the first child of ``xml`` matching ``field``, as a string.

    The child is serialized and its own start and end tags are cut off, so any markup
    embedded in the content (html tags, etc.) is kept as text.

    :param xml: element to search in
    :param field: path of the child element, using prefixes from ``namespace``
    :param namespace: prefix-to-URI map passed to ``find``
    :return: the inner content, or None if the child is absent or empty
    """
    el = xml.find(field, namespace)
    if el is None:
        return None

    # etree.tostring doesn't actually produce a string, but a byte array, so we must specify
    # the encoding and THEN also decode it using that same encoding to get an actual string.
    # with_tail=False keeps whitespace/text following the element out of the serialization.
    string = etree.tostring(el, encoding="utf-8", with_tail=False).decode("utf-8")
    start = string.index(">") + 1
    end = string.rfind('</')
    if end < start:
        # an empty element is serialized self-closing ("<tag/>"), which has no closing
        # tag to cut at; report it as "no content" rather than raising ValueError
        return None

    text = string[start:end]
    return text if text else None

Coverage for portality/crosswalks/article_crossref_xml.py: 96%

223 statements