Coverage for portality/models/article.py: 81% (716 statements)

import string
import warnings

from unidecode import unidecode
from functools import reduce
from copy import deepcopy
from datetime import datetime

from portality import datasets, constants
from portality.dao import DomainObject
from portality.models import Journal
from portality.models.v1.bibjson import GenericBibJSON  # NOTE that article specifically uses the v1 BibJSON
from portality.models.v1 import shared_structs
from portality.lib import normalise


class NoJournalException(Exception):
    pass


class NoValidOwnerException(Exception):
    pass


class Article(DomainObject):
    __type__ = "article"

    @classmethod
    def duplicates(cls, publisher_record_id=None, doi=None, fulltexts=None, title=None, volume=None, number=None, start=None, should_match=None, size=10):
        # some input sanitisation
        urls = fulltexts if isinstance(fulltexts, list) else [fulltexts] if isinstance(fulltexts, str) else []

        # make sure that we're dealing with the normal form of the identifiers
        norm_urls = []
        for url in urls:
            try:
                norm = normalise.normalise_url(url)
                norm_urls.append(norm)
            except ValueError:
                # use the non-normal form
                norm_urls.append(url)
        urls = norm_urls

        try:
            doi = normalise.normalise_doi(doi)
        except ValueError:
            # leave the doi as it is
            pass

        q = DuplicateArticleQuery(publisher_record_id=publisher_record_id,
                                  doi=doi,
                                  urls=urls,
                                  title=title,
                                  volume=volume,
                                  number=number,
                                  start=start,
                                  should_match=should_match,
                                  size=size)

        # res = cls.query(q=q.query())
        # return [cls(**hit.get("_source")) for hit in res.get("hits", {}).get("hits", [])]
        return cls.q2obj(q=q.query())
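
    # Illustrative usage (a sketch, not part of the original module): a
    # duplicate-detection task might look up candidate matches like
    #
    #   matches = Article.duplicates(doi="10.1234/example.doi",
    #                                fulltexts="https://example.com/article/1",
    #                                size=5)
    #
    # Raw DOIs and URLs are fine, since both are normalised before querying;
    # the identifier values above are hypothetical.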

    @classmethod
    def list_volumes(cls, issns):
        q = ArticleVolumesQuery(issns)
        result = cls.query(q=q.query())
        return _human_sort([t.get("key") for t in result.get("aggregations", {}).get("vols", {}).get("buckets", [])])

    @classmethod
    def list_volume_issues(cls, issns, volume):
        q = ArticleVolumesIssuesQuery(issns, volume)
        result = cls.query(q=q.query())
        return _human_sort([t.get("key") for t in result.get("aggregations", {}).get("issues", {}).get("buckets", [])])

    @classmethod
    def get_by_volume(cls, issns, volume):
        q = ArticleQuery(issns=issns, volume=volume)
        articles = cls.iterate(q.query(), page_size=1000)
        return articles

    @classmethod
    def find_by_issns(cls, issns):
        q = ArticleQuery(issns=issns)
        articles = cls.iterate(q.query(), page_size=1000)
        return articles

    @classmethod
    def count_by_issns(cls, issns):
        q = ArticleQuery(issns=issns)
        return cls.hit_count(q.query())

    @classmethod
    def delete_by_issns(cls, issns, snapshot=True):
        q = ArticleQuery(issns=issns)
        cls.delete_selected(query=q.query(), snapshot=snapshot)

    @classmethod
    def delete_selected(cls, query=None, owner=None, snapshot=True):
        if owner is not None:
            from portality.models import Journal
            issns = Journal.issns_by_owner(owner)
            q = ArticleQuery(issns=issns)
            query = q.query()

        if snapshot:
            articles = cls.iterate(query, page_size=1000)
            for article in articles:
                article.snapshot()
        return cls.delete_by_query(query)

    def bibjson(self, **kwargs):
        if "bibjson" not in self.data:
            self.data["bibjson"] = {}
        return ArticleBibJSON(self.data.get("bibjson"), **kwargs)

    def set_bibjson(self, bibjson):
        bibjson = bibjson.bibjson if isinstance(bibjson, ArticleBibJSON) else bibjson
        self.data["bibjson"] = bibjson

    def history(self):
        hs = self.data.get("history", [])
        tuples = []
        for h in hs:
            tuples.append((h.get("date"), ArticleBibJSON(h.get("bibjson"))))
        return tuples

    def snapshot(self):
        from portality.models import ArticleHistory

        snap = deepcopy(self.data)
        if "id" in snap:
            snap["about"] = snap["id"]
            del snap["id"]
        if "index" in snap:
            del snap["index"]
        if "last_updated" in snap:
            del snap["last_updated"]
        if "created_date" in snap:
            del snap["created_date"]

        hist = ArticleHistory(**snap)
        hist.save()
        return hist.id

    def add_history(self, bibjson, date=None):
        """Deprecated"""
        bibjson = bibjson.bibjson if isinstance(bibjson, ArticleBibJSON) else bibjson
        if date is None:
            date = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
        snobj = {"date": date, "bibjson": bibjson}
        if "history" not in self.data:
            self.data["history"] = []
        self.data["history"].append(snobj)

    def is_in_doaj(self):
        return self.data.get("admin", {}).get("in_doaj", False)

    def set_in_doaj(self, value):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["in_doaj"] = value

    def has_seal(self):
        return self.data.get("admin", {}).get("seal", False)

    def set_seal(self, value):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["seal"] = value

    def publisher_record_id(self):
        return self.data.get("admin", {}).get("publisher_record_id")

    def set_publisher_record_id(self, pri):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["publisher_record_id"] = pri

    def upload_id(self):
        return self.data.get("admin", {}).get("upload_id")

    def set_upload_id(self, uid):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["upload_id"] = uid

    def get_normalised_doi(self):
        if self.data.get("index", {}).get("doi") is not None:
            return self.data["index"]["doi"]
        doi = self.bibjson().get_one_identifier(constants.IDENT_TYPE_DOI)
        if doi is None:
            return None
        try:
            return normalise.normalise_doi(doi)
        except ValueError:
            # can't be normalised, so we just return the doi as-is
            return doi

    def get_normalised_fulltext(self):
        if self.data.get("index", {}).get("fulltext") is not None:
            return self.data["index"]["fulltext"]
        fulltexts = self.bibjson().get_urls(constants.LINK_TYPE_FULLTEXT)
        if len(fulltexts) == 0:
            return None
        try:
            return normalise.normalise_url(fulltexts[0])
        except ValueError:
            # can't be normalised, so we just return the url as-is
            return fulltexts[0]

    def get_journal(self):
        """
        Get this article's associated journal
        :return: A Journal, or None if this is an orphan article
        """
        bibjson = self.bibjson()

        # first, get the ISSNs associated with the record
        pissns = bibjson.get_identifiers(bibjson.P_ISSN)
        eissns = bibjson.get_identifiers(bibjson.E_ISSN)
        allissns = list(set(pissns + eissns))

        # find a matching journal record from the index
        journal = None
        for issn in allissns:
            journals = Journal.find_by_issn(issn)
            if len(journals) > 0:
                # there should only ever be one, so take the first one
                journal = journals[0]
                break

        return journal

    def get_associated_journals(self):
        # find all matching journal records from the index
        allissns = self.bibjson().issns()
        return Journal.find_by_issn(allissns)

    def add_journal_metadata(self, j=None, reg=None):
        """
        This function makes sure the article is populated
        with all the relevant info from its owning parent object
        :param j: Pass in a Journal to bypass the (slow) locating step. MAKE SURE IT'S THE RIGHT ONE!
        """

        # Record the data that is copied into the article into the "reg"ister, in case the
        # caller needs to know exactly and only which information was copied
        if reg is None:
            reg = Journal()
        rbj = reg.bibjson()

        if j is None:
            journal = self.get_journal()
        else:
            journal = j

        # we were unable to find a journal
        if journal is None:
            raise NoJournalException("Unable to find a journal associated with this article")

        # if we get to here, we have a journal record we want to pull data from
        jbib = journal.bibjson()
        bibjson = self.bibjson()

        # tripwire to be tripped if the journal makes changes to the article
        trip = False

        if bibjson.subjects() != jbib.subjects():
            trip = True
            bibjson.set_subjects(jbib.subjects())
        rbj.set_subjects(jbib.subjects())

        if jbib.title is not None:
            if bibjson.journal_title != jbib.title:
                trip = True
                bibjson.journal_title = jbib.title
            rbj.title = jbib.title

        if len(jbib.language) > 0:
            jlang = jbib.language
            alang = bibjson.journal_language
            jlang.sort()
            alang.sort()
            if jlang != alang:
                bibjson.journal_language = jbib.language
                trip = True
            rbj.set_language(jbib.language)

        if jbib.country is not None:
            if jbib.country != bibjson.journal_country:
                bibjson.journal_country = jbib.country
                trip = True
            rbj.country = jbib.country

        if jbib.publisher:
            if jbib.publisher != bibjson.publisher:
                bibjson.publisher = jbib.publisher
                trip = True
            rbj.publisher = jbib.publisher

        # Copy the seal info, in_doaj status and the journal's ISSNs
        if journal.is_in_doaj() != self.is_in_doaj():
            self.set_in_doaj(journal.is_in_doaj())
            trip = True
        reg.set_in_doaj(journal.is_in_doaj())

        if journal.has_seal() != self.has_seal():
            self.set_seal(journal.has_seal())
            trip = True
        reg.set_seal(journal.has_seal())

        try:
            aissns = bibjson.journal_issns
            jissns = jbib.issns()
            aissns.sort()
            jissns.sort()
            if aissns != jissns:
                bibjson.journal_issns = jbib.issns()
                trip = True

            eissns = jbib.get_identifiers(jbib.E_ISSN)
            pissns = jbib.get_identifiers(jbib.P_ISSN)
            if eissns is not None and len(eissns) > 0:
                rbj.add_identifier(rbj.E_ISSN, eissns[0])
            if pissns is not None and len(pissns) > 0:
                rbj.add_identifier(rbj.P_ISSN, pissns[0])
        except KeyError:
            # No issns, don't worry about it for now
            pass

        return trip
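
    # Illustrative sketch (not part of the original module): ingest code would
    # typically refresh an article from its journal and persist only on change:
    #
    #   try:
    #       if article.add_journal_metadata():
    #           article.save()
    #   except NoJournalException:
    #       pass  # orphan article: no journal shares its ISSNs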

    def merge(self, old, take_id=True):
        # this takes an old version of the article and brings
        # forward any useful information that is needed. The rules of merge are:
        # - ignore "index" (it gets regenerated on save)
        # - always take the "created_date"
        # - take any top level field that does not exist in the current item (esp. "id" and "history")
        # - in "admin", copy any field that does not already exist

        # first thing to do is create a snapshot of the old record
        old.snapshot()

        # now go on and do the merge

        # always take the created date
        self.set_created(old.created_date)

        # take the id
        if self.id is None or take_id:
            self.set_id(old.id)

        # take the history (deprecated)
        if len(self.data.get("history", [])) == 0:
            self.data["history"] = deepcopy(old.data.get("history", []))

        # take the bibjson
        if "bibjson" not in self.data:
            self.set_bibjson(deepcopy(old.bibjson()))

        # take the admin if there isn't one
        if "admin" not in self.data:
            self.data["admin"] = deepcopy(old.data.get("admin", {}))
        else:
            # otherwise, copy any admin keys that don't exist on the current item
            oa = old.data.get("admin", {})
            for key in oa:
                if key not in self.data["admin"]:
                    self.data["admin"][key] = deepcopy(oa[key])
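
    # Illustrative sketch (not part of the original module): merging an incoming
    # record with its stored predecessor keeps the stable identifiers:
    #
    #   incoming.merge(existing)                       # snapshots `existing` first
    #   incoming.id == existing.id                     # True: take_id defaults to True
    #   incoming.created_date == existing.created_date # True: always carried over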

    def _generate_index(self):
        # the index fields we are going to generate
        issns = []
        subjects = []
        schema_subjects = []
        schema_codes = []
        schema_codes_tree = []
        classification = []
        langs = []
        country = None
        publisher = []
        classification_paths = []
        unpunctitle = None
        asciiunpunctitle = None
        doi = None
        fulltext = None

        # the places we're going to get those fields from
        cbib = self.bibjson()
        jindex = self.data.get('index', {})
        hist = self.history()

        # get the issns out of the current bibjson
        issns += cbib.get_identifiers(cbib.P_ISSN)
        issns += cbib.get_identifiers(cbib.E_ISSN)

        # get the issn from the journal bibjson
        if isinstance(cbib.journal_issns, list):
            issns += cbib.journal_issns

        # de-duplicate the issns
        issns = list(set(issns))

        # now get the issns out of the historic records
        for date, hbib in hist:
            issns += hbib.get_identifiers(hbib.P_ISSN)
            issns += hbib.get_identifiers(hbib.E_ISSN)

        # get the subjects and concatenate them with their schemes from the current bibjson
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            subjects.append(term)
            schema_subjects.append(scheme + ":" + term)
            classification.append(term)
            if "code" in subs:
                schema_codes.append(scheme + ":" + subs.get("code"))

        # copy the languages
        if len(cbib.journal_language) > 0:
            langs = [datasets.name_for_lang(l) for l in cbib.journal_language]

        # copy the country
        if jindex.get('country'):
            country = jindex.get('country')
        elif cbib.journal_country:
            country = datasets.get_country_name(cbib.journal_country)

        # copy the publisher/provider
        if cbib.publisher:
            publisher.append(cbib.publisher)

        # deduplicate the lists
        issns = list(set(issns))
        subjects = list(set(subjects))
        schema_subjects = list(set(schema_subjects))
        classification = list(set(classification))
        publisher = list(set(publisher))
        langs = list(set(langs))
        schema_codes = list(set(schema_codes))

        # work out what the date of publication is
        date = cbib.get_publication_date()

        # calculate the classification paths
        from portality.lcc import lcc  # inline import since this hits the database
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            if scheme == "LCC":
                path = lcc.pathify(term)
                if path is not None:
                    classification_paths.append(path)

        # normalise the classification paths, so we only store the longest ones
        classification_paths = lcc.longest(classification_paths)
        schema_codes_tree = cbib.lcc_codes_full_list()

        # create an unpunctitle
        if cbib.title is not None:
            throwlist = string.punctuation + '\n\t'
            unpunctitle = "".join(c for c in cbib.title if c not in throwlist).strip()
            try:
                asciiunpunctitle = unidecode(unpunctitle)
            except:
                asciiunpunctitle = unpunctitle

        # determine if the seal is applied
        has_seal = "Yes" if self.has_seal() else "No"

        # create a normalised version of the DOI for deduplication
        source_doi = cbib.get_one_identifier(constants.IDENT_TYPE_DOI)
        try:
            doi = normalise.normalise_doi(source_doi)
        except ValueError:
            # if we can't normalise the DOI, just store it as-is
            doi = source_doi

        # create a normalised version of the fulltext URL for deduplication
        fulltexts = cbib.get_urls(constants.LINK_TYPE_FULLTEXT)
        if len(fulltexts) > 0:
            source_fulltext = fulltexts[0]
            try:
                fulltext = normalise.normalise_url(source_fulltext)
            except ValueError:
                # if we can't normalise the fulltext, store it as-is
                fulltext = source_fulltext

        # build the index part of the object
        self.data["index"] = {}
        if len(issns) > 0:
            self.data["index"]["issn"] = issns
        if date != "":
            self.data["index"]["date"] = date
            self.data["index"]["date_toc_fv_month"] = date  # Duplicated so we can have year/month facets in fv2
        if len(subjects) > 0:
            self.data["index"]["subject"] = subjects
        if len(schema_subjects) > 0:
            self.data["index"]["schema_subject"] = schema_subjects
        if len(classification) > 0:
            self.data["index"]["classification"] = classification
        if len(publisher) > 0:
            self.data["index"]["publisher"] = publisher
        if len(langs) > 0:
            self.data["index"]["language"] = langs
        if country is not None:
            self.data["index"]["country"] = country
        if len(schema_codes) > 0:
            self.data["index"]["schema_code"] = schema_codes
        if len(classification_paths) > 0:
            self.data["index"]["classification_paths"] = classification_paths
        if unpunctitle is not None:
            self.data["index"]["unpunctitle"] = unpunctitle
        if asciiunpunctitle is not None:
            self.data["index"]["asciiunpunctitle"] = asciiunpunctitle
        if has_seal:
            self.data["index"]["has_seal"] = has_seal
        if doi is not None:
            self.data["index"]["doi"] = doi
        if fulltext is not None:
            self.data["index"]["fulltext"] = fulltext
        if len(schema_codes_tree) > 0:
            self.data["index"]["schema_codes_tree"] = schema_codes_tree

    def prep(self):
        self._generate_index()
        self.data['last_updated'] = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")

    def save(self, *args, **kwargs):
        self._generate_index()
        return super(Article, self).save(*args, **kwargs)

    def get_owner(self):
        b = self.bibjson()
        article_issns = b.get_identifiers(b.P_ISSN)
        article_issns += b.get_identifiers(b.E_ISSN)
        owners = []

        seen_journal_issns = {}
        for issn in article_issns:
            journals = Journal.find_by_issn(issn)
            if journals is not None and len(journals) > 0:
                for j in journals:
                    owners.append(j.owner)
                    if j.owner not in seen_journal_issns:
                        seen_journal_issns[j.owner] = []
                    seen_journal_issns[j.owner] += j.bibjson().issns()

        # deduplicate the list of owners
        owners = list(set(owners))

        # no owner means we can't confirm
        if len(owners) == 0:
            raise NoValidOwnerException

        # multiple owners means ownership of this article is confused
        if len(owners) > 1:
            raise NoValidOwnerException

        return owners[0]
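
    # Illustrative sketch (not part of the original module): upload validation
    # would typically resolve ownership like
    #
    #   try:
    #       owner = article.get_owner()
    #   except NoValidOwnerException:
    #       ...  # zero, or more than one, owning journal: can't confirm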


class ArticleBibJSON(GenericBibJSON):

    def __init__(self, bibjson=None, **kwargs):
        self._add_struct(shared_structs.SHARED_BIBJSON.get("structs", {}).get("bibjson"))
        self._add_struct(ARTICLE_BIBJSON_EXTENSION.get("structs", {}).get("bibjson"))
        super(ArticleBibJSON, self).__init__(bibjson, **kwargs)

    # article-specific simple getters and setters
    @property
    def year(self):
        return self._get_single("year")

    @year.setter
    def year(self, val):
        self._set_with_struct("year", val)

    @year.deleter
    def year(self):
        self._delete("year")

    @property
    def month(self):
        return self._get_single("month")

    @month.setter
    def month(self, val):
        self._set_with_struct("month", val)

    @month.deleter
    def month(self):
        self._delete("month")

    @property
    def start_page(self):
        return self._get_single("start_page")

    @start_page.setter
    def start_page(self, val):
        self._set_with_struct("start_page", val)

    @property
    def end_page(self):
        return self._get_single("end_page")

    @end_page.setter
    def end_page(self, val):
        self._set_with_struct("end_page", val)

    @property
    def abstract(self):
        return self._get_single("abstract")

    @abstract.setter
    def abstract(self, val):
        self._set_with_struct("abstract", val)

    # article-specific complex part getters and setters

    @property
    def volume(self):
        return self._get_single("journal.volume")

    @volume.setter
    def volume(self, value):
        self._set_with_struct("journal.volume", value)

    @property
    def number(self):
        return self._get_single("journal.number")

    @number.setter
    def number(self, value):
        self._set_with_struct("journal.number", value)

    @property
    def journal_title(self):
        return self._get_single("journal.title")

    @journal_title.setter
    def journal_title(self, title):
        self._set_with_struct("journal.title", title)

    @property
    def journal_language(self):
        return self._get_list("journal.language")

    @journal_language.setter
    def journal_language(self, lang):
        self._set_with_struct("journal.language", lang)

    @property
    def journal_country(self):
        return self._get_single("journal.country")

    @journal_country.setter
    def journal_country(self, country):
        self._set_single("journal.country", country)

    @property
    def journal_issns(self):
        return self._get_list("journal.issns")

    @journal_issns.setter
    def journal_issns(self, issns):
        self._set_with_struct("journal.issns", issns)

    @property
    def publisher(self):
        return self._get_single("journal.publisher")

    @publisher.setter
    def publisher(self, value):
        self._set_with_struct("journal.publisher", value)

    def add_author(self, name, affiliation=None, orcid_id=None):
        aobj = {"name": name}
        if affiliation is not None:
            aobj["affiliation"] = affiliation
        if orcid_id is not None:
            aobj["orcid_id"] = orcid_id
        self._add_to_list_with_struct("author", aobj)

    @property
    def author(self):
        return self._get_list("author")

    @author.setter
    def author(self, authors):
        self._set_with_struct("author", authors)

    def get_publication_date(self, date_format='%Y-%m-%dT%H:%M:%SZ'):
        # work out what the date of publication is
        date = ""
        if self.year is not None:
            if type(self.year) is str:  # It should be, if the mappings are correct. But len() needs a sequence.
                # fix 2 digit years
                if len(self.year) == 2:
                    try:
                        intyear = int(self.year)
                    except ValueError:
                        # if it's 2 chars long and the 2 chars don't make an integer, forget it
                        return date

                    # In the case of truncated years, assume it's this century if before the current year
                    if intyear <= int(str(datetime.utcnow().year)[:-2]):
                        self.year = "20" + self.year  # For readability over long-lasting code, I have refrained
                    else:                             # from using str(datetime.utcnow().year)[:2] here.
                        self.year = "19" + self.year  # But don't come crying to me 90-ish years from now.

                # if we still don't have a 4 digit year, forget it
                if len(self.year) != 4:
                    return date

            # build up our proposed datestamp
            date += str(self.year)
            if self.month is not None:
                try:
                    if type(self.month) is int:
                        if 1 <= int(self.month) <= 12:
                            month_number = self.month
                        else:
                            month_number = 1
                    elif len(self.month) <= 2:
                        if 1 <= int(self.month) <= 12:
                            month_number = self.month
                        else:
                            month_number = '1'
                    elif len(self.month) == 3:  # 'May' works with either case, obvz.
                        month_number = datetime.strptime(self.month, '%b').month
                    else:
                        month_number = datetime.strptime(self.month, '%B').month

                    # pad the month number to two digits. This accepts int or string
                    date += '-{:0>2}'.format(month_number)
                except:
                    # If something goes wrong, just assume it's January
                    date += "-01"
            else:
                date += "-01"
            date += "-01T00:00:00Z"

        # attempt to confirm the format of our datestamp
        try:
            datecheck = datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ")
            date = datecheck.strftime(date_format)
        except:
            return ""
        return date
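
    # Illustrative sketch (not part of the original module) of the date logic:
    #
    #   bj.year, bj.month = "98", "May"
    #   bj.get_publication_date()    # -> "1998-05-01T00:00:00Z"
    #                                # (two-digit year expanded, '%b' month parsed)
    #
    #   bj.year, bj.month = "2022", "13"
    #   bj.get_publication_date()    # -> "2022-01-01T00:00:00Z"
    #                                # (out-of-range month falls back to January)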

    def remove_journal_metadata(self):
        self._delete("journal")

    def vancouver_citation(self):
        jtitle = self.journal_title
        year = self.year
        vol = self.volume
        iss = self.number
        start = self.start_page
        end = self.end_page

        citation = ""

        if year:
            citation += year + ";"

        if vol:
            citation += vol

        if iss:
            citation += "(" + iss + ")"

        if start or end:
            if citation != "":
                citation += ":"
            if start:
                citation += start
            if end:
                if start:
                    citation += "-"
                citation += end

        return jtitle.strip(), citation
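
    # Illustrative sketch (not part of the original module):
    #
    #   bj.year, bj.volume, bj.number = "1998", "12", "3"
    #   bj.start_page, bj.end_page = "45", "50"
    #   bj.vancouver_citation()      # -> ("<journal title>", "1998;12(3):45-50")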

    def lcc_codes_full_list(self):
        full_list = set()

        from portality.lcc import lcc  # inline import since this hits the database
        for subs in self.subjects():
            scheme = subs.get("scheme")
            if scheme != "LCC":
                continue
            code = subs.get("code")
            expanded = lcc.expand_codes(code)
            full_list.update(expanded)

        return ["LCC:" + x for x in full_list if x is not None]


ARTICLE_BIBJSON_EXTENSION = {
    "objects" : ["bibjson"],
    "structs" : {
        "bibjson" : {
            "fields" : {
                "year" : {"coerce" : "unicode"},
                "month" : {"coerce" : "unicode"},
                "start_page" : {"coerce" : "unicode"},
                "end_page" : {"coerce" : "unicode"},
                "abstract" : {"coerce" : "unicode"}
            },
            "lists" : {
                "author" : {"contains" : "object"}
            },
            "objects" : [
                "journal"
            ],

            "structs" : {
                "author" : {
                    "fields" : {
                        "name" : {"coerce" : "unicode"},
                        "affiliation" : {"coerce" : "unicode"},
                        "email" : {"coerce": "unicode"},
                        "orcid_id" : {"coerce" : "unicode"}
                    }
                },

                "journal" : {
                    "fields" : {
                        "volume" : {"coerce" : "unicode"},
                        "number" : {"coerce" : "unicode"},
                        "publisher" : {"coerce" : "unicode"},
                        "title" : {"coerce" : "unicode"},
                        "country" : {"coerce" : "unicode"}
                    },
                    "lists" : {
                        "language" : {"contains" : "field", "coerce" : "unicode"},
                        "issns" : {"contains" : "field", "coerce" : "unicode"}
                    }
                }
            }
        }
    }
}

##################################################


class ArticleQuery(object):
    base_query = {
        "track_total_hits" : True,
        "query" : {
            "bool" : {
                "must" : []
            }
        }
    }

    _issn_terms = { "terms" : {"index.issn.exact" : ["<list of issns here>"]} }
    _volume_term = { "term" : {"bibjson.journal.volume.exact" : "<volume here>"} }

    def __init__(self, issns=None, volume=None):
        self.issns = issns
        self.volume = volume

    def query(self):
        q = deepcopy(self.base_query)

        if self.issns is not None:
            iq = deepcopy(self._issn_terms)
            iq["terms"]["index.issn.exact"] = self.issns
            q["query"]["bool"]["must"].append(iq)

        if self.volume is not None:
            vq = deepcopy(self._volume_term)
            vq["term"]["bibjson.journal.volume.exact"] = self.volume
            q["query"]["bool"]["must"].append(vq)

        return q


class ArticleVolumesQuery(object):
    base_query = {
        "track_total_hits": True,
        "query" : {
            "bool": {
                "filter": {
                    "terms" : {"index.issn.exact" : ["<list of issns here>"]}
                }
            }
        },
        "size" : 0,
        "aggs" : {
            "vols" : {
                "terms" : {
                    "field" : "bibjson.journal.volume.exact",
                    "order": {"_key" : "desc"},
                    "size" : 1000
                }
            }
        }
    }

    def __init__(self, issns=None):
        self.issns = issns

    def query(self):
        q = deepcopy(self.base_query)
        q["query"]["bool"]["filter"]["terms"]["index.issn.exact"] = self.issns
        return q
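
# Illustrative sketch (not part of the original module):
#
#   ArticleQuery(issns=["1234-5678"], volume="4").query()
#
# produces a bool/must query with a terms clause on index.issn.exact and a term
# clause on bibjson.journal.volume.exact, with track_total_hits enabled. The
# ISSN shown is hypothetical.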


class ArticleVolumesIssuesQuery(object):
    base_query = {
        "track_total_hits": True,
        "query" : {
            "bool": {
                "filter": {
                    "bool": {
                        "must": [
                            {"terms" : {"index.issn.exact" : ["<list of issns here>"]}},
                            {"term" : {"bibjson.journal.volume.exact" : "<volume here>"}}
                        ]
                    }
                }
            }
        },
        "size" : 0,
        "aggs" : {
            "issues" : {
                "terms" : {
                    "field" : "bibjson.journal.number.exact",
                    "order": {"_key" : "desc"},
                    "size" : 1000
                }
            }
        }
    }

    def __init__(self, issns=None, volume=None):
        self.issns = issns
        self.volume = volume

    def query(self):
        q = deepcopy(self.base_query)
        q["query"]["bool"]["filter"]["bool"]["must"][0]["terms"]["index.issn.exact"] = self.issns
        q["query"]["bool"]["filter"]["bool"]["must"][1]["term"]["bibjson.journal.volume.exact"] = self.volume
        return q


class DuplicateArticleQuery(object):
    base_query = {
        "track_total_hits" : True,
        "query": {
            "bool": {
                "must": []
            }
        },
        "sort": [{"last_updated": {"order": "desc"}}]
    }

    _should = {
        "should" : [],
        "minimum_should_match" : 2
    }

    _volume_term = {"term" : {"bibjson.journal.volume.exact" : "<volume>"}}
    _number_term = {"term" : {"bibjson.journal.number.exact" : "<issue number>"}}
    _start_term = {"term" : {"bibjson.start_page.exact" : "<start page>"}}
    _issn_terms = {"terms" : { "index.issn.exact" : ["<list of issns>"] }}
    _pubrec_term = {"term" : {"admin.publisher_record_id.exact" : "<publisher record id>"}}
    _identifier_term = {"term" : {"bibjson.identifier.id.exact" : "<issn here>"}}
    _doi_term = {"term" : {"index.doi.exact" : "<doi here>"}}
    _fulltext_terms = {"terms" : {"index.fulltext.exact" : ["<fulltext here>"]}}
    _url_terms = {"terms" : {"bibjson.link.url.exact" : ["<url here>"]}}
    _fuzzy_title = {"fuzzy" : {"bibjson.title.exact" : "<title here>"}}

    def __init__(self, issns=None, publisher_record_id=None, doi=None, urls=None, title=None, volume=None, number=None, start=None, should_match=None, size=10):
        self.issns = issns if isinstance(issns, list) else []
        self.publisher_record_id = publisher_record_id
        self.doi = doi
        self.urls = urls if isinstance(urls, list) else [urls] if isinstance(urls, str) else []
        self.title = title
        self.volume = volume
        self.number = number
        self.start = start
        self.should_match = should_match
        self.size = size

    def query(self):
        # - MUST be from at least one of the ISSNs
        # - MUST have the publisher record id
        # - MUST have the doi unless should_match is set
        # - MUST have one of the fulltext urls unless should_match is set
        # - MUST fuzzy match the title
        # - SHOULD have <should_match> of: volume, issue, start page, fulltext url, doi

        q = deepcopy(self.base_query)
        if len(self.issns) > 0:
            it = deepcopy(self._issn_terms)
            it["terms"]["index.issn.exact"] = self.issns
            q["query"]["bool"]["must"].append(it)

        if self.publisher_record_id is not None:
            pr = deepcopy(self._pubrec_term)
            pr["term"]["admin.publisher_record_id.exact"] = self.publisher_record_id
            q["query"]["bool"]["must"].append(pr)

        if self.doi is not None and self.should_match is None:
            idt = deepcopy(self._doi_term)
            # idt["term"]["bibjson.identifier.id.exact"] = self.doi
            idt["term"]["index.doi.exact"] = self.doi
            q["query"]["bool"]["must"].append(idt)

        if len(self.urls) > 0 and self.should_match is None:
            uq = deepcopy(self._fulltext_terms)
            # uq["terms"]["bibjson.link.url.exact"] = self.urls
            uq["terms"]["index.fulltext.exact"] = self.urls
            q["query"]["bool"]["must"].append(uq)

        if self.title is not None:
            ft = deepcopy(self._fuzzy_title)
            ft["fuzzy"]["bibjson.title.exact"] = self.title
            q["query"]["bool"]["must"].append(ft)

        if self.should_match is not None:
            term_count = 0
            s = deepcopy(self._should)

            if self.volume is not None:
                term_count += 1
                vt = deepcopy(self._volume_term)
                vt["term"]["bibjson.journal.volume.exact"] = self.volume
                s["should"].append(vt)

            if self.number is not None:
                term_count += 1
                nt = deepcopy(self._number_term)
                nt["term"]["bibjson.journal.number.exact"] = self.number
                s["should"].append(nt)

            if self.start is not None:
                term_count += 1
                st = deepcopy(self._start_term)
                st["term"]["bibjson.start_page.exact"] = self.start
                s["should"].append(st)

            if len(self.urls) > 0:
                term_count += 1
                uq = deepcopy(self._url_terms)
                uq["terms"]["bibjson.link.url.exact"] = self.urls
                s["should"].append(uq)

            if self.doi is not None:
                term_count += 1
                idt = deepcopy(self._identifier_term)
                idt["term"]["bibjson.identifier.id.exact"] = self.doi
                s["should"].append(idt)

            msm = self.should_match
            if msm > term_count:
                msm = term_count
            s["minimum_should_match"] = msm

            q["query"]["bool"].update(s)

        # Allow more results than the default
        q["size"] = self.size

        return q
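
# Illustrative sketch (not part of the original module): with should_match=2 and
# a volume, start page and DOI supplied, the generated query gains a should
# clause roughly like
#
#   {"should": [{"term": {"bibjson.journal.volume.exact": "12"}},
#               {"term": {"bibjson.start_page.exact": "45"}},
#               {"term": {"bibjson.identifier.id.exact": "10.1234/x"}}],
#    "minimum_should_match": 2}
#
# with minimum_should_match capped at the number of clauses actually present.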


def _human_sort(things, reverse=True):
    numeric = []
    non_numeric = []
    nmap = {}
    for v in things:
        try:
            # try to convert v to an int
            vint = int(v)

            # remember the original string (it may have leading 0s)
            try:
                nmap[vint].append(v)
            except KeyError:
                nmap[vint] = [v]
            numeric.append(vint)
        except:
            non_numeric.append(v)

    numeric.sort(reverse=reverse)
    non_numeric.sort(reverse=reverse)

    # convert the integers back to their string representation
    return reduce(lambda x, y: x + y, [nmap[n] for n in numeric], []) + non_numeric
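
# Illustrative sketch (not part of the original module):
#
#   _human_sort(["10", "9", "2", "II"])            # -> ["10", "9", "2", "II"]
#   _human_sort(["10", "9", "2"], reverse=False)   # -> ["2", "9", "10"]
#
# Numeric strings are compared as integers (so "10" sorts relative to "9" by
# value), while non-numeric values are sorted lexicographically and appended
# at the end.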


def _sort_articles(articles):
    # first extract the array we want to sort on
    # and make a map of that value to the issue itself
    unsorted = []
    numbers = []
    imap = {}
    for art in articles:
        sp = art.get("bibjson.start_page", [None])[0]

        # can't sort anything that doesn't have a start page
        if sp is None:
            unsorted.append(art)
            continue

        # deal with start page clashes and record the start pages to sort by
        if sp not in numbers:
            numbers.append(sp)
        if sp in imap:
            imap[sp].append(art)
        else:
            imap[sp] = [art]

    sorted_keys = _human_sort(numbers, reverse=False)

    s = []
    for n in sorted_keys:
        s += [x for x in imap[n]]
    s += [x for x in unsorted]

    return s
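
# Illustrative sketch (not part of the original module): given hits carrying a
# "bibjson.start_page" field list,
#
#   arts = [{"bibjson.start_page": ["10"]},
#           {"bibjson.start_page": ["2"]},
#           {"id": "unpaged"}]
#   _sort_articles(arts)    # -> start pages "2" then "10", then the unpaged hit
#
# Records without a start page keep their relative order at the end.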