Coverage for portality / models / article.py: 82%

744 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-04 09:41 +0100

1import string 

2 

3from unidecode import unidecode 

4from functools import reduce 

5from copy import deepcopy 

6from datetime import datetime 

7 

8from portality import datasets, constants 

9from portality.core import app 

10from portality.dao import DomainObject 

11from portality.lib import es_data_mapping 

12from portality.lib.coerce import COERCE_MAP 

13from portality.lib.dates import FMT_DATETIME_STD 

14from portality.lib.seamless import SeamlessMixin 

15from portality.models import Journal 

16from portality.models.v1.bibjson import GenericBibJSON # NOTE that article specifically uses the v1 BibJSON 

17from portality.models.v1 import shared_structs 

18from portality.models.v2.shared_structs import ARTICLE_STRUCT 

19from portality.lib import normalise, dates 

20 

21 

class NoJournalException(Exception):
    """Raised when an article cannot be matched to any journal record."""
    pass

24 

class NoValidOwnerException(Exception):
    """Raised when no single valid owner can be determined for an article."""
    pass

27 

28 

# Seamless struct extension layered on top of the shared bibjson struct:
# adds the article-only fields (year/month/pages/abstract), the author list,
# and the embedded journal metadata object copied in from the owning journal.
ARTICLE_BIBJSON_EXTENSION = {
    "objects" : ["bibjson"],
    "structs" : {
        "bibjson" : {
            "fields" : {
                "year" : {"coerce" : "unicode"},
                "month" : {"coerce" : "unicode"},
                "start_page" : {"coerce" : "unicode"},
                "end_page" : {"coerce" : "unicode"},
                "abstract" : {"coerce" : "unicode"}
            },
            "lists" : {
                "author" : {"contains" : "object"}
            },
            "objects" : [
                "journal"
            ],

            "structs" : {
                "author" : {
                    "fields" : {
                        "name" : {"coerce" : "unicode"},
                        "affiliation" : {"coerce" : "unicode"},
                        "email" : {"coerce": "unicode"},
                        "orcid_id" : {"coerce" : "unicode"}
                    }
                },

                # journal metadata denormalised onto the article (see Article.add_journal_metadata)
                "journal" : {
                    "fields" : {
                        "volume" : {"coerce" : "unicode"},
                        "number" : {"coerce" : "unicode"},
                        "publisher" : {"coerce" : "unicode"},
                        "title" : {"coerce" : "unicode"},
                        "country" : {"coerce" : "unicode"}
                    },
                    "lists" : {
                        "language" : {"contains" : "field", "coerce" : "unicode"},
                        "issns" : {"contains" : "field", "coerce" : "unicode"}
                    }
                }
            }

        }
    }
}

75 

# Options passed to es_data_mapping.create_mapping when building the ES
# mapping for the article type (see Article.mappings).
MAPPING_OPTS = {
    "dynamic": None,
    "coerces": app.config["DATAOBJ_TO_MAPPING_DEFAULTS"],
    "exceptions": app.config["ARTICLE_EXCEPTION_MAPPING"],
    "additional_mappings": {}
}

82 

83 

class Article(SeamlessMixin, DomainObject):
    """Domain object for an article record, backed by the "article" ES type."""
    __type__ = "article"

    # the full struct is assembled from the v2 article struct, the shared
    # bibjson struct, and the article-specific extension defined above
    __SEAMLESS_STRUCT__ = [
        ARTICLE_STRUCT,
        shared_structs.SHARED_BIBJSON,
        ARTICLE_BIBJSON_EXTENSION
    ]

    __SEAMLESS_COERCE__ = COERCE_MAP

    def mappings(self):
        # derive the ES mapping from the seamless struct definition
        return es_data_mapping.create_mapping(self.__seamless_struct__.raw, MAPPING_OPTS)

97 

98 @classmethod 

99 def duplicates(cls, publisher_record_id=None, doi=None, fulltexts=None, title=None, volume=None, number=None, start=None, should_match=None, size=10): 

100 # some input sanitisation 

101 urls = fulltexts if isinstance(fulltexts, list) else [fulltexts] if isinstance(fulltexts, str) or isinstance(fulltexts, str) else [] 

102 

103 # make sure that we're dealing with the normal form of the identifiers 

104 norm_urls = [] 

105 for url in urls: 

106 try: 

107 norm = normalise.normalise_url(url) 

108 norm_urls.append(norm) 

109 except ValueError: 

110 # use the non-normal form 

111 norm_urls.append(url) 

112 urls = norm_urls 

113 

114 try: 

115 doi = normalise.normalise_doi(doi) 

116 except ValueError: 

117 # leave the doi as it is 

118 pass 

119 

120 q = DuplicateArticleQuery(publisher_record_id=publisher_record_id, 

121 doi=doi, 

122 urls=urls, 

123 title=title, 

124 volume=volume, 

125 number=number, 

126 start=start, 

127 should_match=should_match, 

128 size=size) 

129 

130 # res = cls.query(q=q.query()) 

131 # return [cls(**hit.get("_source")) for hit in res.get("hits", {}).get("hits", [])] 

132 return cls.q2obj(q=q.query()) 

133 

134 @classmethod 

135 def list_volumes(cls, issns): 

136 q = ArticleVolumesQuery(issns) 

137 result = cls.query(q=q.query()) 

138 return _human_sort([t.get("key") for t in result.get("aggregations", {}).get("vols", {}).get("buckets", [])]) 

139 

140 @classmethod 

141 def list_volume_issues(cls, issns, volume): 

142 q = ArticleVolumesIssuesQuery(issns, volume) 

143 result = cls.query(q=q.query()) 

144 return _human_sort([t.get("key") for t in result.get("aggregations", {}).get("issues", {}).get("buckets", [])]) 

145 

146 @classmethod 

147 def get_by_volume(cls, issns, volume): 

148 q = ArticleQuery(issns=issns, volume=volume) 

149 articles = cls.iterate(q.query(), page_size=1000) 

150 return articles 

151 

152 @classmethod 

153 def find_by_issns(cls, issns): 

154 q = ArticleQuery(issns=issns) 

155 articles = cls.iterate(q.query(), page_size=1000) 

156 return articles 

157 

158 @classmethod 

159 def count_by_issns(cls, issns, in_doaj=None): 

160 q = ArticleQuery(issns=issns, in_doaj=in_doaj) 

161 return cls.hit_count(q.query()) 

162 

163 @classmethod 

164 def delete_by_issns(cls, issns, snapshot=True): 

165 q = ArticleQuery(issns=issns) 

166 cls.delete_selected(query=q.query(), snapshot=snapshot) 

167 

    @classmethod
    def delete_selected(cls, query=None, owner=None, snapshot=True, tombstone=True):
        """Delete all articles matched by ``query``.

        :param query: raw ES query selecting the articles to delete
        :param owner: if supplied, OVERRIDES query - deletes all articles of this owner's journals
        :param snapshot: record each article in the history index before deletion
        :param tombstone: leave a tombstone record for each deleted article
        """
        if owner is not None:
            # build the query from the owner's journal ISSNs, replacing any passed-in query
            from portality.models import Journal
            issns = Journal.issns_by_owner(owner)
            q = ArticleQuery(issns=issns)
            query = q.query()

        # snapshot/tombstone each article BEFORE the bulk delete, while the data still exists
        if snapshot or tombstone:
            articles = cls.iterate(query, page_size=1000)
            for article in articles:
                if snapshot:
                    article.snapshot()
                if tombstone:
                    article._tombstone()
        return cls.delete_by_query(query)

184 

    def delete(self):
        """Delete this article, leaving a tombstone record behind first."""
        self._tombstone()
        super(Article, self).delete()

188 

189 def bibjson(self, **kwargs): 

190 if "bibjson" not in self.data: 

191 self.data["bibjson"] = {} 

192 return ArticleBibJSON(self.data.get("bibjson"), **kwargs) 

193 

194 def set_bibjson(self, bibjson): 

195 bibjson = bibjson.bibjson if isinstance(bibjson, ArticleBibJSON) else bibjson 

196 self.data["bibjson"] = bibjson 

197 

198 def history(self): 

199 hs = self.data.get("history", []) 

200 tuples = [] 

201 for h in hs: 

202 tuples.append((h.get("date"), ArticleBibJSON(h.get("bibjson")))) 

203 return tuples 

204 

    def snapshot(self):
        """Copy the current state of this article into the article history index.

        :return: the id of the saved ArticleHistory record
        """
        from portality.models import ArticleHistory

        snap = deepcopy(self.data)
        # the history record gets its own id; keep a pointer to the article in "about"
        if "id" in snap:
            snap["about"] = snap["id"]
            del snap["id"]
        # strip fields that are regenerated or irrelevant in the history index
        if "index" in snap:
            del snap["index"]
        if "last_updated" in snap:
            del snap["last_updated"]
        if "created_date" in snap:
            del snap["created_date"]

        hist = ArticleHistory(**snap)
        hist.save()
        return hist.id

222 

223 def _tombstone(self): 

224 stone = ArticleTombstone() 

225 stone.set_id(self.id) 

226 sbj = stone.bibjson() 

227 

228 subs = self.bibjson().subjects() 

229 for s in subs: 

230 sbj.add_subject(s.get("scheme"), s.get("term"), s.get("code")) 

231 

232 stone.save() 

233 return stone 

234 

235 def add_history(self, bibjson, date=None): 

236 """Deprecated""" 

237 bibjson = bibjson.bibjson if isinstance(bibjson, ArticleBibJSON) else bibjson 

238 if date is None: 

239 date = dates.now_str() 

240 snobj = {"date": date, "bibjson": bibjson} 

241 if "history" not in self.data: 

242 self.data["history"] = [] 

243 self.data["history"].append(snobj) 

244 

245 def is_in_doaj(self): 

246 try: 

247 return self.data['admin'].get("in_doaj", False) 

248 except KeyError: 

249 # If we have no admin section, return None instead 

250 return None 

251 

252 def set_in_doaj(self, value): 

253 if "admin" not in self.data: 

254 self.data["admin"] = {} 

255 self.data["admin"]["in_doaj"] = value 

256 

257 def publisher_record_id(self): 

258 return self.data.get("admin", {}).get("publisher_record_id") 

259 

260 def set_publisher_record_id(self, pri): 

261 if "admin" not in self.data: 

262 self.data["admin"] = {} 

263 self.data["admin"]["publisher_record_id"] = pri 

264 

265 def upload_id(self): 

266 return self.data.get("admin", {}).get("upload_id") 

267 

268 def set_upload_id(self, uid): 

269 if "admin" not in self.data: 

270 self.data["admin"] = {} 

271 self.data["admin"]["upload_id"] = uid 

272 

273 def get_normalised_doi(self): 

274 if self.data.get("index", {}).get("doi") is not None: 

275 return self.data["index"]["doi"] 

276 doi = self.bibjson().get_one_identifier(constants.IDENT_TYPE_DOI) 

277 if doi is None: 

278 return None 

279 try: 

280 return normalise.normalise_doi(doi) 

281 except ValueError: 

282 # can't be normalised, so we just return the doi as-is 

283 return doi 

284 

285 def get_normalised_fulltext(self): 

286 if self.data.get("index", {}).get("fulltext") is not None: 

287 return self.data["index"]["fulltext"] 

288 fulltexts = self.bibjson().get_urls(constants.LINK_TYPE_FULLTEXT) 

289 if len(fulltexts) == 0: 

290 return None 

291 try: 

292 return normalise.normalise_url(fulltexts[0]) 

293 except ValueError: 

294 # can't be normalised, so we just return the url as-is 

295 return fulltexts[0] 

296 

    def get_journal(self):
        """
        Get this article's associated journal
        :return: A Journal, or None if this is an orphan article
        """
        bibjson = self.bibjson()

        # first, get the ISSNs associated with the record
        pissns = bibjson.get_identifiers(bibjson.P_ISSN)
        eissns = bibjson.get_identifiers(bibjson.E_ISSN)
        allissns = list(set(pissns + eissns))

        # find a matching journal record from the index
        best_match = None

        # NOTE(review): best_match is recomputed on every ISSN that yields
        # journals, so the returned journal is that of the LAST matching ISSN
        # in the (set-ordered, hence arbitrary) allissns list - confirm this
        # is intended rather than breaking on the first match.
        for issn in allissns:
            journals = Journal.find_by_issn(issn)
            if len(journals) > 0:
                # Get the best journal match:
                # 1. Prefer the most recently updated journal that is in DOAJ.
                # 2. If none are in DOAJ, fall back to the most recently updated journal outside DOAJ.

                matches = [j for j in journals if j.is_in_doaj()]

                if len(matches) == 0:
                    matches = journals

                best_match = max(
                    matches,
                    key=lambda j: j.last_updated,
                    default=None
                )

        return best_match

331 

332 def get_associated_journals(self): 

333 # find all matching journal record from the index 

334 allissns = self.bibjson().issns() 

335 return Journal.find_by_issn(allissns) 

336 

    def add_journal_metadata(self, j=None, reg=None):
        """
        this function makes sure the article is populated
        with all the relevant info from its owning parent object
        :param j: Pass in a Journal to bypass the (slow) locating step. MAKE SURE IT'S THE RIGHT ONE!
        :param reg: a Journal used as a register of exactly what was copied (populated in place)
        :return: True if any article field was changed by the journal data ("tripwire")
        """

        # Record the data that is copied into the article into the "reg"ister, in case the
        # caller needs to know exactly and only which information was copied
        if reg is None:
            reg = Journal()
        rbj = reg.bibjson()

        if j is None:
            journal = self.get_journal()
        else:
            journal = j

        # we were unable to find a journal
        if journal is None:
            raise NoJournalException("Unable to find a journal associated with this article")

        # if we get to here, we have a journal record we want to pull data from
        jbib = journal.bibjson()
        bibjson = self.bibjson()

        # tripwire to be tripped if the journal makes changes to the article
        trip = False

        # subjects: always recorded in the register, copied to the article only on difference
        if bibjson.subjects() != jbib.subjects():
            trip = True
            bibjson.set_subjects(jbib.subjects())
        rbj.set_subjects(jbib.subjects())

        if jbib.title is not None:
            if bibjson.journal_title != jbib.title:
                trip = True
                bibjson.journal_title = jbib.title
            rbj.title = jbib.title

        if len(jbib.language) > 0:
            # compare sorted copies so ordering differences don't trip the wire
            jlang = jbib.language
            alang = bibjson.journal_language
            jlang.sort()
            alang.sort()
            if jlang != alang:
                bibjson.journal_language = jbib.language
                trip = True
            rbj.set_language(jbib.language)

        if jbib.country is not None:
            if jbib.country != bibjson.journal_country:
                bibjson.journal_country = jbib.country
                trip = True
            rbj.country = jbib.country

        if jbib.publisher:
            if jbib.publisher != bibjson.publisher:
                bibjson.publisher = jbib.publisher
                trip = True
            rbj.publisher = jbib.publisher

        # Copy the in_doaj status and the journal's ISSNs
        if journal.is_in_doaj() != self.is_in_doaj():
            self.set_in_doaj(journal.is_in_doaj())
            trip = True
        reg.set_in_doaj(journal.is_in_doaj())

        try:
            aissns = bibjson.journal_issns
            jissns = jbib.issns()
            aissns.sort()
            jissns.sort()
            if aissns != jissns:
                bibjson.journal_issns = jbib.issns()
                trip = True

            # register only the first of each ISSN type
            eissns = jbib.get_identifiers(jbib.E_ISSN)
            pissns = jbib.get_identifiers(jbib.P_ISSN)
            if eissns is not None and len(eissns) > 0:
                rbj.add_identifier(rbj.E_ISSN, eissns[0])
            if pissns is not None and len(pissns) > 0:
                rbj.add_identifier(rbj.P_ISSN, pissns[0])
        except KeyError:
            # No issns, don't worry about it for now
            pass

        return trip

425 

    def merge(self, old, take_id=True):
        """Bring forward useful information from an old version of this article.

        :param old: the previous Article record (snapshotted before merging)
        :param take_id: adopt the old record's id (default True)
        """
        # this takes an old version of the article and brings
        # forward any useful information that is needed. The rules of merge are:
        # - ignore "index" (it gets regenerated on save)
        # - always take the "created_date"
        # - any top level field that does not exist in the current item (esp "id" and "history")
        # - in "admin", copy any field that does not already exist

        # first thing to do is create a snapshot of the old record
        old.snapshot()

        # now go on and do the merge

        # always take the created date
        self.set_created(old.created_date)

        # take the id
        if self.id is None or take_id:
            self.set_id(old.id)

        # take the history (deprecated)
        if len(self.data.get("history", [])) == 0:
            self.data["history"] = deepcopy(old.data.get("history", []))

        # take the bibjson
        if "bibjson" not in self.data:
            self.set_bibjson(deepcopy(old.bibjson()))

        # take the admin if there isn't one
        if "admin" not in self.data:
            self.data["admin"] = deepcopy(old.data.get("admin", {}))
        else:
            # otherwise, copy any admin keys that don't exist on the current item
            oa = old.data.get("admin", {})
            for key in oa:
                if key not in self.data["admin"]:
                    self.data["admin"][key] = deepcopy(oa[key])

463 

464 def _generate_index(self): 

465 # the index fields we are going to generate 

466 issns = [] 

467 subjects = [] 

468 schema_subjects = [] 

469 schema_codes = [] 

470 schema_codes_tree = [] 

471 classification = [] 

472 langs = [] 

473 country = None 

474 publisher = [] 

475 classification_paths = [] 

476 unpunctitle = None 

477 asciiunpunctitle = None 

478 doi = None 

479 fulltext = None 

480 

481 # the places we're going to get those fields from 

482 cbib = self.bibjson() 

483 jindex = self.data.get('index', {}) 

484 hist = self.history() 

485 

486 # get the issns out of the current bibjson 

487 issns += cbib.get_identifiers(cbib.P_ISSN) 

488 issns += cbib.get_identifiers(cbib.E_ISSN) 

489 

490 # get the issn from the journal bibjson 

491 if isinstance(cbib.journal_issns, list): 

492 issns += cbib.journal_issns 

493 

494 # de-duplicate the issns 

495 issns = list(set(issns)) 

496 

497 # now get the issns out of the historic records 

498 for date, hbib in hist: 

499 issns += hbib.get_identifiers(hbib.P_ISSN) 

500 issns += hbib.get_identifiers(hbib.E_ISSN) 

501 

502 # get the subjects and concatenate them with their schemes from the current bibjson 

503 for subs in cbib.subjects(): 

504 scheme = subs.get("scheme") 

505 term = subs.get("term") 

506 subjects.append(term) 

507 schema_subjects.append(scheme + ":" + term) 

508 classification.append(term) 

509 if "code" in subs: 

510 schema_codes.append(scheme + ":" + subs.get("code")) 

511 

512 # copy the languages 

513 if len(cbib.journal_language) > 0: 

514 langs = [datasets.name_for_lang(l) for l in cbib.journal_language] 

515 

516 # Get the country name from the bibjson country code 

517 if cbib.journal_country: 

518 country = datasets.get_country_name(cbib.journal_country) 

519 

520 # copy the publisher/provider 

521 if cbib.publisher: 

522 publisher.append(cbib.publisher) 

523 

524 # deduplicate the lists 

525 issns = list(set(issns)) 

526 subjects = list(set(subjects)) 

527 schema_subjects = list(set(schema_subjects)) 

528 classification = list(set(classification)) 

529 publisher = list(set(publisher)) 

530 langs = list(set(langs)) 

531 schema_codes = list(set(schema_codes)) 

532 

533 # work out what the date of publication is 

534 date = cbib.get_publication_date() 

535 

536 # calculate the classification paths 

537 from portality.lcc import lcc # inline import since this hits the database 

538 for subs in cbib.subjects(): 

539 scheme = subs.get("scheme") 

540 term = subs.get("term") 

541 if scheme == "LCC": 

542 path = lcc.pathify(term) 

543 if path is not None: 

544 classification_paths.append(path) 

545 

546 # normalise the classification paths, so we only store the longest ones 

547 classification_paths = lcc.longest(classification_paths) 

548 schema_codes_tree = cbib.lcc_codes_full_list() 

549 

550 # create an unpunctitle 

551 if cbib.title is not None: 

552 throwlist = string.punctuation + '\n\t' 

553 unpunctitle = "".join(c for c in cbib.title if c not in throwlist).strip() 

554 try: 

555 asciiunpunctitle = unidecode(unpunctitle) 

556 except: 

557 asciiunpunctitle = unpunctitle 

558 

559 # create a normalised version of the DOI for deduplication 

560 source_doi = cbib.get_one_identifier(constants.IDENT_TYPE_DOI) 

561 try: 

562 doi = normalise.normalise_doi(source_doi) 

563 except ValueError as e: 

564 # if we can't normalise the DOI, just store it cast to lower case. 

565 doi = source_doi.lower() 

566 

567 # create a normalised version of the fulltext URL for deduplication 

568 fulltexts = cbib.get_urls(constants.LINK_TYPE_FULLTEXT) 

569 if len(fulltexts) > 0: 

570 source_fulltext = fulltexts[0] 

571 try: 

572 fulltext = normalise.normalise_url(source_fulltext) 

573 except ValueError as e: 

574 # if we can't normalise the fulltext store it as-is 

575 fulltext = source_fulltext 

576 

577 # build the index part of the object 

578 self.data["index"] = {} 

579 if len(issns) > 0: 

580 self.data["index"]["issn"] = issns 

581 if date != "": 

582 self.data["index"]["date"] = date 

583 self.data["index"]["date_toc_fv_month"] = date # Duplicated so we can have year/month facets in fv2 

584 if len(subjects) > 0: 

585 self.data["index"]["subject"] = subjects 

586 if len(schema_subjects) > 0: 

587 self.data["index"]["schema_subject"] = schema_subjects 

588 if len(classification) > 0: 

589 self.data["index"]["classification"] = classification 

590 if len(publisher) > 0: 

591 self.data["index"]["publisher"] = publisher 

592 if len(langs) > 0: 

593 self.data["index"]["language"] = langs 

594 if country is not None: 

595 self.data["index"]["country"] = country 

596 if len(schema_codes) > 0: 

597 self.data["index"]["schema_code"] = schema_codes 

598 if len(classification_paths) > 0: 

599 self.data["index"]["classification_paths"] = classification_paths 

600 if unpunctitle is not None: 

601 self.data["index"]["unpunctitle"] = unpunctitle 

602 if asciiunpunctitle is not None: 

603 self.data["index"]["asciiunpunctitle"] = unpunctitle 

604 if doi is not None: 

605 self.data["index"]["doi"] = doi 

606 if fulltext is not None: 

607 self.data["index"]["fulltext"] = fulltext 

608 if len(schema_codes_tree) > 0: 

609 self.data["index"]["schema_codes_tree"] = schema_codes_tree 

610 

    def prep(self):
        """Pre-save hook: regenerate the index section and stamp last_updated."""
        self._generate_index()
        self.data['last_updated'] = dates.now_str()

614 

    def save(self, *args, **kwargs):
        """Regenerate the denormalised index section, then delegate to DomainObject.save."""
        self._generate_index()
        return super(Article, self).save(*args, **kwargs)

618 

619 def get_owner(self): 

620 b = self.bibjson() 

621 article_issns = b.get_identifiers(b.P_ISSN) 

622 article_issns += b.get_identifiers(b.E_ISSN) 

623 owners = [] 

624 

625 seen_journal_issns = {} 

626 for issn in article_issns: 

627 journals = Journal.find_by_issn(issn) 

628 if journals is not None and len(journals) > 0: 

629 for j in journals: 

630 owners.append(j.owner) 

631 if j.owner not in seen_journal_issns: 

632 seen_journal_issns[j.owner] = [] 

633 seen_journal_issns[j.owner] += j.bibjson().issns() 

634 

635 # deduplicate the list of owners 

636 owners = list(set(owners)) 

637 

638 # no owner means we can't confirm 

639 if len(owners) == 0: 

640 raise NoValidOwnerException 

641 

642 # multiple owners means ownership of this article is confused 

643 if len(owners) > 1: 

644 return NoValidOwnerException 

645 

646 return owners[0] 

647 

648 

class ArticleTombstone(Article):
    """Marker record saved when an article is deleted (see Article.delete/_tombstone)."""
    __type__ = "article_tombstone"

    def snapshot(self):
        # tombstones are never recorded in the history index
        return None

    def is_in_doaj(self):
        # a deleted article is by definition not in DOAJ
        return False

    def prep(self):
        # no index to regenerate - just stamp the update time
        self.data['last_updated'] = dates.now_str()

    def save(self, *args, **kwargs):
        # bypass Article.save's index generation
        return super(ArticleTombstone, self).save(*args, **kwargs)

663 

664 

class ArticleBibJSON(GenericBibJSON):
    # Article-flavoured bibjson: layers the shared bibjson struct and the
    # article extension struct before delegating to GenericBibJSON.

    def __init__(self, bibjson=None, **kwargs):
        self._add_struct(shared_structs.SHARED_BIBJSON.get("structs", {}).get("bibjson"))
        self._add_struct(ARTICLE_BIBJSON_EXTENSION.get("structs", {}).get("bibjson"))
        super(ArticleBibJSON, self).__init__(bibjson, **kwargs)

671 

    # article-specific simple getters and setters
    # (all values are coerced to unicode strings by the struct definitions)

    @property
    def year(self):
        """Publication year as a string."""
        return self._get_single("year")

    @year.setter
    def year(self, val):
        self._set_with_struct("year", val)

    @year.deleter
    def year(self):
        self._delete("year")

    @property
    def month(self):
        """Publication month as a string (number or month name - see get_publication_date)."""
        return self._get_single("month")

    @month.setter
    def month(self, val):
        self._set_with_struct("month", val)

    @month.deleter
    def month(self):
        self._delete("month")

    @property
    def start_page(self):
        return self._get_single("start_page")

    @start_page.setter
    def start_page(self, val):
        self._set_with_struct("start_page", val)

    @property
    def end_page(self):
        return self._get_single("end_page")

    @end_page.setter
    def end_page(self, val):
        self._set_with_struct("end_page", val)

    @property
    def abstract(self):
        return self._get_single("abstract")

    @abstract.setter
    def abstract(self, val):
        self._set_with_struct("abstract", val)

720 

    # article-specific complex part getters and setters
    # (fields under "journal" are denormalised from the owning journal record)

    @property
    def volume(self):
        return self._get_single("journal.volume")

    @volume.setter
    def volume(self, value):
        self._set_with_struct("journal.volume", value)

    @property
    def number(self):
        """Issue number within the volume."""
        return self._get_single("journal.number")

    @number.setter
    def number(self, value):
        self._set_with_struct("journal.number", value)

    @property
    def journal_title(self):
        return self._get_single("journal.title")

    @journal_title.setter
    def journal_title(self, title):
        self._set_with_struct("journal.title", title)

    @property
    def journal_language(self):
        return self._get_list("journal.language")

    @journal_language.setter
    def journal_language(self, lang):
        self._set_with_struct("journal.language", lang)

    @property
    def journal_country(self):
        return self._get_single("journal.country")

    @journal_country.setter
    def journal_country(self, country):
        # NOTE(review): uses _set_single, unlike the other setters which use
        # _set_with_struct - confirm this bypass of struct coercion is intended
        self._set_single("journal.country", country)

    @property
    def journal_issns(self):
        return self._get_list("journal.issns")

    @journal_issns.setter
    def journal_issns(self, issns):
        self._set_with_struct("journal.issns", issns)

    @property
    def publisher(self):
        return self._get_single("journal.publisher")

    @publisher.setter
    def publisher(self, value):
        self._set_with_struct("journal.publisher", value)

    def add_author(self, name, affiliation=None, orcid_id=None):
        """Append an author object; affiliation and orcid_id are optional."""
        aobj = {"name": name}
        if affiliation is not None:
            aobj["affiliation"] = affiliation
        if orcid_id is not None:
            aobj["orcid_id"] = orcid_id
        self._add_to_list_with_struct("author", aobj)

    @property
    def author(self):
        return self._get_list("author")

    @author.setter
    def author(self, authors):
        self._set_with_struct("author", authors)

794 

    def get_publication_date(self, date_format=FMT_DATETIME_STD):
        """Derive a publication datestamp from the year/month fields.

        :param date_format: strftime format for the returned date string
        :return: formatted date string, or "" if no usable year is present
        """
        # work out what the date of publication is
        date = ""
        if self.year is not None:
            if type(self.year) is str:  # It should be, if the mappings are correct. but len() needs a sequence.
                # fix 2 digit years
                if len(self.year) == 2:
                    try:
                        intyear = int(self.year)
                    except ValueError:
                        # if it's 2 chars long and the 2 chars don't make an integer,
                        # forget it
                        return date

                    # In the case of truncated years, assume it's this century if before the current year
                    if intyear <= int(str(dates.now().year)[:-2]):
                        self.year = "20" + self.year  # For readability over long-lasting code, I have refrained
                    else:                             # from using str(dates.now().year)[:2] here.
                        self.year = "19" + self.year  # But don't come crying to me 90-ish years from now.

                # if we still don't have a 4 digit year, forget it
                if len(self.year) != 4:
                    return date

            # build up our proposed datestamp
            date += str(self.year)
            if self.month is not None:
                try:
                    if type(self.month) is int:
                        if 1 <= int(self.month) <= 12:
                            month_number = self.month
                        else:
                            month_number = 1
                    elif len(self.month) <= 2:
                        if 1 <= int(self.month) <= 12:
                            month_number = self.month
                        else:
                            month_number = '1'
                    elif len(self.month) == 3:  # 'May' works with either case, obvz.
                        month_number = datetime.strptime(self.month, '%b').month
                    else:
                        month_number = datetime.strptime(self.month, '%B').month

                    # pad the month number to two digits. This accepts int or string
                    date += '-{:0>2}'.format(month_number)
                except:
                    # If something goes wrong, just assume it's January
                    date += "-01"
            else:
                date += "-01"
            date += "-01T00:00:00Z"

        # attempt to confirm the format of our datestamp
        try:
            datecheck = dates.parse(date)
            date = datecheck.strftime(date_format)
        except:
            return ""
        return date

855 

    def remove_journal_metadata(self):
        """Strip the denormalised journal metadata object from this bibjson."""
        self._delete("journal")

858 

859 def vancouver_citation(self): 

860 jtitle = self.journal_title 

861 year = self.year 

862 vol = self.volume 

863 iss = self.number 

864 start = self.start_page 

865 end = self.end_page 

866 

867 citation = "" 

868 

869 if year: 

870 citation += year + ";" 

871 

872 if vol: 

873 citation += vol 

874 

875 if iss: 

876 citation += "(" + iss + ")" 

877 

878 if start or end: 

879 if citation != "": 

880 citation += ":" 

881 if start: 

882 citation += start 

883 if end: 

884 if start: 

885 citation += "-" 

886 citation += end 

887 

888 return jtitle.strip(), citation 

889 

    def lcc_codes_full_list(self):
        """Expand this record's LCC subject codes to include all ancestor codes.

        :return: list of "LCC:<code>" strings (deduplicated, Nones excluded)
        """
        full_list = set()

        from portality.lcc import lcc  # inline import since this hits the database
        for subs in self.subjects():
            scheme = subs.get("scheme")
            if scheme != "LCC":
                continue
            code = subs.get("code")
            expanded = lcc.expand_codes(code)
            full_list.update(expanded)

        return ["LCC:" + x for x in full_list if x is not None]

903 

904 

905################################################## 

906 

class ArticleQuery(object):
    """Builds an ES query over articles, filtered by ISSNs, volume and/or in_doaj status."""

    base_query = {
        "track_total_hits" : True,
        "query" : {
            "bool" : {
                "must" : []
            }
        }
    }

    # clause templates - placeholders are replaced in query()
    _issn_terms = { "terms" : {"index.issn.exact" : ["<list of issns here>"]} }
    _volume_term = { "term" : {"bibjson.journal.volume.exact" : "<volume here>"} }

    def __init__(self, issns=None, volume=None, in_doaj=None):
        self.issns = issns
        self.volume = volume
        self.in_doaj = in_doaj

    def query(self):
        """Assemble and return the query body from the constructor arguments."""
        built = deepcopy(self.base_query)
        must_clauses = built["query"]["bool"]["must"]

        if self.issns is not None:
            issn_clause = deepcopy(self._issn_terms)
            issn_clause["terms"]["index.issn.exact"] = self.issns
            must_clauses.append(issn_clause)

        if self.volume is not None:
            volume_clause = deepcopy(self._volume_term)
            volume_clause["term"]["bibjson.journal.volume.exact"] = self.volume
            must_clauses.append(volume_clause)

        if self.in_doaj is not None:
            must_clauses.append({"term": {"admin.in_doaj": self.in_doaj}})

        return built

942 

class ArticleVolumesQuery(object):
    """Terms aggregation listing the distinct journal volumes for a set of ISSNs."""

    base_query = {
        "track_total_hits": True,
        "query" : {
            "bool": {
                "filter": {
                    "terms" : {"index.issn.exact" : ["<list of issns here>"]}
                }
            }
        },
        "size" : 0,
        "aggs" : {
            "vols" : {
                "terms" : {
                    "field" : "bibjson.journal.volume.exact",
                    "order": {"_key" : "desc"},
                    "size" : 1000
                }
            }
        }
    }

    def __init__(self, issns=None):
        self.issns = issns

    def query(self):
        """Return the aggregation query with the ISSN filter filled in."""
        built = deepcopy(self.base_query)
        built["query"]["bool"]["filter"]["terms"]["index.issn.exact"] = self.issns
        return built

972 

973 

class ArticleVolumesIssuesQuery(object):
    """Terms aggregation listing the distinct issue numbers within one volume of a journal.

    FIX: the aggregation "order" was written as the SET literal {"_key", "desc"}
    instead of the dict {"_key": "desc"} (compare ArticleVolumesQuery), which is
    not valid for the ES terms-aggregation order parameter and is not even
    JSON-serialisable.
    """

    base_query = {
        "track_total_hits": True,
        "query" : {
            "bool": {
                "filter": {
                    "bool": {
                        "must": [
                            {"terms" : {"index.issn.exact" : ["<list of issns here>"]}},
                            {"term" : {"bibjson.journal.volume.exact" : "<volume here>"}}
                        ]
                    }
                }
            }
        },
        "size" : 0,
        "aggs" : {
            "issues" : {
                "terms" : {
                    "field" : "bibjson.journal.number.exact",
                    "order": {"_key" : "desc"},
                    "size" : 1000
                }
            }
        }
    }

    def __init__(self, issns=None, volume=None):
        self.issns = issns
        self.volume = volume

    def query(self):
        """Return the aggregation query with ISSN and volume filters filled in."""
        q = deepcopy(self.base_query)
        q["query"]["bool"]["filter"]["bool"]["must"][0]["terms"]["index.issn.exact"] = self.issns
        q["query"]["bool"]["filter"]["bool"]["must"][1]["term"]["bibjson.journal.volume.exact"] = self.volume
        return q

1010 

1011 

class DuplicateArticleQuery(object):
    # Query builder used by Article.duplicates(): finds articles sharing
    # identifiers (ISSN, DOI, fulltext URL, publisher record id) and/or
    # similar bibliographic details, newest first.

    base_query = {
        "track_total_hits" : True,
        "query": {
            "bool": {
                "must": []
            }
        },
        "sort": [{"last_updated": {"order": "desc"}}]
    }

    # optional should-clause wrapper: at least 2 of the should terms must match
    _should = {
        "should" : [],
        "minimum_should_match" : 2
    }

    # clause templates - placeholder values are replaced when the query is built
    _volume_term = {"term" : {"bibjson.journal.volume.exact" : "<volume>"}}
    _number_term = {"term" : {"bibjson.journal.number.exact" : "<issue number>"}}
    _start_term = {"term" : {"bibjson.start_page.exact" : "<start page>"}}
    _issn_terms = {"terms" : { "index.issn.exact" : ["<list of issns>"] }}
    _pubrec_term = {"term" : {"admin.publisher_record_id.exact" : "<publisher record id>"}}
    _identifier_term = {"term" : {"bibjson.identifier.id.exact" : "<issn here>"}}
    _doi_term = {"term" : {"index.doi.exact" : "<doi here>"}}
    _fulltext_terms = {"terms" : {"index.fulltext.exact" : ["<fulltext here>"]}}
    _fuzzy_title = {"fuzzy" : {"bibjson.title.exact" : "<title here>"}}

1037 

1038 def __init__(self, issns=None, publisher_record_id=None, doi=None, urls=None, title=None, volume=None, number=None, start=None, should_match=None, size=10): 

1039 self.issns = issns if isinstance(issns, list) else [] 

1040 self.publisher_record_id = publisher_record_id 

1041 self.doi = doi 

1042 self.urls = urls if isinstance(urls, list) else [urls] if isinstance(urls, str) or isinstance(urls, str) else [] 

1043 self.title = title 

1044 self.volume = volume 

1045 self.number = number 

1046 self.start = start 

1047 self.should_match = should_match 

1048 self.size = size 

1049 

1050 def query(self): 

1051 # - MUST be from at least one of the ISSNs 

1052 # - MUST have the publisher record id 

1053 # - MUST have the doi unless should_match is set 

1054 # - MUST have the one of the fulltext urls unless should_match is set 

1055 # - MUST fuzzy match the title 

1056 # - SHOULD have <should_match> of: volume, issue, start page, fulltext url, doi 

1057 

1058 q = deepcopy(self.base_query) 

1059 if len(self.issns) > 0: 

1060 it = deepcopy(self._issn_terms) 

1061 it["terms"]["index.issn.exact"] = self.issns 

1062 q["query"]["bool"]["must"].append(it) 

1063 

1064 if self.publisher_record_id is not None: 

1065 pr = deepcopy(self._pubrec_term) 

1066 pr["term"]["admin.publisher_record_id.exact"] = self.publisher_record_id 

1067 q["query"]["bool"]["must"].append(pr) 

1068 

1069 if self.doi is not None and self.should_match is None: 

1070 idt = deepcopy(self._doi_term) 

1071 # idt["term"]["bibjson.identifier.id.exact"] = self.doi 

1072 idt["term"]["index.doi.exact"] = self.doi 

1073 q["query"]["bool"]["must"].append(idt) 

1074 

1075 if len(self.urls) > 0 and self.should_match is None: 

1076 uq = deepcopy(self._fulltext_terms) 

1077 # uq["terms"]["bibjson.link.url.exact"] = self.urls 

1078 uq["terms"]["index.fulltext.exact"] = self.urls 

1079 q["query"]["bool"]["must"].append(uq) 

1080 

1081 if self.title is not None: 

1082 ft = deepcopy(self._fuzzy_title) 

1083 ft["fuzzy"]["bibjson.title.exact"] = self.title 

1084 q["query"]["bool"]["must"].append(ft) 

1085 

1086 if self.should_match is not None: 

1087 term_count = 0 

1088 s = deepcopy(self._should) 

1089 

1090 if self.volume is not None: 

1091 term_count += 1 

1092 vt = deepcopy(self._volume_term) 

1093 vt["term"]["bibjson.journal.volume.exact"] = self.volume 

1094 s["should"].append(vt) 

1095 

1096 if self.number is not None: 

1097 term_count += 1 

1098 nt = deepcopy(self._number_term) 

1099 nt["term"]["bibjson.journal.number.exact"] = self.number 

1100 s["should"].append(nt) 

1101 

1102 if self.start is not None: 

1103 term_count += 1 

1104 st = deepcopy(self._start_term) 

1105 st["term"]["bibjson.start_page.exact"] = self.start 

1106 s["should"].append(st) 

1107 

1108 if len(self.urls) > 0: 

1109 term_count += 1 

1110 uq = deepcopy(self._url_terms) 

1111 uq["terms"]["bibjson.link.url.exact"] = self.urls 

1112 s["should"].append(uq) 

1113 

1114 if self.doi is not None: 

1115 term_count += 1 

1116 idt = deepcopy(self._identifier_term) 

1117 idt["term"]["bibjson.identifier.id.exact"] = self.doi 

1118 s["should"].append(idt) 

1119 

1120 msm = self.should_match 

1121 if msm > term_count: 

1122 msm = term_count 

1123 s["minimum_should_match"] = msm 

1124 

1125 q["query"]["bool"].update(s) 

1126 

1127 # Allow more results than the default 

1128 q["size"] = self.size 

1129 

1130 return q 

1131 

1132 

1133def _human_sort(things, reverse=True): 

1134 numeric = [] 

1135 non_numeric = [] 

1136 nmap = {} 

1137 for v in things: 

1138 try: 

1139 # try to convert n to an int 

1140 vint = int(v) 

1141 

1142 # remember the original string (it may have leading 0s) 

1143 try: 

1144 nmap[vint].append(v) 

1145 except KeyError: 

1146 nmap[vint] = [v] 

1147 numeric.append(vint) 

1148 except: 

1149 non_numeric.append(v) 

1150 

1151 numeric.sort(reverse=reverse) 

1152 non_numeric.sort(reverse=reverse) 

1153 

1154 # convert the integers back to their string representation 

1155 return reduce(lambda x, y: x+y, [nmap[n] for n in numeric], []) + non_numeric 

1156 

1157 

def _sort_articles(articles):
    """Order articles by start page, ascending, via _human_sort.

    Articles sharing a start page stay grouped in encounter order; articles
    with no start page cannot be ordered and are appended at the end.
    """
    # Bucket articles by start page; keep those we cannot sort separately
    no_start_page = []
    by_start_page = {}
    for article in articles:
        start = article.get("bibjson.start_page", [None])[0]
        if start is None:
            no_start_page.append(article)
        else:
            # dict insertion order preserves first-seen order of the keys,
            # matching the de-duplicated "numbers" list in the old code
            by_start_page.setdefault(start, []).append(article)

    # Human-sort the distinct start pages ascending, then flatten the buckets
    ordered = []
    for page in _human_sort(list(by_start_page.keys()), reverse=False):
        ordered.extend(by_start_page[page])
    ordered.extend(no_start_page)

    return ordered