# Coverage for portality/models/article.py: 82% (744 statements)

import string

from unidecode import unidecode
from functools import reduce
from copy import deepcopy
from datetime import datetime

from portality import datasets, constants
from portality.core import app
from portality.dao import DomainObject
from portality.lib import es_data_mapping
from portality.lib.coerce import COERCE_MAP
from portality.lib.dates import FMT_DATETIME_STD
from portality.lib.seamless import SeamlessMixin
from portality.models import Journal
from portality.models.v1.bibjson import GenericBibJSON  # NOTE that article specifically uses the v1 BibJSON
from portality.models.v1 import shared_structs
from portality.models.v2.shared_structs import ARTICLE_STRUCT
from portality.lib import normalise, dates


class NoJournalException(Exception):
    pass


class NoValidOwnerException(Exception):
    pass


ARTICLE_BIBJSON_EXTENSION = {
    "objects" : ["bibjson"],
    "structs" : {
        "bibjson" : {
            "fields" : {
                "year" : {"coerce" : "unicode"},
                "month" : {"coerce" : "unicode"},
                "start_page" : {"coerce" : "unicode"},
                "end_page" : {"coerce" : "unicode"},
                "abstract" : {"coerce" : "unicode"}
            },
            "lists" : {
                "author" : {"contains" : "object"}
            },
            "objects" : [
                "journal"
            ],

            "structs" : {
                "author" : {
                    "fields" : {
                        "name" : {"coerce" : "unicode"},
                        "affiliation" : {"coerce" : "unicode"},
                        "email" : {"coerce" : "unicode"},
                        "orcid_id" : {"coerce" : "unicode"}
                    }
                },

                "journal" : {
                    "fields" : {
                        "volume" : {"coerce" : "unicode"},
                        "number" : {"coerce" : "unicode"},
                        "publisher" : {"coerce" : "unicode"},
                        "title" : {"coerce" : "unicode"},
                        "country" : {"coerce" : "unicode"}
                    },
                    "lists" : {
                        "language" : {"contains" : "field", "coerce" : "unicode"},
                        "issns" : {"contains" : "field", "coerce" : "unicode"}
                    }
                }
            }
        }
    }
}

MAPPING_OPTS = {
    "dynamic": None,
    "coerces": app.config["DATAOBJ_TO_MAPPING_DEFAULTS"],
    "exceptions": app.config["ARTICLE_EXCEPTION_MAPPING"],
    "additional_mappings": {}
}


class Article(SeamlessMixin, DomainObject):
    __type__ = "article"

    __SEAMLESS_STRUCT__ = [
        ARTICLE_STRUCT,
        shared_structs.SHARED_BIBJSON,
        ARTICLE_BIBJSON_EXTENSION
    ]

    __SEAMLESS_COERCE__ = COERCE_MAP

    def mappings(self):
        return es_data_mapping.create_mapping(self.__seamless_struct__.raw, MAPPING_OPTS)

    @classmethod
    def duplicates(cls, publisher_record_id=None, doi=None, fulltexts=None, title=None, volume=None, number=None, start=None, should_match=None, size=10):
        # some input sanitisation
        urls = fulltexts if isinstance(fulltexts, list) else [fulltexts] if isinstance(fulltexts, str) else []

        # make sure that we're dealing with the normal form of the identifiers
        norm_urls = []
        for url in urls:
            try:
                norm = normalise.normalise_url(url)
                norm_urls.append(norm)
            except ValueError:
                # use the non-normal form
                norm_urls.append(url)
        urls = norm_urls

        try:
            doi = normalise.normalise_doi(doi)
        except ValueError:
            # leave the doi as it is
            pass

        q = DuplicateArticleQuery(publisher_record_id=publisher_record_id,
                                  doi=doi,
                                  urls=urls,
                                  title=title,
                                  volume=volume,
                                  number=number,
                                  start=start,
                                  should_match=should_match,
                                  size=size)

        # res = cls.query(q=q.query())
        # return [cls(**hit.get("_source")) for hit in res.get("hits", {}).get("hits", [])]
        return cls.q2obj(q=q.query())

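    # A minimal usage sketch (hedged: the DOI and URL below are hypothetical,
    # not taken from this codebase). A single fulltext string is accepted and
    # normalised into a list before the query is built:
    #
    #     possible_dupes = Article.duplicates(
    #         doi="10.1234/example.doi",
    #         fulltexts="https://example.com/articles/1",
    #         size=5,
    #     )
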
    @classmethod
    def list_volumes(cls, issns):
        q = ArticleVolumesQuery(issns)
        result = cls.query(q=q.query())
        return _human_sort([t.get("key") for t in result.get("aggregations", {}).get("vols", {}).get("buckets", [])])

    @classmethod
    def list_volume_issues(cls, issns, volume):
        q = ArticleVolumesIssuesQuery(issns, volume)
        result = cls.query(q=q.query())
        return _human_sort([t.get("key") for t in result.get("aggregations", {}).get("issues", {}).get("buckets", [])])

    @classmethod
    def get_by_volume(cls, issns, volume):
        q = ArticleQuery(issns=issns, volume=volume)
        articles = cls.iterate(q.query(), page_size=1000)
        return articles

    @classmethod
    def find_by_issns(cls, issns):
        q = ArticleQuery(issns=issns)
        articles = cls.iterate(q.query(), page_size=1000)
        return articles

    @classmethod
    def count_by_issns(cls, issns, in_doaj=None):
        q = ArticleQuery(issns=issns, in_doaj=in_doaj)
        return cls.hit_count(q.query())

    @classmethod
    def delete_by_issns(cls, issns, snapshot=True):
        q = ArticleQuery(issns=issns)
        cls.delete_selected(query=q.query(), snapshot=snapshot)

    @classmethod
    def delete_selected(cls, query=None, owner=None, snapshot=True, tombstone=True):
        if owner is not None:
            from portality.models import Journal
            issns = Journal.issns_by_owner(owner)
            q = ArticleQuery(issns=issns)
            query = q.query()

        if snapshot or tombstone:
            articles = cls.iterate(query, page_size=1000)
            for article in articles:
                if snapshot:
                    article.snapshot()
                if tombstone:
                    article._tombstone()

        return cls.delete_by_query(query)

    def delete(self):
        self._tombstone()
        super(Article, self).delete()

    def bibjson(self, **kwargs):
        if "bibjson" not in self.data:
            self.data["bibjson"] = {}
        return ArticleBibJSON(self.data.get("bibjson"), **kwargs)

    def set_bibjson(self, bibjson):
        bibjson = bibjson.bibjson if isinstance(bibjson, ArticleBibJSON) else bibjson
        self.data["bibjson"] = bibjson

    def history(self):
        hs = self.data.get("history", [])
        tuples = []
        for h in hs:
            tuples.append((h.get("date"), ArticleBibJSON(h.get("bibjson"))))
        return tuples

    def snapshot(self):
        from portality.models import ArticleHistory

        snap = deepcopy(self.data)
        if "id" in snap:
            snap["about"] = snap["id"]
            del snap["id"]
        if "index" in snap:
            del snap["index"]
        if "last_updated" in snap:
            del snap["last_updated"]
        if "created_date" in snap:
            del snap["created_date"]

        hist = ArticleHistory(**snap)
        hist.save()
        return hist.id

    def _tombstone(self):
        stone = ArticleTombstone()
        stone.set_id(self.id)
        sbj = stone.bibjson()

        subs = self.bibjson().subjects()
        for s in subs:
            sbj.add_subject(s.get("scheme"), s.get("term"), s.get("code"))

        stone.save()
        return stone

    def add_history(self, bibjson, date=None):
        """Deprecated"""
        bibjson = bibjson.bibjson if isinstance(bibjson, ArticleBibJSON) else bibjson
        if date is None:
            date = dates.now_str()
        snobj = {"date": date, "bibjson": bibjson}
        if "history" not in self.data:
            self.data["history"] = []
        self.data["history"].append(snobj)

    def is_in_doaj(self):
        try:
            return self.data['admin'].get("in_doaj", False)
        except KeyError:
            # if we have no admin section, return None instead
            return None

    def set_in_doaj(self, value):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["in_doaj"] = value

    def publisher_record_id(self):
        return self.data.get("admin", {}).get("publisher_record_id")

    def set_publisher_record_id(self, pri):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["publisher_record_id"] = pri

    def upload_id(self):
        return self.data.get("admin", {}).get("upload_id")

    def set_upload_id(self, uid):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["upload_id"] = uid

    def get_normalised_doi(self):
        if self.data.get("index", {}).get("doi") is not None:
            return self.data["index"]["doi"]
        doi = self.bibjson().get_one_identifier(constants.IDENT_TYPE_DOI)
        if doi is None:
            return None
        try:
            return normalise.normalise_doi(doi)
        except ValueError:
            # can't be normalised, so we just return the doi as-is
            return doi

    def get_normalised_fulltext(self):
        if self.data.get("index", {}).get("fulltext") is not None:
            return self.data["index"]["fulltext"]
        fulltexts = self.bibjson().get_urls(constants.LINK_TYPE_FULLTEXT)
        if len(fulltexts) == 0:
            return None
        try:
            return normalise.normalise_url(fulltexts[0])
        except ValueError:
            # can't be normalised, so we just return the url as-is
            return fulltexts[0]

    def get_journal(self):
        """
        Get this article's associated journal
        :return: A Journal, or None if this is an orphan article
        """
        bibjson = self.bibjson()

        # first, get the ISSNs associated with the record
        pissns = bibjson.get_identifiers(bibjson.P_ISSN)
        eissns = bibjson.get_identifiers(bibjson.E_ISSN)
        allissns = list(set(pissns + eissns))

        # find a matching journal record from the index
        best_match = None
        for issn in allissns:
            journals = Journal.find_by_issn(issn)
            if len(journals) > 0:
                # Get the best journal match:
                # 1. Prefer the most recently updated journal that is in DOAJ.
                # 2. If none are in DOAJ, fall back to the most recently updated journal outside DOAJ.
                matches = [j for j in journals if j.is_in_doaj()]
                if len(matches) == 0:
                    matches = journals

                best_match = max(
                    matches,
                    key=lambda j: j.last_updated,
                    default=None
                )

        return best_match

    def get_associated_journals(self):
        # find all matching journal records from the index
        allissns = self.bibjson().issns()
        return Journal.find_by_issn(allissns)

    def add_journal_metadata(self, j=None, reg=None):
        """
        This function makes sure the article is populated
        with all the relevant info from its owning parent object
        :param j: Pass in a Journal to bypass the (slow) locating step. MAKE SURE IT'S THE RIGHT ONE!
        :param reg: A Journal used as a register of exactly which metadata was copied
        """
        # record the data that is copied into the article into the "reg"ister, in case the
        # caller needs to know exactly and only which information was copied
        if reg is None:
            reg = Journal()
        rbj = reg.bibjson()

        if j is None:
            journal = self.get_journal()
        else:
            journal = j

        # we were unable to find a journal
        if journal is None:
            raise NoJournalException("Unable to find a journal associated with this article")

        # if we get to here, we have a journal record we want to pull data from
        jbib = journal.bibjson()
        bibjson = self.bibjson()

        # tripwire to be tripped if the journal makes changes to the article
        trip = False

        if bibjson.subjects() != jbib.subjects():
            trip = True
            bibjson.set_subjects(jbib.subjects())
            rbj.set_subjects(jbib.subjects())

        if jbib.title is not None:
            if bibjson.journal_title != jbib.title:
                trip = True
                bibjson.journal_title = jbib.title
                rbj.title = jbib.title

        if len(jbib.language) > 0:
            jlang = jbib.language
            alang = bibjson.journal_language
            jlang.sort()
            alang.sort()
            if jlang != alang:
                bibjson.journal_language = jbib.language
                trip = True
                rbj.set_language(jbib.language)

        if jbib.country is not None:
            if jbib.country != bibjson.journal_country:
                bibjson.journal_country = jbib.country
                trip = True
                rbj.country = jbib.country

        if jbib.publisher:
            if jbib.publisher != bibjson.publisher:
                bibjson.publisher = jbib.publisher
                trip = True
                rbj.publisher = jbib.publisher

        # copy the in_doaj status and the journal's ISSNs
        if journal.is_in_doaj() != self.is_in_doaj():
            self.set_in_doaj(journal.is_in_doaj())
            trip = True
            reg.set_in_doaj(journal.is_in_doaj())

        try:
            aissns = bibjson.journal_issns
            jissns = jbib.issns()
            aissns.sort()
            jissns.sort()
            if aissns != jissns:
                bibjson.journal_issns = jbib.issns()
                trip = True

            eissns = jbib.get_identifiers(jbib.E_ISSN)
            pissns = jbib.get_identifiers(jbib.P_ISSN)
            if eissns is not None and len(eissns) > 0:
                rbj.add_identifier(rbj.E_ISSN, eissns[0])
            if pissns is not None and len(pissns) > 0:
                rbj.add_identifier(rbj.P_ISSN, pissns[0])
        except KeyError:
            # no issns, don't worry about it for now
            pass

        return trip

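    # A hedged usage sketch: the boolean "tripwire" returned above signals
    # whether the journal actually changed the article, so callers can avoid
    # unnecessary re-saves (the article and journal objects are assumed):
    #
    #     reg = Journal()
    #     if article.add_journal_metadata(j=journal, reg=reg):
    #         article.save()    # only persist if something was copied across
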
    def merge(self, old, take_id=True):
        # this takes an old version of the article and brings
        # forward any useful information that is needed. The rules of merge are:
        # - ignore "index" (it gets regenerated on save)
        # - always take the "created_date"
        # - take any top level field that does not exist in the current item (esp "id" and "history")
        # - in "admin", copy any field that does not already exist

        # first thing to do is create a snapshot of the old record
        old.snapshot()

        # now go on and do the merge

        # always take the created date
        self.set_created(old.created_date)

        # take the id
        if self.id is None or take_id:
            self.set_id(old.id)

        # take the history (deprecated)
        if len(self.data.get("history", [])) == 0:
            self.data["history"] = deepcopy(old.data.get("history", []))

        # take the bibjson
        if "bibjson" not in self.data:
            self.set_bibjson(deepcopy(old.bibjson()))

        # take the admin if there isn't one
        if "admin" not in self.data:
            self.data["admin"] = deepcopy(old.data.get("admin", {}))
        else:
            # otherwise, copy any admin keys that don't exist on the current item
            oa = old.data.get("admin", {})
            for key in oa:
                if key not in self.data["admin"]:
                    self.data["admin"][key] = deepcopy(oa[key])

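    # A hedged sketch of the merge contract (both records assumed to exist):
    # the current record adopts the old record's id and created_date, and the
    # old record's state is preserved first as an ArticleHistory snapshot.
    #
    #     new_article.merge(old_article)              # take_id=True by default
    #     assert new_article.id == old_article.id
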
    def _generate_index(self):
        # the index fields we are going to generate
        issns = []
        subjects = []
        schema_subjects = []
        schema_codes = []
        schema_codes_tree = []
        classification = []
        langs = []
        country = None
        publisher = []
        classification_paths = []
        unpunctitle = None
        asciiunpunctitle = None
        doi = None
        fulltext = None

        # the places we're going to get those fields from
        cbib = self.bibjson()
        jindex = self.data.get('index', {})
        hist = self.history()

        # get the issns out of the current bibjson
        issns += cbib.get_identifiers(cbib.P_ISSN)
        issns += cbib.get_identifiers(cbib.E_ISSN)

        # get the issns from the journal bibjson
        if isinstance(cbib.journal_issns, list):
            issns += cbib.journal_issns

        # de-duplicate the issns
        issns = list(set(issns))

        # now get the issns out of the historic records
        for date, hbib in hist:
            issns += hbib.get_identifiers(hbib.P_ISSN)
            issns += hbib.get_identifiers(hbib.E_ISSN)

        # get the subjects and concatenate them with their schemes from the current bibjson
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            subjects.append(term)
            schema_subjects.append(scheme + ":" + term)
            classification.append(term)
            if "code" in subs:
                schema_codes.append(scheme + ":" + subs.get("code"))

        # copy the languages
        if len(cbib.journal_language) > 0:
            langs = [datasets.name_for_lang(l) for l in cbib.journal_language]

        # get the country name from the bibjson country code
        if cbib.journal_country:
            country = datasets.get_country_name(cbib.journal_country)

        # copy the publisher/provider
        if cbib.publisher:
            publisher.append(cbib.publisher)

        # deduplicate the lists
        issns = list(set(issns))
        subjects = list(set(subjects))
        schema_subjects = list(set(schema_subjects))
        classification = list(set(classification))
        publisher = list(set(publisher))
        langs = list(set(langs))
        schema_codes = list(set(schema_codes))

        # work out what the date of publication is
        date = cbib.get_publication_date()

        # calculate the classification paths
        from portality.lcc import lcc  # inline import since this hits the database
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            if scheme == "LCC":
                path = lcc.pathify(term)
                if path is not None:
                    classification_paths.append(path)

        # normalise the classification paths, so we only store the longest ones
        classification_paths = lcc.longest(classification_paths)
        schema_codes_tree = cbib.lcc_codes_full_list()

        # create an unpunctitle
        if cbib.title is not None:
            throwlist = string.punctuation + '\n\t'
            unpunctitle = "".join(c for c in cbib.title if c not in throwlist).strip()
            try:
                asciiunpunctitle = unidecode(unpunctitle)
            except Exception:
                asciiunpunctitle = unpunctitle

        # create a normalised version of the DOI for deduplication
        source_doi = cbib.get_one_identifier(constants.IDENT_TYPE_DOI)
        try:
            doi = normalise.normalise_doi(source_doi)
        except ValueError:
            # if we can't normalise the DOI, just store it cast to lower case
            doi = source_doi.lower() if source_doi is not None else None

        # create a normalised version of the fulltext URL for deduplication
        fulltexts = cbib.get_urls(constants.LINK_TYPE_FULLTEXT)
        if len(fulltexts) > 0:
            source_fulltext = fulltexts[0]
            try:
                fulltext = normalise.normalise_url(source_fulltext)
            except ValueError:
                # if we can't normalise the fulltext, store it as-is
                fulltext = source_fulltext

        # build the index part of the object
        self.data["index"] = {}
        if len(issns) > 0:
            self.data["index"]["issn"] = issns
        if date != "":
            self.data["index"]["date"] = date
            self.data["index"]["date_toc_fv_month"] = date  # duplicated so we can have year/month facets in fv2
        if len(subjects) > 0:
            self.data["index"]["subject"] = subjects
        if len(schema_subjects) > 0:
            self.data["index"]["schema_subject"] = schema_subjects
        if len(classification) > 0:
            self.data["index"]["classification"] = classification
        if len(publisher) > 0:
            self.data["index"]["publisher"] = publisher
        if len(langs) > 0:
            self.data["index"]["language"] = langs
        if country is not None:
            self.data["index"]["country"] = country
        if len(schema_codes) > 0:
            self.data["index"]["schema_code"] = schema_codes
        if len(classification_paths) > 0:
            self.data["index"]["classification_paths"] = classification_paths
        if unpunctitle is not None:
            self.data["index"]["unpunctitle"] = unpunctitle
        if asciiunpunctitle is not None:
            self.data["index"]["asciiunpunctitle"] = asciiunpunctitle
        if doi is not None:
            self.data["index"]["doi"] = doi
        if fulltext is not None:
            self.data["index"]["fulltext"] = fulltext
        if len(schema_codes_tree) > 0:
            self.data["index"]["schema_codes_tree"] = schema_codes_tree

    def prep(self):
        self._generate_index()
        self.data['last_updated'] = dates.now_str()

    def save(self, *args, **kwargs):
        self._generate_index()
        return super(Article, self).save(*args, **kwargs)

    def get_owner(self):
        b = self.bibjson()
        article_issns = b.get_identifiers(b.P_ISSN)
        article_issns += b.get_identifiers(b.E_ISSN)
        owners = []

        seen_journal_issns = {}
        for issn in article_issns:
            journals = Journal.find_by_issn(issn)
            if journals is not None and len(journals) > 0:
                for j in journals:
                    owners.append(j.owner)
                    if j.owner not in seen_journal_issns:
                        seen_journal_issns[j.owner] = []
                    seen_journal_issns[j.owner] += j.bibjson().issns()

        # deduplicate the list of owners
        owners = list(set(owners))

        # no owner means we can't confirm
        if len(owners) == 0:
            raise NoValidOwnerException

        # multiple owners means ownership of this article is confused
        if len(owners) > 1:
            raise NoValidOwnerException

        return owners[0]

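    # A hedged usage sketch: get_owner() raises NoValidOwnerException both when
    # no owning journal is found and when ownership is ambiguous, so callers
    # should treat the exception as "cannot attribute this article":
    #
    #     try:
    #         owner = article.get_owner()
    #     except NoValidOwnerException:
    #         owner = None

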
class ArticleTombstone(Article):
    __type__ = "article_tombstone"

    def snapshot(self):
        return None

    def is_in_doaj(self):
        return False

    def prep(self):
        self.data['last_updated'] = dates.now_str()

    def save(self, *args, **kwargs):
        return super(ArticleTombstone, self).save(*args, **kwargs)


class ArticleBibJSON(GenericBibJSON):

    def __init__(self, bibjson=None, **kwargs):
        self._add_struct(shared_structs.SHARED_BIBJSON.get("structs", {}).get("bibjson"))
        self._add_struct(ARTICLE_BIBJSON_EXTENSION.get("structs", {}).get("bibjson"))
        super(ArticleBibJSON, self).__init__(bibjson, **kwargs)

    # article-specific simple getters and setters
    @property
    def year(self):
        return self._get_single("year")

    @year.setter
    def year(self, val):
        self._set_with_struct("year", val)

    @year.deleter
    def year(self):
        self._delete("year")

    @property
    def month(self):
        return self._get_single("month")

    @month.setter
    def month(self, val):
        self._set_with_struct("month", val)

    @month.deleter
    def month(self):
        self._delete("month")

    @property
    def start_page(self):
        return self._get_single("start_page")

    @start_page.setter
    def start_page(self, val):
        self._set_with_struct("start_page", val)

    @property
    def end_page(self):
        return self._get_single("end_page")

    @end_page.setter
    def end_page(self, val):
        self._set_with_struct("end_page", val)

    @property
    def abstract(self):
        return self._get_single("abstract")

    @abstract.setter
    def abstract(self, val):
        self._set_with_struct("abstract", val)

    # article-specific complex part getters and setters

    @property
    def volume(self):
        return self._get_single("journal.volume")

    @volume.setter
    def volume(self, value):
        self._set_with_struct("journal.volume", value)

    @property
    def number(self):
        return self._get_single("journal.number")

    @number.setter
    def number(self, value):
        self._set_with_struct("journal.number", value)

    @property
    def journal_title(self):
        return self._get_single("journal.title")

    @journal_title.setter
    def journal_title(self, title):
        self._set_with_struct("journal.title", title)

    @property
    def journal_language(self):
        return self._get_list("journal.language")

    @journal_language.setter
    def journal_language(self, lang):
        self._set_with_struct("journal.language", lang)

    @property
    def journal_country(self):
        return self._get_single("journal.country")

    @journal_country.setter
    def journal_country(self, country):
        self._set_single("journal.country", country)

    @property
    def journal_issns(self):
        return self._get_list("journal.issns")

    @journal_issns.setter
    def journal_issns(self, issns):
        self._set_with_struct("journal.issns", issns)

    @property
    def publisher(self):
        return self._get_single("journal.publisher")

    @publisher.setter
    def publisher(self, value):
        self._set_with_struct("journal.publisher", value)

    def add_author(self, name, affiliation=None, orcid_id=None):
        aobj = {"name": name}
        if affiliation is not None:
            aobj["affiliation"] = affiliation
        if orcid_id is not None:
            aobj["orcid_id"] = orcid_id
        self._add_to_list_with_struct("author", aobj)

    @property
    def author(self):
        return self._get_list("author")

    @author.setter
    def author(self, authors):
        self._set_with_struct("author", authors)

    def get_publication_date(self, date_format=FMT_DATETIME_STD):
        # work out what the date of publication is
        date = ""
        if self.year is not None:
            if type(self.year) is str:  # it should be, if the mappings are correct, but len() needs a sequence
                # fix 2 digit years
                if len(self.year) == 2:
                    try:
                        intyear = int(self.year)
                    except ValueError:
                        # if it's 2 chars long and the 2 chars don't make an integer,
                        # forget it
                        return date

                    # in the case of truncated years, assume it's this century if before the current year
                    if intyear <= int(str(dates.now().year)[-2:]):
                        self.year = "20" + self.year    # For readability over long-lasting code, I have refrained
                    else:                               # from using str(dates.now().year)[:2] here.
                        self.year = "19" + self.year    # But don't come crying to me 90-ish years from now.

                # if we still don't have a 4 digit year, forget it
                if len(self.year) != 4:
                    return date

            # build up our proposed datestamp
            date += str(self.year)
            if self.month is not None:
                try:
                    if type(self.month) is int:
                        if 1 <= int(self.month) <= 12:
                            month_number = self.month
                        else:
                            month_number = 1
                    elif len(self.month) <= 2:
                        if 1 <= int(self.month) <= 12:
                            month_number = self.month
                        else:
                            month_number = '1'
                    elif len(self.month) == 3:  # 'May' works with either case, obvz.
                        month_number = datetime.strptime(self.month, '%b').month
                    else:
                        month_number = datetime.strptime(self.month, '%B').month

                    # pad the month number to two digits; this accepts int or string
                    date += '-{:0>2}'.format(month_number)
                except Exception:
                    # if something goes wrong, just assume it's January
                    date += "-01"
            else:
                date += "-01"
            date += "-01T00:00:00Z"

        # attempt to confirm the format of our datestamp
        try:
            datecheck = dates.parse(date)
            date = datecheck.strftime(date_format)
        except Exception:
            return ""

        return date

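    # A hedged worked example (assuming FMT_DATETIME_STD is the standard
    # "%Y-%m-%dT%H:%M:%SZ" pattern and the current year is in the 2000s):
    #
    #     bj = ArticleBibJSON()
    #     bj.year, bj.month = "99", "May"
    #     bj.get_publication_date()    # -> "1999-05-01T00:00:00Z"; "99" is above
    #                                  #    the current 2-digit year, so 19xx
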
    def remove_journal_metadata(self):
        self._delete("journal")

    def vancouver_citation(self):
        jtitle = self.journal_title
        year = self.year
        vol = self.volume
        iss = self.number
        start = self.start_page
        end = self.end_page

        citation = ""

        if year:
            citation += year + ";"

        if vol:
            citation += vol

        if iss:
            citation += "(" + iss + ")"

        if start or end:
            if citation != "":
                citation += ":"
            if start:
                citation += start
            if end:
                if start:
                    citation += "-"
                citation += end

        return jtitle.strip(), citation

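    # A hedged worked example of the citation assembly (values hypothetical):
    # year "2020", volume "5", issue "2" and pages "100"-"110" yield the tuple
    # ("Example Journal", "2020;5(2):100-110"); the journal title is returned
    # separately from the numeric citation string.
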
    def lcc_codes_full_list(self):
        full_list = set()

        from portality.lcc import lcc  # inline import since this hits the database
        for subs in self.subjects():
            scheme = subs.get("scheme")
            if scheme != "LCC":
                continue
            code = subs.get("code")
            expanded = lcc.expand_codes(code)
            full_list.update(expanded)

        return ["LCC:" + x for x in full_list if x is not None]


##################################################

class ArticleQuery(object):
    base_query = {
        "track_total_hits" : True,
        "query" : {
            "bool" : {
                "must" : []
            }
        }
    }

    _issn_terms = { "terms" : {"index.issn.exact" : ["<list of issns here>"]} }
    _volume_term = { "term" : {"bibjson.journal.volume.exact" : "<volume here>"} }

    def __init__(self, issns=None, volume=None, in_doaj=None):
        self.issns = issns
        self.volume = volume
        self.in_doaj = in_doaj

    def query(self):
        q = deepcopy(self.base_query)

        if self.issns is not None:
            iq = deepcopy(self._issn_terms)
            iq["terms"]["index.issn.exact"] = self.issns
            q["query"]["bool"]["must"].append(iq)

        if self.volume is not None:
            vq = deepcopy(self._volume_term)
            vq["term"]["bibjson.journal.volume.exact"] = self.volume
            q["query"]["bool"]["must"].append(vq)

        if self.in_doaj is not None:
            q["query"]["bool"]["must"].append({"term": {"admin.in_doaj": self.in_doaj}})

        return q

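# A hedged sketch of the query this class emits (ISSN hypothetical):
#
#     ArticleQuery(issns=["1234-5678"], in_doaj=True).query()
#     # -> {"track_total_hits": True,
#     #     "query": {"bool": {"must": [
#     #         {"terms": {"index.issn.exact": ["1234-5678"]}},
#     #         {"term": {"admin.in_doaj": True}}
#     #     ]}}}

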
class ArticleVolumesQuery(object):
    base_query = {
        "track_total_hits": True,
        "query" : {
            "bool": {
                "filter": {
                    "terms" : {"index.issn.exact" : ["<list of issns here>"]}
                }
            }
        },
        "size" : 0,
        "aggs" : {
            "vols" : {
                "terms" : {
                    "field" : "bibjson.journal.volume.exact",
                    "order": {"_key" : "desc"},
                    "size" : 1000
                }
            }
        }
    }

    def __init__(self, issns=None):
        self.issns = issns

    def query(self):
        q = deepcopy(self.base_query)
        q["query"]["bool"]["filter"]["terms"]["index.issn.exact"] = self.issns
        return q


class ArticleVolumesIssuesQuery(object):
    base_query = {
        "track_total_hits": True,
        "query" : {
            "bool": {
                "filter": {
                    "bool": {
                        "must": [
                            {"terms" : {"index.issn.exact" : ["<list of issns here>"]}},
                            {"term" : {"bibjson.journal.volume.exact" : "<volume here>"}}
                        ]
                    }
                }
            }
        },
        "size" : 0,
        "aggs" : {
            "issues" : {
                "terms" : {
                    "field" : "bibjson.journal.number.exact",
                    "order": {"_key" : "desc"},
                    "size" : 1000
                }
            }
        }
    }

    def __init__(self, issns=None, volume=None):
        self.issns = issns
        self.volume = volume

    def query(self):
        q = deepcopy(self.base_query)
        q["query"]["bool"]["filter"]["bool"]["must"][0]["terms"]["index.issn.exact"] = self.issns
        q["query"]["bool"]["filter"]["bool"]["must"][1]["term"]["bibjson.journal.volume.exact"] = self.volume
        return q


class DuplicateArticleQuery(object):
    base_query = {
        "track_total_hits" : True,
        "query": {
            "bool": {
                "must": []
            }
        },
        "sort": [{"last_updated": {"order": "desc"}}]
    }

    _should = {
        "should" : [],
        "minimum_should_match" : 2
    }

    _volume_term = {"term" : {"bibjson.journal.volume.exact" : "<volume>"}}
    _number_term = {"term" : {"bibjson.journal.number.exact" : "<issue number>"}}
    _start_term = {"term" : {"bibjson.start_page.exact" : "<start page>"}}
    _issn_terms = {"terms" : { "index.issn.exact" : ["<list of issns>"] }}
    _pubrec_term = {"term" : {"admin.publisher_record_id.exact" : "<publisher record id>"}}
    _identifier_term = {"term" : {"bibjson.identifier.id.exact" : "<issn here>"}}
    _doi_term = {"term" : {"index.doi.exact" : "<doi here>"}}
    _fulltext_terms = {"terms" : {"index.fulltext.exact" : ["<fulltext here>"]}}
    _url_terms = {"terms" : {"bibjson.link.url.exact" : ["<url here>"]}}
    _fuzzy_title = {"fuzzy" : {"bibjson.title.exact" : "<title here>"}}

    def __init__(self, issns=None, publisher_record_id=None, doi=None, urls=None, title=None, volume=None, number=None, start=None, should_match=None, size=10):
        self.issns = issns if isinstance(issns, list) else []
        self.publisher_record_id = publisher_record_id
        self.doi = doi
        self.urls = urls if isinstance(urls, list) else [urls] if isinstance(urls, str) else []
        self.title = title
        self.volume = volume
        self.number = number
        self.start = start
        self.should_match = should_match
        self.size = size

    def query(self):
        # - MUST be from at least one of the ISSNs
        # - MUST have the publisher record id
        # - MUST have the doi unless should_match is set
        # - MUST have one of the fulltext urls unless should_match is set
        # - MUST fuzzy match the title
        # - SHOULD have <should_match> of: volume, issue, start page, fulltext url, doi

        q = deepcopy(self.base_query)
        if len(self.issns) > 0:
            it = deepcopy(self._issn_terms)
            it["terms"]["index.issn.exact"] = self.issns
            q["query"]["bool"]["must"].append(it)

        if self.publisher_record_id is not None:
            pr = deepcopy(self._pubrec_term)
            pr["term"]["admin.publisher_record_id.exact"] = self.publisher_record_id
            q["query"]["bool"]["must"].append(pr)

        if self.doi is not None and self.should_match is None:
            idt = deepcopy(self._doi_term)
            # idt["term"]["bibjson.identifier.id.exact"] = self.doi
            idt["term"]["index.doi.exact"] = self.doi
            q["query"]["bool"]["must"].append(idt)

        if len(self.urls) > 0 and self.should_match is None:
            uq = deepcopy(self._fulltext_terms)
            # uq["terms"]["bibjson.link.url.exact"] = self.urls
            uq["terms"]["index.fulltext.exact"] = self.urls
            q["query"]["bool"]["must"].append(uq)

        if self.title is not None:
            ft = deepcopy(self._fuzzy_title)
            ft["fuzzy"]["bibjson.title.exact"] = self.title
            q["query"]["bool"]["must"].append(ft)

        if self.should_match is not None:
            term_count = 0
            s = deepcopy(self._should)

            if self.volume is not None:
                term_count += 1
                vt = deepcopy(self._volume_term)
                vt["term"]["bibjson.journal.volume.exact"] = self.volume
                s["should"].append(vt)

            if self.number is not None:
                term_count += 1
                nt = deepcopy(self._number_term)
                nt["term"]["bibjson.journal.number.exact"] = self.number
                s["should"].append(nt)

            if self.start is not None:
                term_count += 1
                st = deepcopy(self._start_term)
                st["term"]["bibjson.start_page.exact"] = self.start
                s["should"].append(st)

            if len(self.urls) > 0:
                term_count += 1
                uq = deepcopy(self._url_terms)
                uq["terms"]["bibjson.link.url.exact"] = self.urls
                s["should"].append(uq)

            if self.doi is not None:
                term_count += 1
                idt = deepcopy(self._identifier_term)
                idt["term"]["bibjson.identifier.id.exact"] = self.doi
                s["should"].append(idt)

            msm = self.should_match
            if msm > term_count:
                msm = term_count
            s["minimum_should_match"] = msm

            q["query"]["bool"].update(s)

        # allow more results than the default
        q["size"] = self.size

        return q

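    # A hedged sketch of the should-clause clamping (values hypothetical): with
    # should_match=2 but only a volume supplied, minimum_should_match is capped
    # at the single clause that was actually built:
    #
    #     q = DuplicateArticleQuery(issns=["1234-5678"], volume="5",
    #                               should_match=2).query()
    #     # q["query"]["bool"]["minimum_should_match"] == 1

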
def _human_sort(things, reverse=True):
    numeric = []
    non_numeric = []
    nmap = {}
    for v in things:
        try:
            # try to convert v to an int
            vint = int(v)

            # remember the original string (it may have leading 0s)
            try:
                nmap[vint].append(v)
            except KeyError:
                nmap[vint] = [v]
            numeric.append(vint)
        except (ValueError, TypeError):
            non_numeric.append(v)

    numeric.sort(reverse=reverse)
    non_numeric.sort(reverse=reverse)

    # convert the integers back to their string representation
    return reduce(lambda x, y: x + y, [nmap[n] for n in numeric], []) + non_numeric

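# A hedged worked example: numeric strings sort as integers (descending by
# default) ahead of the lexically sorted non-numeric values:
#
#     _human_sort(["10", "2", "Supplement", "1"])
#     # -> ["10", "2", "1", "Supplement"]

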
def _sort_articles(articles):
    # first extract the array we want to sort on
    # and make a map of that value to the article itself
    unsorted = []
    numbers = []
    imap = {}
    for art in articles:
        sp = art.get("bibjson.start_page", [None])[0]

        # can't sort anything that doesn't have a start page
        if sp is None:
            unsorted.append(art)
            continue

        # deal with start page clashes and record the start pages
        # to sort by
        if sp not in numbers:
            numbers.append(sp)
        if sp in imap:
            imap[sp].append(art)
        else:
            imap[sp] = [art]

    sorted_keys = _human_sort(numbers, reverse=False)

    s = []
    for n in sorted_keys:
        s += [x for x in imap[n]]
    s += [x for x in unsorted]

    return s

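# A hedged sketch of the expected input shape: each item is assumed to be a
# dict-like ES hit exposing a "bibjson.start_page" field list, e.g.
#
#     hits = [{"bibjson.start_page": ["10"]}, {"bibjson.start_page": ["2"]}, {}]
#     _sort_articles(hits)    # -> the "2" hit, the "10" hit, then the unpaged hit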