Coverage for portality/models/article.py: 81%
716 statements
coverage.py v6.4.2, created at 2022-11-09 16:22 +0000

import string
import warnings

from unidecode import unidecode
from functools import reduce
from copy import deepcopy
from datetime import datetime

from portality import datasets, constants
from portality.dao import DomainObject
from portality.models import Journal
from portality.models.v1.bibjson import GenericBibJSON  # NOTE that article specifically uses the v1 BibJSON
from portality.models.v1 import shared_structs
from portality.lib import normalise

class NoJournalException(Exception):
    pass


class NoValidOwnerException(Exception):
    pass

class Article(DomainObject):
    __type__ = "article"

    @classmethod
    def duplicates(cls, publisher_record_id=None, doi=None, fulltexts=None, title=None, volume=None, number=None, start=None, should_match=None, size=10):
        # some input sanitisation
        urls = fulltexts if isinstance(fulltexts, list) else [fulltexts] if isinstance(fulltexts, str) else []

        # make sure that we're dealing with the normal form of the identifiers
        norm_urls = []
        for url in urls:
            try:
                norm = normalise.normalise_url(url)
                norm_urls.append(norm)
            except ValueError:
                # use the non-normal form
                norm_urls.append(url)
        urls = norm_urls

        try:
            doi = normalise.normalise_doi(doi)
        except ValueError:
            # leave the doi as it is
            pass

        q = DuplicateArticleQuery(publisher_record_id=publisher_record_id,
                                  doi=doi,
                                  urls=urls,
                                  title=title,
                                  volume=volume,
                                  number=number,
                                  start=start,
                                  should_match=should_match,
                                  size=size)

        # res = cls.query(q=q.query())
        # return [cls(**hit.get("_source")) for hit in res.get("hits", {}).get("hits", [])]
        return cls.q2obj(q=q.query())
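
    # A minimal usage sketch for duplicates() (illustrative only; the DOI, URL
    # and title values below are invented):
    #
    #   candidates = Article.duplicates(doi="10.1234/example.doi",
    #                                   fulltexts=["http://example.com/article/1"],
    #                                   title="An Example Article Title",
    #                                   size=5)
    #   for candidate in candidates:
    #       print(candidate.id)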

    @classmethod
    def list_volumes(cls, issns):
        q = ArticleVolumesQuery(issns)
        result = cls.query(q=q.query())
        return _human_sort([t.get("key") for t in result.get("aggregations", {}).get("vols", {}).get("buckets", [])])

    @classmethod
    def list_volume_issues(cls, issns, volume):
        q = ArticleVolumesIssuesQuery(issns, volume)
        result = cls.query(q=q.query())
        return _human_sort([t.get("key") for t in result.get("aggregations", {}).get("issues", {}).get("buckets", [])])

    @classmethod
    def get_by_volume(cls, issns, volume):
        q = ArticleQuery(issns=issns, volume=volume)
        articles = cls.iterate(q.query(), page_size=1000)
        return articles

    @classmethod
    def find_by_issns(cls, issns):
        q = ArticleQuery(issns=issns)
        articles = cls.iterate(q.query(), page_size=1000)
        return articles

    @classmethod
    def count_by_issns(cls, issns):
        q = ArticleQuery(issns=issns)
        return cls.hit_count(q.query())

    @classmethod
    def delete_by_issns(cls, issns, snapshot=True):
        q = ArticleQuery(issns=issns)
        cls.delete_selected(query=q.query(), snapshot=snapshot)

    @classmethod
    def delete_selected(cls, query=None, owner=None, snapshot=True):
        if owner is not None:
            from portality.models import Journal
            issns = Journal.issns_by_owner(owner)
            q = ArticleQuery(issns=issns)
            query = q.query()

        if snapshot:
            articles = cls.iterate(query, page_size=1000)
            for article in articles:
                article.snapshot()
        return cls.delete_by_query(query)

    def bibjson(self, **kwargs):
        if "bibjson" not in self.data:
            self.data["bibjson"] = {}
        return ArticleBibJSON(self.data.get("bibjson"), **kwargs)

    def set_bibjson(self, bibjson):
        bibjson = bibjson.bibjson if isinstance(bibjson, ArticleBibJSON) else bibjson
        self.data["bibjson"] = bibjson

    def history(self):
        hs = self.data.get("history", [])
        tuples = []
        for h in hs:
            tuples.append((h.get("date"), ArticleBibJSON(h.get("bibjson"))))
        return tuples

    def snapshot(self):
        from portality.models import ArticleHistory

        snap = deepcopy(self.data)
        if "id" in snap:
            snap["about"] = snap["id"]
            del snap["id"]
        if "index" in snap:
            del snap["index"]
        if "last_updated" in snap:
            del snap["last_updated"]
        if "created_date" in snap:
            del snap["created_date"]

        hist = ArticleHistory(**snap)
        hist.save()
        return hist.id

    def add_history(self, bibjson, date=None):
        """Deprecated"""
        bibjson = bibjson.bibjson if isinstance(bibjson, ArticleBibJSON) else bibjson
        if date is None:
            date = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
        snobj = {"date": date, "bibjson": bibjson}
        if "history" not in self.data:
            self.data["history"] = []
        self.data["history"].append(snobj)

    def is_in_doaj(self):
        return self.data.get("admin", {}).get("in_doaj", False)

    def set_in_doaj(self, value):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["in_doaj"] = value

    def has_seal(self):
        return self.data.get("admin", {}).get("seal", False)

    def set_seal(self, value):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["seal"] = value

    def publisher_record_id(self):
        return self.data.get("admin", {}).get("publisher_record_id")

    def set_publisher_record_id(self, pri):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["publisher_record_id"] = pri

    def upload_id(self):
        return self.data.get("admin", {}).get("upload_id")

    def set_upload_id(self, uid):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["upload_id"] = uid

    def get_normalised_doi(self):
        if self.data.get("index", {}).get("doi") is not None:
            return self.data["index"]["doi"]
        doi = self.bibjson().get_one_identifier(constants.IDENT_TYPE_DOI)
        if doi is None:
            return None
        try:
            return normalise.normalise_doi(doi)
        except ValueError:
            # can't be normalised, so we just return the doi as-is
            return doi

    def get_normalised_fulltext(self):
        if self.data.get("index", {}).get("fulltext") is not None:
            return self.data["index"]["fulltext"]
        fulltexts = self.bibjson().get_urls(constants.LINK_TYPE_FULLTEXT)
        if len(fulltexts) == 0:
            return None
        try:
            return normalise.normalise_url(fulltexts[0])
        except ValueError:
            # can't be normalised, so we just return the url as-is
            return fulltexts[0]

    def get_journal(self):
        """
        Get this article's associated journal
        :return: A Journal, or None if this is an orphan article
        """
        bibjson = self.bibjson()

        # first, get the ISSNs associated with the record
        pissns = bibjson.get_identifiers(bibjson.P_ISSN)
        eissns = bibjson.get_identifiers(bibjson.E_ISSN)
        allissns = list(set(pissns + eissns))

        # find a matching journal record from the index
        journal = None
        for issn in allissns:
            journals = Journal.find_by_issn(issn)
            if len(journals) > 0:
                # there should only ever be one, so take the first one
                journal = journals[0]
                break

        return journal

    def get_associated_journals(self):
        # find all matching journal records from the index
        allissns = self.bibjson().issns()
        return Journal.find_by_issn(allissns)

    def add_journal_metadata(self, j=None, reg=None):
        """
        this function makes sure the article is populated
        with all the relevant info from its owning parent object
        :param j: Pass in a Journal to bypass the (slow) locating step. MAKE SURE IT'S THE RIGHT ONE!
        """

        # Record the data that is copied into the article into the "reg"ister, in case the
        # caller needs to know exactly and only which information was copied
        if reg is None:
            reg = Journal()
        rbj = reg.bibjson()

        if j is None:
            journal = self.get_journal()
        else:
            journal = j

        # we were unable to find a journal
        if journal is None:
            raise NoJournalException("Unable to find a journal associated with this article")

        # if we get to here, we have a journal record we want to pull data from
        jbib = journal.bibjson()
        bibjson = self.bibjson()

        # tripwire to be tripped if the journal makes changes to the article
        trip = False

        if bibjson.subjects() != jbib.subjects():
            trip = True
            bibjson.set_subjects(jbib.subjects())
        rbj.set_subjects(jbib.subjects())

        if jbib.title is not None:
            if bibjson.journal_title != jbib.title:
                trip = True
                bibjson.journal_title = jbib.title
            rbj.title = jbib.title

        if len(jbib.language) > 0:
            jlang = jbib.language
            alang = bibjson.journal_language
            jlang.sort()
            alang.sort()
            if jlang != alang:
                bibjson.journal_language = jbib.language
                trip = True
            rbj.set_language(jbib.language)

        if jbib.country is not None:
            if jbib.country != bibjson.journal_country:
                bibjson.journal_country = jbib.country
                trip = True
            rbj.country = jbib.country

        if jbib.publisher:
            if jbib.publisher != bibjson.publisher:
                bibjson.publisher = jbib.publisher
                trip = True
            rbj.publisher = jbib.publisher

        # Copy the seal info, in_doaj status and the journal's ISSNs
        if journal.is_in_doaj() != self.is_in_doaj():
            self.set_in_doaj(journal.is_in_doaj())
            trip = True
        reg.set_in_doaj(journal.is_in_doaj())

        if journal.has_seal() != self.has_seal():
            self.set_seal(journal.has_seal())
            trip = True
        reg.set_seal(journal.has_seal())

        try:
            aissns = bibjson.journal_issns
            jissns = jbib.issns()
            aissns.sort()
            jissns.sort()
            if aissns != jissns:
                bibjson.journal_issns = jbib.issns()
                trip = True

            eissns = jbib.get_identifiers(jbib.E_ISSN)
            pissns = jbib.get_identifiers(jbib.P_ISSN)
            if eissns is not None and len(eissns) > 0:
                rbj.add_identifier(rbj.E_ISSN, eissns[0])
            if pissns is not None and len(pissns) > 0:
                rbj.add_identifier(rbj.P_ISSN, pissns[0])
        except KeyError:
            # No issns, don't worry about it for now
            pass

        return trip
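
    # An illustrative sketch of how a caller might use the tripwire and the
    # register passed through the reg parameter (variable names are invented):
    #
    #   reg = Journal()
    #   changed = article.add_journal_metadata(reg=reg)
    #   if changed:
    #       # reg now holds exactly the journal fields that were copied across
    #       article.save()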

    def merge(self, old, take_id=True):
        # this takes an old version of the article and brings
        # forward any useful information that is needed. The rules of merge are:
        # - ignore "index" (it gets regenerated on save)
        # - always take the "created_date"
        # - any top level field that does not exist in the current item (esp "id" and "history")
        # - in "admin", copy any field that does not already exist

        # first thing to do is create a snapshot of the old record
        old.snapshot()

        # now go on and do the merge

        # always take the created date
        self.set_created(old.created_date)

        # take the id
        if self.id is None or take_id:
            self.set_id(old.id)

        # take the history (deprecated)
        if len(self.data.get("history", [])) == 0:
            self.data["history"] = deepcopy(old.data.get("history", []))

        # take the bibjson
        if "bibjson" not in self.data:
            self.set_bibjson(deepcopy(old.bibjson()))

        # take the admin if there isn't one
        if "admin" not in self.data:
            self.data["admin"] = deepcopy(old.data.get("admin", {}))
        else:
            # otherwise, copy any admin keys that don't exist on the current item
            oa = old.data.get("admin", {})
            for key in oa:
                if key not in self.data["admin"]:
                    self.data["admin"][key] = deepcopy(oa[key])
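
    # Merge semantics at a glance (a hedged example; the field values are
    # invented): if the old article has {"admin": {"upload_id": "abc"}} and the
    # new one has {"admin": {"in_doaj": True}}, then after new.merge(old) the
    # new article carries both admin keys, takes the old record's id and
    # created_date, and a snapshot of the old version has been saved.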

    def _generate_index(self):
        # the index fields we are going to generate
        issns = []
        subjects = []
        schema_subjects = []
        schema_codes = []
        schema_codes_tree = []
        classification = []
        langs = []
        country = None
        publisher = []
        classification_paths = []
        unpunctitle = None
        asciiunpunctitle = None
        doi = None
        fulltext = None

        # the places we're going to get those fields from
        cbib = self.bibjson()
        jindex = self.data.get('index', {})
        hist = self.history()

        # get the issns out of the current bibjson
        issns += cbib.get_identifiers(cbib.P_ISSN)
        issns += cbib.get_identifiers(cbib.E_ISSN)

        # get the issn from the journal bibjson
        if isinstance(cbib.journal_issns, list):
            issns += cbib.journal_issns

        # de-duplicate the issns
        issns = list(set(issns))

        # now get the issns out of the historic records
        for date, hbib in hist:
            issns += hbib.get_identifiers(hbib.P_ISSN)
            issns += hbib.get_identifiers(hbib.E_ISSN)

        # get the subjects and concatenate them with their schemes from the current bibjson
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            subjects.append(term)
            schema_subjects.append(scheme + ":" + term)
            classification.append(term)
            if "code" in subs:
                schema_codes.append(scheme + ":" + subs.get("code"))

        # copy the languages
        if len(cbib.journal_language) > 0:
            langs = [datasets.name_for_lang(l) for l in cbib.journal_language]

        # copy the country
        if jindex.get('country'):
            country = jindex.get('country')
        elif cbib.journal_country:
            country = datasets.get_country_name(cbib.journal_country)

        # copy the publisher/provider
        if cbib.publisher:
            publisher.append(cbib.publisher)

        # deduplicate the lists
        issns = list(set(issns))
        subjects = list(set(subjects))
        schema_subjects = list(set(schema_subjects))
        classification = list(set(classification))
        publisher = list(set(publisher))
        langs = list(set(langs))
        schema_codes = list(set(schema_codes))

        # work out what the date of publication is
        date = cbib.get_publication_date()

        # calculate the classification paths
        from portality.lcc import lcc  # inline import since this hits the database
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            if scheme == "LCC":
                path = lcc.pathify(term)
                if path is not None:
                    classification_paths.append(path)

        # normalise the classification paths, so we only store the longest ones
        classification_paths = lcc.longest(classification_paths)
        schema_codes_tree = cbib.lcc_codes_full_list()

        # create an unpunctitle
        if cbib.title is not None:
            throwlist = string.punctuation + '\n\t'
            unpunctitle = "".join(c for c in cbib.title if c not in throwlist).strip()
            try:
                asciiunpunctitle = unidecode(unpunctitle)
            except Exception:
                asciiunpunctitle = unpunctitle

        # determine if the seal is applied
        has_seal = "Yes" if self.has_seal() else "No"

        # create a normalised version of the DOI for deduplication
        source_doi = cbib.get_one_identifier(constants.IDENT_TYPE_DOI)
        try:
            doi = normalise.normalise_doi(source_doi)
        except ValueError:
            # if we can't normalise the DOI, just store it as-is
            doi = source_doi

        # create a normalised version of the fulltext URL for deduplication
        fulltexts = cbib.get_urls(constants.LINK_TYPE_FULLTEXT)
        if len(fulltexts) > 0:
            source_fulltext = fulltexts[0]
            try:
                fulltext = normalise.normalise_url(source_fulltext)
            except ValueError:
                # if we can't normalise the fulltext, store it as-is
                fulltext = source_fulltext

        # build the index part of the object
        self.data["index"] = {}
        if len(issns) > 0:
            self.data["index"]["issn"] = issns
        if date != "":
            self.data["index"]["date"] = date
            self.data["index"]["date_toc_fv_month"] = date  # Duplicated so we can have year/month facets in fv2
        if len(subjects) > 0:
            self.data["index"]["subject"] = subjects
        if len(schema_subjects) > 0:
            self.data["index"]["schema_subject"] = schema_subjects
        if len(classification) > 0:
            self.data["index"]["classification"] = classification
        if len(publisher) > 0:
            self.data["index"]["publisher"] = publisher
        if len(langs) > 0:
            self.data["index"]["language"] = langs
        if country is not None:
            self.data["index"]["country"] = country
        if len(schema_codes) > 0:
            self.data["index"]["schema_code"] = schema_codes
        if len(classification_paths) > 0:
            self.data["index"]["classification_paths"] = classification_paths
        if unpunctitle is not None:
            self.data["index"]["unpunctitle"] = unpunctitle
        if asciiunpunctitle is not None:
            self.data["index"]["asciiunpunctitle"] = asciiunpunctitle
        if has_seal:
            self.data["index"]["has_seal"] = has_seal
        if doi is not None:
            self.data["index"]["doi"] = doi
        if fulltext is not None:
            self.data["index"]["fulltext"] = fulltext
        if len(schema_codes_tree) > 0:
            self.data["index"]["schema_codes_tree"] = schema_codes_tree
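
    # For orientation, a sketch of the index section this method produces (all
    # values invented; each key appears only when the source data is present):
    #
    #   {
    #       "issn": ["1234-5678", "9876-543X"],
    #       "date": "2022-01-01T00:00:00Z",
    #       "date_toc_fv_month": "2022-01-01T00:00:00Z",
    #       "subject": ["Medicine"],
    #       "schema_subject": ["LCC:Medicine"],
    #       "has_seal": "No",
    #       "doi": "10.1234/example",
    #       "fulltext": "http://example.com/article/1"
    #   }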

    def prep(self):
        self._generate_index()
        self.data['last_updated'] = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")

    def save(self, *args, **kwargs):
        self._generate_index()
        return super(Article, self).save(*args, **kwargs)

    def get_owner(self):
        b = self.bibjson()
        article_issns = b.get_identifiers(b.P_ISSN)
        article_issns += b.get_identifiers(b.E_ISSN)
        owners = []

        seen_journal_issns = {}
        for issn in article_issns:
            journals = Journal.find_by_issn(issn)
            if journals is not None and len(journals) > 0:
                for j in journals:
                    owners.append(j.owner)
                    if j.owner not in seen_journal_issns:
                        seen_journal_issns[j.owner] = []
                    seen_journal_issns[j.owner] += j.bibjson().issns()

        # deduplicate the list of owners
        owners = list(set(owners))

        # no owner means we can't confirm
        if len(owners) == 0:
            raise NoValidOwnerException

        # multiple owners means ownership of this article is confused
        if len(owners) > 1:
            raise NoValidOwnerException

        return owners[0]
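
    # A hedged usage sketch for get_owner(); both "no owner" and "multiple
    # owners" raise, so callers are expected to handle the exception:
    #
    #   try:
    #       owner = article.get_owner()
    #   except NoValidOwnerException:
    #       owner = None  # ownership could not be confirmed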

class ArticleBibJSON(GenericBibJSON):

    def __init__(self, bibjson=None, **kwargs):
        self._add_struct(shared_structs.SHARED_BIBJSON.get("structs", {}).get("bibjson"))
        self._add_struct(ARTICLE_BIBJSON_EXTENSION.get("structs", {}).get("bibjson"))
        super(ArticleBibJSON, self).__init__(bibjson, **kwargs)

    # article-specific simple getters and setters
    @property
    def year(self):
        return self._get_single("year")

    @year.setter
    def year(self, val):
        self._set_with_struct("year", val)

    @year.deleter
    def year(self):
        self._delete("year")

    @property
    def month(self):
        return self._get_single("month")

    @month.setter
    def month(self, val):
        self._set_with_struct("month", val)

    @month.deleter
    def month(self):
        self._delete("month")

    @property
    def start_page(self):
        return self._get_single("start_page")

    @start_page.setter
    def start_page(self, val):
        self._set_with_struct("start_page", val)

    @property
    def end_page(self):
        return self._get_single("end_page")

    @end_page.setter
    def end_page(self, val):
        self._set_with_struct("end_page", val)

    @property
    def abstract(self):
        return self._get_single("abstract")

    @abstract.setter
    def abstract(self, val):
        self._set_with_struct("abstract", val)

    # article-specific complex part getters and setters

    @property
    def volume(self):
        return self._get_single("journal.volume")

    @volume.setter
    def volume(self, value):
        self._set_with_struct("journal.volume", value)

    @property
    def number(self):
        return self._get_single("journal.number")

    @number.setter
    def number(self, value):
        self._set_with_struct("journal.number", value)

    @property
    def journal_title(self):
        return self._get_single("journal.title")

    @journal_title.setter
    def journal_title(self, title):
        self._set_with_struct("journal.title", title)

    @property
    def journal_language(self):
        return self._get_list("journal.language")

    @journal_language.setter
    def journal_language(self, lang):
        self._set_with_struct("journal.language", lang)

    @property
    def journal_country(self):
        return self._get_single("journal.country")

    @journal_country.setter
    def journal_country(self, country):
        self._set_single("journal.country", country)

    @property
    def journal_issns(self):
        return self._get_list("journal.issns")

    @journal_issns.setter
    def journal_issns(self, issns):
        self._set_with_struct("journal.issns", issns)

    @property
    def publisher(self):
        return self._get_single("journal.publisher")

    @publisher.setter
    def publisher(self, value):
        self._set_with_struct("journal.publisher", value)

    def add_author(self, name, affiliation=None, orcid_id=None):
        aobj = {"name": name}
        if affiliation is not None:
            aobj["affiliation"] = affiliation
        if orcid_id is not None:
            aobj["orcid_id"] = orcid_id
        self._add_to_list_with_struct("author", aobj)

    @property
    def author(self):
        return self._get_list("author")

    @author.setter
    def author(self, authors):
        self._set_with_struct("author", authors)

    def get_publication_date(self, date_format='%Y-%m-%dT%H:%M:%SZ'):
        # work out what the date of publication is
        date = ""
        if self.year is not None:
            if type(self.year) is str:  # It should be, if the mappings are correct. But len() needs a sequence.
                # fix 2 digit years
                if len(self.year) == 2:
                    try:
                        intyear = int(self.year)
                    except ValueError:
                        # if it's 2 chars long and the 2 chars don't make an integer,
                        # forget it
                        return date

                    # In the case of truncated years, assume it's this century if before the current year
                    if intyear <= int(str(datetime.utcnow().year)[:-2]):
                        self.year = "20" + self.year  # For readability over long-lasting code, I have refrained
                    else:                             # from using str(datetime.utcnow().year)[:2] here.
                        self.year = "19" + self.year  # But don't come crying to me 90-ish years from now.

                # if we still don't have a 4 digit year, forget it
                if len(self.year) != 4:
                    return date

            # build up our proposed datestamp
            date += str(self.year)
            if self.month is not None:
                try:
                    if type(self.month) is int:
                        if 1 <= int(self.month) <= 12:
                            month_number = self.month
                        else:
                            month_number = 1
                    elif len(self.month) <= 2:
                        if 1 <= int(self.month) <= 12:
                            month_number = self.month
                        else:
                            month_number = '1'
                    elif len(self.month) == 3:  # 'May' works with either case, obvz.
                        month_number = datetime.strptime(self.month, '%b').month
                    else:
                        month_number = datetime.strptime(self.month, '%B').month

                    # pad the month number to two digits. This accepts int or string
                    date += '-{:0>2}'.format(month_number)
                except Exception:
                    # If something goes wrong, just assume it's January
                    date += "-01"
            else:
                date += "-01"
            date += "-01T00:00:00Z"

        # attempt to confirm the format of our datestamp
        try:
            datecheck = datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ")
            date = datecheck.strftime(date_format)
        except Exception:
            return ""

        return date
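
    # Examples of the behaviour above (derived from the code; values invented):
    #   year="05", month="5"    ->  "2005-05-01T00:00:00Z"  (2-digit year promoted)
    #   year="99", month="May"  ->  "1999-05-01T00:00:00Z"  (month name parsed)
    #   year="abc"              ->  ""  (unparseable dates yield the empty string)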

    def remove_journal_metadata(self):
        self._delete("journal")

    def vancouver_citation(self):
        jtitle = self.journal_title
        year = self.year
        vol = self.volume
        iss = self.number
        start = self.start_page
        end = self.end_page

        citation = ""

        if year:
            citation += year + ";"

        if vol:
            citation += vol

        if iss:
            citation += "(" + iss + ")"

        if start or end:
            if citation != "":
                citation += ":"
            if start:
                citation += start
            if end:
                if start:
                    citation += "-"
                citation += end

        return jtitle.strip(), citation
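
    # Example output (invented values): for journal title "Example Journal",
    # year "2020", volume "4", issue "2" and pages 10 to 19, this returns
    # ("Example Journal", "2020;4(2):10-19"), leaving the caller to join the parts.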

    def lcc_codes_full_list(self):
        full_list = set()

        from portality.lcc import lcc  # inline import since this hits the database
        for subs in self.subjects():
            scheme = subs.get("scheme")
            if scheme != "LCC":
                continue
            code = subs.get("code")
            expanded = lcc.expand_codes(code)
            full_list.update(expanded)

        return ["LCC:" + x for x in full_list if x is not None]

ARTICLE_BIBJSON_EXTENSION = {
    "objects" : ["bibjson"],
    "structs" : {
        "bibjson" : {
            "fields" : {
                "year" : {"coerce" : "unicode"},
                "month" : {"coerce" : "unicode"},
                "start_page" : {"coerce" : "unicode"},
                "end_page" : {"coerce" : "unicode"},
                "abstract" : {"coerce" : "unicode"}
            },
            "lists" : {
                "author" : {"contains" : "object"}
            },
            "objects" : [
                "journal"
            ],

            "structs" : {
                "author" : {
                    "fields" : {
                        "name" : {"coerce" : "unicode"},
                        "affiliation" : {"coerce" : "unicode"},
                        "email" : {"coerce": "unicode"},
                        "orcid_id" : {"coerce" : "unicode"}
                    }
                },

                "journal" : {
                    "fields" : {
                        "volume" : {"coerce" : "unicode"},
                        "number" : {"coerce" : "unicode"},
                        "publisher" : {"coerce" : "unicode"},
                        "title" : {"coerce" : "unicode"},
                        "country" : {"coerce" : "unicode"}
                    },
                    "lists" : {
                        "language" : {"contains" : "field", "coerce" : "unicode"},
                        "issns" : {"contains" : "field", "coerce" : "unicode"}
                    }
                }
            }
        }
    }
}
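
# A sketch of an article bibjson document that fits the extension struct above
# (all field values invented):
#
#   {
#       "year": "2022", "month": "11",
#       "start_page": "1", "end_page": "10",
#       "abstract": "An example abstract.",
#       "author": [{"name": "A. Person", "affiliation": "Example University"}],
#       "journal": {
#           "volume": "4", "number": "2",
#           "title": "Example Journal", "publisher": "Example Press",
#           "country": "GB", "language": ["EN"], "issns": ["1234-5678"]
#       }
#   }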

##################################################

class ArticleQuery(object):
    base_query = {
        "track_total_hits" : True,
        "query" : {
            "bool" : {
                "must" : []
            }
        }
    }

    _issn_terms = { "terms" : {"index.issn.exact" : ["<list of issns here>"]} }
    _volume_term = { "term" : {"bibjson.journal.volume.exact" : "<volume here>"} }

    def __init__(self, issns=None, volume=None):
        self.issns = issns
        self.volume = volume

    def query(self):
        q = deepcopy(self.base_query)

        if self.issns is not None:
            iq = deepcopy(self._issn_terms)
            iq["terms"]["index.issn.exact"] = self.issns
            q["query"]["bool"]["must"].append(iq)

        if self.volume is not None:
            vq = deepcopy(self._volume_term)
            vq["term"]["bibjson.journal.volume.exact"] = self.volume
            q["query"]["bool"]["must"].append(vq)

        return q
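
# For reference, ArticleQuery(issns=["1234-5678"], volume="4").query() should
# produce an Elasticsearch body along these lines (a hedged reconstruction):
#
#   {
#       "track_total_hits": True,
#       "query": {"bool": {"must": [
#           {"terms": {"index.issn.exact": ["1234-5678"]}},
#           {"term": {"bibjson.journal.volume.exact": "4"}}
#       ]}}
#   }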

class ArticleVolumesQuery(object):
    base_query = {
        "track_total_hits": True,
        "query" : {
            "bool": {
                "filter": {
                    "terms" : {"index.issn.exact" : ["<list of issns here>"]}
                }
            }
        },
        "size" : 0,
        "aggs" : {
            "vols" : {
                "terms" : {
                    "field" : "bibjson.journal.volume.exact",
                    "order": {"_key" : "desc"},
                    "size" : 1000
                }
            }
        }
    }

    def __init__(self, issns=None):
        self.issns = issns

    def query(self):
        q = deepcopy(self.base_query)
        q["query"]["bool"]["filter"]["terms"]["index.issn.exact"] = self.issns
        return q

class ArticleVolumesIssuesQuery(object):
    base_query = {
        "track_total_hits": True,
        "query" : {
            "bool": {
                "filter": {
                    "bool": {
                        "must": [
                            {"terms" : {"index.issn.exact" : ["<list of issns here>"]}},
                            {"term" : {"bibjson.journal.volume.exact" : "<volume here>"}}
                        ]
                    }
                }
            }
        },
        "size" : 0,
        "aggs" : {
            "issues" : {
                "terms" : {
                    "field" : "bibjson.journal.number.exact",
936 "order": {"_key", "desc"},
937 "size" : 1000
938 }
939 }
940 }
941 }
943 def __init__(self, issns=None, volume=None):
944 self.issns = issns
945 self.volume = volume
947 def query(self):
948 q = deepcopy(self.base_query)
949 q["query"]["bool"]["filter"]["bool"]["must"][0]["terms"]["index.issn.exact"] = self.issns
950 q["query"]["bool"]["filter"]["bool"]["must"][1]["term"]["bibjson.journal.volume.exact"] = self.volume
951 return q

class DuplicateArticleQuery(object):
    base_query = {
        "track_total_hits" : True,
        "query": {
            "bool": {
                "must": []
            }
        },
        "sort": [{"last_updated": {"order": "desc"}}]
    }

    _should = {
        "should" : [],
        "minimum_should_match" : 2
    }

    _volume_term = {"term" : {"bibjson.journal.volume.exact" : "<volume>"}}
    _number_term = {"term" : {"bibjson.journal.number.exact" : "<issue number>"}}
    _start_term = {"term" : {"bibjson.start_page.exact" : "<start page>"}}
    _issn_terms = {"terms" : { "index.issn.exact" : ["<list of issns>"] }}
    _pubrec_term = {"term" : {"admin.publisher_record_id.exact" : "<publisher record id>"}}
    _identifier_term = {"term" : {"bibjson.identifier.id.exact" : "<issn here>"}}
    _doi_term = {"term" : {"index.doi.exact" : "<doi here>"}}
    _fulltext_terms = {"terms" : {"index.fulltext.exact" : ["<fulltext here>"]}}
    _url_terms = {"terms" : {"bibjson.link.url.exact" : ["<url here>"]}}  # used by the should_match branch below
    _fuzzy_title = {"fuzzy" : {"bibjson.title.exact" : "<title here>"}}

    def __init__(self, issns=None, publisher_record_id=None, doi=None, urls=None, title=None, volume=None, number=None, start=None, should_match=None, size=10):
        self.issns = issns if isinstance(issns, list) else []
        self.publisher_record_id = publisher_record_id
        self.doi = doi
        self.urls = urls if isinstance(urls, list) else [urls] if isinstance(urls, str) else []
        self.title = title
        self.volume = volume
        self.number = number
        self.start = start
        self.should_match = should_match
        self.size = size

    def query(self):
        # - MUST be from at least one of the ISSNs
        # - MUST have the publisher record id
        # - MUST have the doi unless should_match is set
        # - MUST have one of the fulltext urls unless should_match is set
        # - MUST fuzzy match the title
        # - SHOULD have <should_match> of: volume, issue, start page, fulltext url, doi

        q = deepcopy(self.base_query)
        if len(self.issns) > 0:
            it = deepcopy(self._issn_terms)
            it["terms"]["index.issn.exact"] = self.issns
            q["query"]["bool"]["must"].append(it)

        if self.publisher_record_id is not None:
            pr = deepcopy(self._pubrec_term)
            pr["term"]["admin.publisher_record_id.exact"] = self.publisher_record_id
            q["query"]["bool"]["must"].append(pr)

        if self.doi is not None and self.should_match is None:
            idt = deepcopy(self._doi_term)
            # idt["term"]["bibjson.identifier.id.exact"] = self.doi
            idt["term"]["index.doi.exact"] = self.doi
            q["query"]["bool"]["must"].append(idt)

        if len(self.urls) > 0 and self.should_match is None:
            uq = deepcopy(self._fulltext_terms)
            # uq["terms"]["bibjson.link.url.exact"] = self.urls
            uq["terms"]["index.fulltext.exact"] = self.urls
            q["query"]["bool"]["must"].append(uq)

        if self.title is not None:
            ft = deepcopy(self._fuzzy_title)
            ft["fuzzy"]["bibjson.title.exact"] = self.title
            q["query"]["bool"]["must"].append(ft)

        if self.should_match is not None:
            term_count = 0
            s = deepcopy(self._should)

            if self.volume is not None:
                term_count += 1
                vt = deepcopy(self._volume_term)
                vt["term"]["bibjson.journal.volume.exact"] = self.volume
                s["should"].append(vt)

            if self.number is not None:
                term_count += 1
                nt = deepcopy(self._number_term)
                nt["term"]["bibjson.journal.number.exact"] = self.number
                s["should"].append(nt)

            if self.start is not None:
                term_count += 1
                st = deepcopy(self._start_term)
                st["term"]["bibjson.start_page.exact"] = self.start
                s["should"].append(st)

            if len(self.urls) > 0:
                term_count += 1
                uq = deepcopy(self._url_terms)
                uq["terms"]["bibjson.link.url.exact"] = self.urls
                s["should"].append(uq)

            if self.doi is not None:
                term_count += 1
                idt = deepcopy(self._identifier_term)
                idt["term"]["bibjson.identifier.id.exact"] = self.doi
                s["should"].append(idt)

            msm = self.should_match
            if msm > term_count:
                msm = term_count
            s["minimum_should_match"] = msm

            q["query"]["bool"].update(s)

        # Allow more results than the default
        q["size"] = self.size

        return q
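
# A hedged example of the should_match branch: with should_match=2 and volume,
# number and start page supplied, the query gains a "should" clause containing
# three term matches and "minimum_should_match": 2, i.e. any two of
# volume/issue/start page (plus fulltext url and doi when present) are enough
# to flag a candidate duplicate.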

def _human_sort(things, reverse=True):
    numeric = []
    non_numeric = []
    nmap = {}
    for v in things:
        try:
            # try to convert v to an int
            vint = int(v)

            # remember the original string (it may have leading 0s)
            try:
                nmap[vint].append(v)
            except KeyError:
                nmap[vint] = [v]
            numeric.append(vint)
        except ValueError:
            non_numeric.append(v)

    numeric.sort(reverse=reverse)
    non_numeric.sort(reverse=reverse)

    # convert the integers back to their string representation
    return reduce(lambda x, y: x + y, [nmap[n] for n in numeric], []) + non_numeric

def _sort_articles(articles):
    # first extract the array we want to sort on
    # and make a map of that value to the issue itself
    unsorted = []
    numbers = []
    imap = {}
    for art in articles:
        sp = art.get("bibjson.start_page", [None])[0]

        # can't sort anything that doesn't have a start page
        if sp is None:
            unsorted.append(art)
            continue

        # deal with start page clashes and record the start pages to sort by
        if sp not in numbers:
            numbers.append(sp)
        if sp in imap:
            imap[sp].append(art)
        else:
            imap[sp] = [art]

    sorted_keys = _human_sort(numbers, reverse=False)

    s = []
    for n in sorted_keys:
        s += [x for x in imap[n]]
    s += [x for x in unsorted]

    return s
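
# Illustrative behaviour of _sort_articles (values invented): given articles
# whose "bibjson.start_page" values are ["100", "2", None], the result is
# ordered ["2", "100", <article with no start page>], since start pages are
# human-sorted ascending and unsortable articles are appended at the end.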