# Coverage for portality/models/article.py: 82% (744 statements)

import string

from unidecode import unidecode
from functools import reduce
from copy import deepcopy
from datetime import datetime

from portality import datasets, constants
from portality.core import app
from portality.dao import DomainObject
from portality.lib import es_data_mapping
from portality.lib.coerce import COERCE_MAP
from portality.lib.dates import FMT_DATETIME_STD
from portality.lib.seamless import SeamlessMixin
from portality.models import Journal
from portality.models.v1.bibjson import GenericBibJSON  # NOTE that article specifically uses the v1 BibJSON
from portality.models.v1 import shared_structs
from portality.models.v2.shared_structs import ARTICLE_STRUCT
from portality.lib import normalise, dates


class NoJournalException(Exception):
    pass


class NoValidOwnerException(Exception):
    pass


ARTICLE_BIBJSON_EXTENSION = {
    "objects" : ["bibjson"],
    "structs" : {
        "bibjson" : {
            "fields" : {
                "year" : {"coerce" : "unicode"},
                "month" : {"coerce" : "unicode"},
                "start_page" : {"coerce" : "unicode"},
                "end_page" : {"coerce" : "unicode"},
                "abstract" : {"coerce" : "unicode"}
            },
            "lists" : {
                "author" : {"contains" : "object"}
            },
            "objects" : [
                "journal"
            ],

            "structs" : {
                "author" : {
                    "fields" : {
                        "name" : {"coerce" : "unicode"},
                        "affiliation" : {"coerce" : "unicode"},
                        "email" : {"coerce" : "unicode"},
                        "orcid_id" : {"coerce" : "unicode"}
                    }
                },

                "journal" : {
                    "fields" : {
                        "volume" : {"coerce" : "unicode"},
                        "number" : {"coerce" : "unicode"},
                        "publisher" : {"coerce" : "unicode"},
                        "title" : {"coerce" : "unicode"},
                        "country" : {"coerce" : "unicode"}
                    },
                    "lists" : {
                        "language" : {"contains" : "field", "coerce" : "unicode"},
                        "issns" : {"contains" : "field", "coerce" : "unicode"}
                    }
                }
            }
        }
    }
}

MAPPING_OPTS = {
    "dynamic": None,
    "coerces": app.config["DATAOBJ_TO_MAPPING_DEFAULTS"],
    "exceptions": app.config["ARTICLE_EXCEPTION_MAPPING"],
    "additional_mappings": {}
}


class Article(SeamlessMixin, DomainObject):
    __type__ = "article"

    __SEAMLESS_STRUCT__ = [
        ARTICLE_STRUCT,
        shared_structs.SHARED_BIBJSON,
        ARTICLE_BIBJSON_EXTENSION
    ]

    __SEAMLESS_COERCE__ = COERCE_MAP

    def mappings(self):
        return es_data_mapping.create_mapping(self.__seamless_struct__.raw, MAPPING_OPTS)

    @classmethod
    def duplicates(cls, publisher_record_id=None, doi=None, fulltexts=None, title=None, volume=None, number=None, start=None, should_match=None, size=10):
        # some input sanitisation
        urls = fulltexts if isinstance(fulltexts, list) else [fulltexts] if isinstance(fulltexts, str) else []

        # make sure that we're dealing with the normal form of the identifiers
        norm_urls = []
        for url in urls:
            try:
                norm = normalise.normalise_url(url)
                norm_urls.append(norm)
            except ValueError:
                # use the non-normal form
                norm_urls.append(url)
        urls = norm_urls

        try:
            doi = normalise.normalise_doi(doi)
        except ValueError:
            # leave the doi as it is
            pass

        q = DuplicateArticleQuery(publisher_record_id=publisher_record_id,
                                  doi=doi,
                                  urls=urls,
                                  title=title,
                                  volume=volume,
                                  number=number,
                                  start=start,
                                  should_match=should_match,
                                  size=size)

        # res = cls.query(q=q.query())
        # return [cls(**hit.get("_source")) for hit in res.get("hits", {}).get("hits", [])]
        return cls.q2obj(q=q.query())

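    # A minimal usage sketch (hedged: the DOI and URL below are hypothetical,
    # not taken from this codebase). A single fulltext string is accepted and
    # normalised into a list before the query is built:
    #
    #     possible_dupes = Article.duplicates(
    #         doi="10.1234/example.doi",
    #         fulltexts="https://example.com/articles/1",
    #         size=5,
    #     )
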
    @classmethod
    def list_volumes(cls, issns):
        q = ArticleVolumesQuery(issns)
        result = cls.query(q=q.query())
        return _human_sort([t.get("key") for t in result.get("aggregations", {}).get("vols", {}).get("buckets", [])])

    @classmethod
    def list_volume_issues(cls, issns, volume):
        q = ArticleVolumesIssuesQuery(issns, volume)
        result = cls.query(q=q.query())
        return _human_sort([t.get("key") for t in result.get("aggregations", {}).get("issues", {}).get("buckets", [])])

    @classmethod
    def get_by_volume(cls, issns, volume):
        q = ArticleQuery(issns=issns, volume=volume)
        articles = cls.iterate(q.query(), page_size=1000)
        return articles

    @classmethod
    def find_by_issns(cls, issns):
        q = ArticleQuery(issns=issns)
        articles = cls.iterate(q.query(), page_size=1000)
        return articles

    @classmethod
    def count_by_issns(cls, issns, in_doaj=None):
        q = ArticleQuery(issns=issns, in_doaj=in_doaj)
        return cls.hit_count(q.query())

    @classmethod
    def delete_by_issns(cls, issns, snapshot=True):
        q = ArticleQuery(issns=issns)
        cls.delete_selected(query=q.query(), snapshot=snapshot)

    @classmethod
    def delete_selected(cls, query=None, owner=None, snapshot=True, tombstone=True):
        if owner is not None:
            from portality.models import Journal
            issns = Journal.issns_by_owner(owner)
            q = ArticleQuery(issns=issns)
            query = q.query()

        if snapshot or tombstone:
            articles = cls.iterate(query, page_size=1000)
            for article in articles:
                if snapshot:
                    article.snapshot()
                if tombstone:
                    article._tombstone()

        return cls.delete_by_query(query)

    def delete(self):
        self._tombstone()
        super(Article, self).delete()

    def bibjson(self, **kwargs):
        if "bibjson" not in self.data:
            self.data["bibjson"] = {}
        return ArticleBibJSON(self.data.get("bibjson"), **kwargs)

    def set_bibjson(self, bibjson):
        bibjson = bibjson.bibjson if isinstance(bibjson, ArticleBibJSON) else bibjson
        self.data["bibjson"] = bibjson

    def history(self):
        hs = self.data.get("history", [])
        tuples = []
        for h in hs:
            tuples.append((h.get("date"), ArticleBibJSON(h.get("bibjson"))))
        return tuples

    def snapshot(self):
        from portality.models import ArticleHistory

        snap = deepcopy(self.data)
        if "id" in snap:
            snap["about"] = snap["id"]
            del snap["id"]
        if "index" in snap:
            del snap["index"]
        if "last_updated" in snap:
            del snap["last_updated"]
        if "created_date" in snap:
            del snap["created_date"]

        hist = ArticleHistory(**snap)
        hist.save()
        return hist.id

    def _tombstone(self):
        stone = ArticleTombstone()
        stone.set_id(self.id)
        sbj = stone.bibjson()

        subs = self.bibjson().subjects()
        for s in subs:
            sbj.add_subject(s.get("scheme"), s.get("term"), s.get("code"))

        stone.save()
        return stone

    def add_history(self, bibjson, date=None):
        """Deprecated"""
        bibjson = bibjson.bibjson if isinstance(bibjson, ArticleBibJSON) else bibjson
        if date is None:
            date = dates.now_str()
        snobj = {"date": date, "bibjson": bibjson}
        if "history" not in self.data:
            self.data["history"] = []
        self.data["history"].append(snobj)

    def is_in_doaj(self):
        try:
            return self.data['admin'].get("in_doaj", False)
        except KeyError:
            # if we have no admin section, return None instead
            return None

    def set_in_doaj(self, value):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["in_doaj"] = value

    def publisher_record_id(self):
        return self.data.get("admin", {}).get("publisher_record_id")

    def set_publisher_record_id(self, pri):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["publisher_record_id"] = pri

    def upload_id(self):
        return self.data.get("admin", {}).get("upload_id")

    def set_upload_id(self, uid):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["upload_id"] = uid

    def get_normalised_doi(self):
        if self.data.get("index", {}).get("doi") is not None:
            return self.data["index"]["doi"]
        doi = self.bibjson().get_one_identifier(constants.IDENT_TYPE_DOI)
        if doi is None:
            return None
        try:
            return normalise.normalise_doi(doi)
        except ValueError:
            # can't be normalised, so we just return the doi as-is
            return doi

    def get_normalised_fulltext(self):
        if self.data.get("index", {}).get("fulltext") is not None:
            return self.data["index"]["fulltext"]
        fulltexts = self.bibjson().get_urls(constants.LINK_TYPE_FULLTEXT)
        if len(fulltexts) == 0:
            return None
        try:
            return normalise.normalise_url(fulltexts[0])
        except ValueError:
            # can't be normalised, so we just return the url as-is
            return fulltexts[0]

    def get_journal(self):
        """
        Get this article's associated journal
        :return: A Journal, or None if this is an orphan article
        """
        bibjson = self.bibjson()

        # first, get the ISSNs associated with the record
        pissns = bibjson.get_identifiers(bibjson.P_ISSN)
        eissns = bibjson.get_identifiers(bibjson.E_ISSN)
        allissns = list(set(pissns + eissns))

        # find a matching journal record from the index
        best_match = None
        for issn in allissns:
            journals = Journal.find_by_issn(issn)
            if len(journals) > 0:
                # Get the best journal match:
                # 1. Prefer the most recently updated journal that is in DOAJ.
                # 2. If none are in DOAJ, fall back to the most recently updated journal outside DOAJ.
                matches = [j for j in journals if j.is_in_doaj()]
                if len(matches) == 0:
                    matches = journals

                best_match = max(
                    matches,
                    key=lambda j: j.last_updated,
                    default=None
                )

        return best_match

    def get_associated_journals(self):
        # find all matching journal records from the index
        allissns = self.bibjson().issns()
        return Journal.find_by_issn(allissns)

    def add_journal_metadata(self, j=None, reg=None):
        """
        This function makes sure the article is populated
        with all the relevant info from its owning parent object
        :param j: Pass in a Journal to bypass the (slow) locating step. MAKE SURE IT'S THE RIGHT ONE!
        :param reg: A Journal used as a register of exactly which metadata was copied
        """
        # record the data that is copied into the article into the "reg"ister, in case the
        # caller needs to know exactly and only which information was copied
        if reg is None:
            reg = Journal()
        rbj = reg.bibjson()

        if j is None:
            journal = self.get_journal()
        else:
            journal = j

        # we were unable to find a journal
        if journal is None:
            raise NoJournalException("Unable to find a journal associated with this article")

        # if we get to here, we have a journal record we want to pull data from
        jbib = journal.bibjson()
        bibjson = self.bibjson()

        # tripwire to be tripped if the journal makes changes to the article
        trip = False

        if bibjson.subjects() != jbib.subjects():
            trip = True
            bibjson.set_subjects(jbib.subjects())
            rbj.set_subjects(jbib.subjects())

        if jbib.title is not None:
            if bibjson.journal_title != jbib.title:
                trip = True
                bibjson.journal_title = jbib.title
                rbj.title = jbib.title

        if len(jbib.language) > 0:
            jlang = jbib.language
            alang = bibjson.journal_language
            jlang.sort()
            alang.sort()
            if jlang != alang:
                bibjson.journal_language = jbib.language
                trip = True
                rbj.set_language(jbib.language)

        if jbib.country is not None:
            if jbib.country != bibjson.journal_country:
                bibjson.journal_country = jbib.country
                trip = True
                rbj.country = jbib.country

        if jbib.publisher:
            if jbib.publisher != bibjson.publisher:
                bibjson.publisher = jbib.publisher
                trip = True
                rbj.publisher = jbib.publisher

        # copy the in_doaj status and the journal's ISSNs
        if journal.is_in_doaj() != self.is_in_doaj():
            self.set_in_doaj(journal.is_in_doaj())
            trip = True
            reg.set_in_doaj(journal.is_in_doaj())

        try:
            aissns = bibjson.journal_issns
            jissns = jbib.issns()
            aissns.sort()
            jissns.sort()
            if aissns != jissns:
                bibjson.journal_issns = jbib.issns()
                trip = True

            eissns = jbib.get_identifiers(jbib.E_ISSN)
            pissns = jbib.get_identifiers(jbib.P_ISSN)
            if eissns is not None and len(eissns) > 0:
                rbj.add_identifier(rbj.E_ISSN, eissns[0])
            if pissns is not None and len(pissns) > 0:
                rbj.add_identifier(rbj.P_ISSN, pissns[0])
        except KeyError:
            # no issns, don't worry about it for now
            pass

        return trip

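    # A hedged usage sketch: the boolean "tripwire" returned above signals
    # whether the journal actually changed the article, so callers can avoid
    # unnecessary re-saves (the article and journal objects are assumed):
    #
    #     reg = Journal()
    #     if article.add_journal_metadata(j=journal, reg=reg):
    #         article.save()    # only persist if something was copied across
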
    def merge(self, old, take_id=True):
        # this takes an old version of the article and brings
        # forward any useful information that is needed. The rules of merge are:
        # - ignore "index" (it gets regenerated on save)
        # - always take the "created_date"
        # - take any top level field that does not exist in the current item (esp "id" and "history")
        # - in "admin", copy any field that does not already exist

        # first thing to do is create a snapshot of the old record
        old.snapshot()

        # now go on and do the merge

        # always take the created date
        self.set_created(old.created_date)

        # take the id
        if self.id is None or take_id:
            self.set_id(old.id)

        # take the history (deprecated)
        if len(self.data.get("history", [])) == 0:
            self.data["history"] = deepcopy(old.data.get("history", []))

        # take the bibjson
        if "bibjson" not in self.data:
            self.set_bibjson(deepcopy(old.bibjson()))

        # take the admin if there isn't one
        if "admin" not in self.data:
            self.data["admin"] = deepcopy(old.data.get("admin", {}))
        else:
            # otherwise, copy any admin keys that don't exist on the current item
            oa = old.data.get("admin", {})
            for key in oa:
                if key not in self.data["admin"]:
                    self.data["admin"][key] = deepcopy(oa[key])

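    # A hedged sketch of the merge contract (both records assumed to exist):
    # the current record adopts the old record's id and created_date, and the
    # old record's state is preserved first as an ArticleHistory snapshot.
    #
    #     new_article.merge(old_article)              # take_id=True by default
    #     assert new_article.id == old_article.id
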
    def _generate_index(self):
        # the index fields we are going to generate
        issns = []
        subjects = []
        schema_subjects = []
        schema_codes = []
        schema_codes_tree = []
        classification = []
        langs = []
        country = None
        publisher = []
        classification_paths = []
        unpunctitle = None
        asciiunpunctitle = None
        doi = None
        fulltext = None

        # the places we're going to get those fields from
        cbib = self.bibjson()
        jindex = self.data.get('index', {})
        hist = self.history()

        # get the issns out of the current bibjson
        issns += cbib.get_identifiers(cbib.P_ISSN)
        issns += cbib.get_identifiers(cbib.E_ISSN)

        # get the issns from the journal bibjson
        if isinstance(cbib.journal_issns, list):
            issns += cbib.journal_issns

        # de-duplicate the issns
        issns = list(set(issns))

        # now get the issns out of the historic records
        for date, hbib in hist:
            issns += hbib.get_identifiers(hbib.P_ISSN)
            issns += hbib.get_identifiers(hbib.E_ISSN)

        # get the subjects and concatenate them with their schemes from the current bibjson
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            subjects.append(term)
            schema_subjects.append(scheme + ":" + term)
            classification.append(term)
            if "code" in subs:
                schema_codes.append(scheme + ":" + subs.get("code"))

        # copy the languages
        if len(cbib.journal_language) > 0:
            langs = [datasets.name_for_lang(l) for l in cbib.journal_language]

        # get the country name from the bibjson country code
        if cbib.journal_country:
            country = datasets.get_country_name(cbib.journal_country)

        # copy the publisher/provider
        if cbib.publisher:
            publisher.append(cbib.publisher)

        # deduplicate the lists
        issns = list(set(issns))
        subjects = list(set(subjects))
        schema_subjects = list(set(schema_subjects))
        classification = list(set(classification))
        publisher = list(set(publisher))
        langs = list(set(langs))
        schema_codes = list(set(schema_codes))

        # work out what the date of publication is
        date = cbib.get_publication_date()

        # calculate the classification paths
        from portality.lcc import lcc  # inline import since this hits the database
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            if scheme == "LCC":
                path = lcc.pathify(term)
                if path is not None:
                    classification_paths.append(path)

        # normalise the classification paths, so we only store the longest ones
        classification_paths = lcc.longest(classification_paths)
        schema_codes_tree = cbib.lcc_codes_full_list()

        # create an unpunctitle
        if cbib.title is not None:
            throwlist = string.punctuation + '\n\t'
            unpunctitle = "".join(c for c in cbib.title if c not in throwlist).strip()
            try:
                asciiunpunctitle = unidecode(unpunctitle)
            except Exception:
                asciiunpunctitle = unpunctitle

        # create a normalised version of the DOI for deduplication
        source_doi = cbib.get_one_identifier(constants.IDENT_TYPE_DOI)
        try:
            doi = normalise.normalise_doi(source_doi)
        except ValueError:
            # if we can't normalise the DOI, just store it cast to lower case
            doi = source_doi.lower() if source_doi is not None else None

        # create a normalised version of the fulltext URL for deduplication
        fulltexts = cbib.get_urls(constants.LINK_TYPE_FULLTEXT)
        if len(fulltexts) > 0:
            source_fulltext = fulltexts[0]
            try:
                fulltext = normalise.normalise_url(source_fulltext)
            except ValueError:
                # if we can't normalise the fulltext, store it as-is
                fulltext = source_fulltext

        # build the index part of the object
        self.data["index"] = {}
        if len(issns) > 0:
            self.data["index"]["issn"] = issns
        if date != "":
            self.data["index"]["date"] = date
            self.data["index"]["date_toc_fv_month"] = date  # duplicated so we can have year/month facets in fv2
        if len(subjects) > 0:
            self.data["index"]["subject"] = subjects
        if len(schema_subjects) > 0:
            self.data["index"]["schema_subject"] = schema_subjects
        if len(classification) > 0:
            self.data["index"]["classification"] = classification
        if len(publisher) > 0:
            self.data["index"]["publisher"] = publisher
        if len(langs) > 0:
            self.data["index"]["language"] = langs
        if country is not None:
            self.data["index"]["country"] = country
        if len(schema_codes) > 0:
            self.data["index"]["schema_code"] = schema_codes
        if len(classification_paths) > 0:
            self.data["index"]["classification_paths"] = classification_paths
        if unpunctitle is not None:
            self.data["index"]["unpunctitle"] = unpunctitle
        if asciiunpunctitle is not None:
            self.data["index"]["asciiunpunctitle"] = asciiunpunctitle
        if doi is not None:
            self.data["index"]["doi"] = doi
        if fulltext is not None:
            self.data["index"]["fulltext"] = fulltext
        if len(schema_codes_tree) > 0:
            self.data["index"]["schema_codes_tree"] = schema_codes_tree

    def prep(self):
        self._generate_index()
        self.data['last_updated'] = dates.now_str()

    def save(self, *args, **kwargs):
        self._generate_index()
        return super(Article, self).save(*args, **kwargs)

    def get_owner(self):
        b = self.bibjson()
        article_issns = b.get_identifiers(b.P_ISSN)
        article_issns += b.get_identifiers(b.E_ISSN)
        owners = []

        seen_journal_issns = {}
        for issn in article_issns:
            journals = Journal.find_by_issn(issn)
            if journals is not None and len(journals) > 0:
                for j in journals:
                    owners.append(j.owner)
                    if j.owner not in seen_journal_issns:
                        seen_journal_issns[j.owner] = []
                    seen_journal_issns[j.owner] += j.bibjson().issns()

        # deduplicate the list of owners
        owners = list(set(owners))

        # no owner means we can't confirm
        if len(owners) == 0:
            raise NoValidOwnerException

        # multiple owners means ownership of this article is confused
        if len(owners) > 1:
            raise NoValidOwnerException

        return owners[0]

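    # A hedged usage sketch: get_owner() raises NoValidOwnerException both when
    # no owning journal is found and when ownership is ambiguous, so callers
    # should treat the exception as "cannot attribute this article":
    #
    #     try:
    #         owner = article.get_owner()
    #     except NoValidOwnerException:
    #         owner = None

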
class ArticleTombstone(Article):
    __type__ = "article_tombstone"

    def snapshot(self):
        return None

    def is_in_doaj(self):
        return False

    def prep(self):
        self.data['last_updated'] = dates.now_str()

    def save(self, *args, **kwargs):
        return super(ArticleTombstone, self).save(*args, **kwargs)


class ArticleBibJSON(GenericBibJSON):

    def __init__(self, bibjson=None, **kwargs):
        self._add_struct(shared_structs.SHARED_BIBJSON.get("structs", {}).get("bibjson"))
        self._add_struct(ARTICLE_BIBJSON_EXTENSION.get("structs", {}).get("bibjson"))
        super(ArticleBibJSON, self).__init__(bibjson, **kwargs)

    # article-specific simple getters and setters
    @property
    def year(self):
        return self._get_single("year")

    @year.setter
    def year(self, val):
        self._set_with_struct("year", val)

    @year.deleter
    def year(self):
        self._delete("year")

    @property
    def month(self):
        return self._get_single("month")

    @month.setter
    def month(self, val):
        self._set_with_struct("month", val)

    @month.deleter
    def month(self):
        self._delete("month")

    @property
    def start_page(self):
        return self._get_single("start_page")

    @start_page.setter
    def start_page(self, val):
        self._set_with_struct("start_page", val)

    @property
    def end_page(self):
        return self._get_single("end_page")

    @end_page.setter
    def end_page(self, val):
        self._set_with_struct("end_page", val)

    @property
    def abstract(self):
        return self._get_single("abstract")

    @abstract.setter
    def abstract(self, val):
        self._set_with_struct("abstract", val)

    # article-specific complex part getters and setters

    @property
    def volume(self):
        return self._get_single("journal.volume")

    @volume.setter
    def volume(self, value):
        self._set_with_struct("journal.volume", value)

    @property
    def number(self):
        return self._get_single("journal.number")

    @number.setter
    def number(self, value):
        self._set_with_struct("journal.number", value)

    @property
    def journal_title(self):
        return self._get_single("journal.title")

    @journal_title.setter
    def journal_title(self, title):
        self._set_with_struct("journal.title", title)

    @property
    def journal_language(self):
        return self._get_list("journal.language")

    @journal_language.setter
    def journal_language(self, lang):
        self._set_with_struct("journal.language", lang)

    @property
    def journal_country(self):
        return self._get_single("journal.country")

    @journal_country.setter
    def journal_country(self, country):
        self._set_single("journal.country", country)

    @property
    def journal_issns(self):
        return self._get_list("journal.issns")

    @journal_issns.setter
    def journal_issns(self, issns):
        self._set_with_struct("journal.issns", issns)

    @property
    def publisher(self):
        return self._get_single("journal.publisher")

    @publisher.setter
    def publisher(self, value):
        self._set_with_struct("journal.publisher", value)

    def add_author(self, name, affiliation=None, orcid_id=None):
        aobj = {"name": name}
        if affiliation is not None:
            aobj["affiliation"] = affiliation
        if orcid_id is not None:
            aobj["orcid_id"] = orcid_id
        self._add_to_list_with_struct("author", aobj)

    @property
    def author(self):
        return self._get_list("author")

    @author.setter
    def author(self, authors):
        self._set_with_struct("author", authors)

    def get_publication_date(self, date_format=FMT_DATETIME_STD):
        # work out what the date of publication is
        date = ""
        if self.year is not None:
            if type(self.year) is str:  # it should be, if the mappings are correct, but len() needs a sequence
                # fix 2 digit years
                if len(self.year) == 2:
                    try:
                        intyear = int(self.year)
                    except ValueError:
                        # if it's 2 chars long and the 2 chars don't make an integer,
                        # forget it
                        return date

                    # in the case of truncated years, assume it's this century if before the current year
                    if intyear <= int(str(dates.now().year)[-2:]):
                        self.year = "20" + self.year    # For readability over long-lasting code, I have refrained
                    else:                               # from using str(dates.now().year)[:2] here.
                        self.year = "19" + self.year    # But don't come crying to me 90-ish years from now.

                # if we still don't have a 4 digit year, forget it
                if len(self.year) != 4:
                    return date

            # build up our proposed datestamp
            date += str(self.year)
            if self.month is not None:
                try:
                    if type(self.month) is int:
                        if 1 <= int(self.month) <= 12:
                            month_number = self.month
                        else:
                            month_number = 1
                    elif len(self.month) <= 2:
                        if 1 <= int(self.month) <= 12:
                            month_number = self.month
                        else:
                            month_number = '1'
                    elif len(self.month) == 3:  # 'May' works with either case, obvz.
                        month_number = datetime.strptime(self.month, '%b').month
                    else:
                        month_number = datetime.strptime(self.month, '%B').month

                    # pad the month number to two digits; this accepts int or string
                    date += '-{:0>2}'.format(month_number)
                except Exception:
                    # if something goes wrong, just assume it's January
                    date += "-01"
            else:
                date += "-01"
            date += "-01T00:00:00Z"

        # attempt to confirm the format of our datestamp
        try:
            datecheck = dates.parse(date)
            date = datecheck.strftime(date_format)
        except Exception:
            return ""

        return date

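    # A hedged worked example (assuming FMT_DATETIME_STD is the standard
    # "%Y-%m-%dT%H:%M:%SZ" pattern and the current year is in the 2000s):
    #
    #     bj = ArticleBibJSON()
    #     bj.year, bj.month = "99", "May"
    #     bj.get_publication_date()    # -> "1999-05-01T00:00:00Z"; "99" is above
    #                                  #    the current 2-digit year, so 19xx
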
    def remove_journal_metadata(self):
        self._delete("journal")

    def vancouver_citation(self):
        jtitle = self.journal_title
        year = self.year
        vol = self.volume
        iss = self.number
        start = self.start_page
        end = self.end_page

        citation = ""

        if year:
            citation += year + ";"

        if vol:
            citation += vol

        if iss:
            citation += "(" + iss + ")"

        if start or end:
            if citation != "":
                citation += ":"
            if start:
                citation += start
            if end:
                if start:
                    citation += "-"
                citation += end

        return jtitle.strip(), citation

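    # A hedged worked example of the citation assembly (values hypothetical):
    # year "2020", volume "5", issue "2" and pages "100"-"110" yield the tuple
    # ("Example Journal", "2020;5(2):100-110"); the journal title is returned
    # separately from the numeric citation string.
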
    def lcc_codes_full_list(self):
        full_list = set()

        from portality.lcc import lcc  # inline import since this hits the database
        for subs in self.subjects():
            scheme = subs.get("scheme")
            if scheme != "LCC":
                continue
            code = subs.get("code")
            expanded = lcc.expand_codes(code)
            full_list.update(expanded)

        return ["LCC:" + x for x in full_list if x is not None]


##################################################

class ArticleQuery(object):
    base_query = {
        "track_total_hits" : True,
        "query" : {
            "bool" : {
                "must" : []
            }
        }
    }

    _issn_terms = { "terms" : {"index.issn.exact" : ["<list of issns here>"]} }
    _volume_term = { "term" : {"bibjson.journal.volume.exact" : "<volume here>"} }

    def __init__(self, issns=None, volume=None, in_doaj=None):
        self.issns = issns
        self.volume = volume
        self.in_doaj = in_doaj

    def query(self):
        q = deepcopy(self.base_query)

        if self.issns is not None:
            iq = deepcopy(self._issn_terms)
            iq["terms"]["index.issn.exact"] = self.issns
            q["query"]["bool"]["must"].append(iq)

        if self.volume is not None:
            vq = deepcopy(self._volume_term)
            vq["term"]["bibjson.journal.volume.exact"] = self.volume
            q["query"]["bool"]["must"].append(vq)

        if self.in_doaj is not None:
            q["query"]["bool"]["must"].append({"term": {"admin.in_doaj": self.in_doaj}})

        return q

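# A hedged sketch of the query this class emits (ISSN hypothetical):
#
#     ArticleQuery(issns=["1234-5678"], in_doaj=True).query()
#     # -> {"track_total_hits": True,
#     #     "query": {"bool": {"must": [
#     #         {"terms": {"index.issn.exact": ["1234-5678"]}},
#     #         {"term": {"admin.in_doaj": True}}
#     #     ]}}}

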
class ArticleVolumesQuery(object):
    base_query = {
        "track_total_hits": True,
        "query" : {
            "bool": {
                "filter": {
                    "terms" : {"index.issn.exact" : ["<list of issns here>"]}
                }
            }
        },
        "size" : 0,
        "aggs" : {
            "vols" : {
                "terms" : {
                    "field" : "bibjson.journal.volume.exact",
                    "order": {"_key" : "desc"},
                    "size" : 1000
                }
            }
        }
    }

    def __init__(self, issns=None):
        self.issns = issns

    def query(self):
        q = deepcopy(self.base_query)
        q["query"]["bool"]["filter"]["terms"]["index.issn.exact"] = self.issns
        return q


class ArticleVolumesIssuesQuery(object):
    base_query = {
        "track_total_hits": True,
        "query" : {
            "bool": {
                "filter": {
                    "bool": {
                        "must": [
                            {"terms" : {"index.issn.exact" : ["<list of issns here>"]}},
                            {"term" : {"bibjson.journal.volume.exact" : "<volume here>"}}
                        ]
                    }
                }
            }
        },
        "size" : 0,
        "aggs" : {
            "issues" : {
                "terms" : {
                    "field" : "bibjson.journal.number.exact",
                    "order": {"_key" : "desc"},
                    "size" : 1000
                }
            }
        }
    }

    def __init__(self, issns=None, volume=None):
        self.issns = issns
        self.volume = volume

    def query(self):
        q = deepcopy(self.base_query)
        q["query"]["bool"]["filter"]["bool"]["must"][0]["terms"]["index.issn.exact"] = self.issns
        q["query"]["bool"]["filter"]["bool"]["must"][1]["term"]["bibjson.journal.volume.exact"] = self.volume
        return q


class DuplicateArticleQuery(object):
    base_query = {
        "track_total_hits" : True,
        "query": {
            "bool": {
                "must": []
            }
        },
        "sort": [{"last_updated": {"order": "desc"}}]
    }

    _should = {
        "should" : [],
        "minimum_should_match" : 2
    }

    _volume_term = {"term" : {"bibjson.journal.volume.exact" : "<volume>"}}
    _number_term = {"term" : {"bibjson.journal.number.exact" : "<issue number>"}}
    _start_term = {"term" : {"bibjson.start_page.exact" : "<start page>"}}
    _issn_terms = {"terms" : { "index.issn.exact" : ["<list of issns>"] }}
    _pubrec_term = {"term" : {"admin.publisher_record_id.exact" : "<publisher record id>"}}
    _identifier_term = {"term" : {"bibjson.identifier.id.exact" : "<issn here>"}}
    _doi_term = {"term" : {"index.doi.exact" : "<doi here>"}}
    _fulltext_terms = {"terms" : {"index.fulltext.exact" : ["<fulltext here>"]}}
    _url_terms = {"terms" : {"bibjson.link.url.exact" : ["<url here>"]}}
    _fuzzy_title = {"fuzzy" : {"bibjson.title.exact" : "<title here>"}}

    def __init__(self, issns=None, publisher_record_id=None, doi=None, urls=None, title=None, volume=None, number=None, start=None, should_match=None, size=10):
        self.issns = issns if isinstance(issns, list) else []
        self.publisher_record_id = publisher_record_id
        self.doi = doi
        self.urls = urls if isinstance(urls, list) else [urls] if isinstance(urls, str) else []
        self.title = title
        self.volume = volume
        self.number = number
        self.start = start
        self.should_match = should_match
        self.size = size

    def query(self):
        # - MUST be from at least one of the ISSNs
        # - MUST have the publisher record id
        # - MUST have the doi unless should_match is set
        # - MUST have one of the fulltext urls unless should_match is set
        # - MUST fuzzy match the title
        # - SHOULD have <should_match> of: volume, issue, start page, fulltext url, doi

        q = deepcopy(self.base_query)
        if len(self.issns) > 0:
            it = deepcopy(self._issn_terms)
            it["terms"]["index.issn.exact"] = self.issns
            q["query"]["bool"]["must"].append(it)

        if self.publisher_record_id is not None:
            pr = deepcopy(self._pubrec_term)
            pr["term"]["admin.publisher_record_id.exact"] = self.publisher_record_id
            q["query"]["bool"]["must"].append(pr)

        if self.doi is not None and self.should_match is None:
            idt = deepcopy(self._doi_term)
            # idt["term"]["bibjson.identifier.id.exact"] = self.doi
            idt["term"]["index.doi.exact"] = self.doi
            q["query"]["bool"]["must"].append(idt)

        if len(self.urls) > 0 and self.should_match is None:
            uq = deepcopy(self._fulltext_terms)
            # uq["terms"]["bibjson.link.url.exact"] = self.urls
            uq["terms"]["index.fulltext.exact"] = self.urls
            q["query"]["bool"]["must"].append(uq)

        if self.title is not None:
            ft = deepcopy(self._fuzzy_title)
            ft["fuzzy"]["bibjson.title.exact"] = self.title
            q["query"]["bool"]["must"].append(ft)

        if self.should_match is not None:
            term_count = 0
            s = deepcopy(self._should)

            if self.volume is not None:
                term_count += 1
                vt = deepcopy(self._volume_term)
                vt["term"]["bibjson.journal.volume.exact"] = self.volume
                s["should"].append(vt)

            if self.number is not None:
                term_count += 1
                nt = deepcopy(self._number_term)
                nt["term"]["bibjson.journal.number.exact"] = self.number
                s["should"].append(nt)

            if self.start is not None:
                term_count += 1
                st = deepcopy(self._start_term)
                st["term"]["bibjson.start_page.exact"] = self.start
                s["should"].append(st)

            if len(self.urls) > 0:
                term_count += 1
                uq = deepcopy(self._url_terms)
                uq["terms"]["bibjson.link.url.exact"] = self.urls
                s["should"].append(uq)

            if self.doi is not None:
                term_count += 1
                idt = deepcopy(self._identifier_term)
                idt["term"]["bibjson.identifier.id.exact"] = self.doi
                s["should"].append(idt)

            msm = self.should_match
            if msm > term_count:
                msm = term_count
            s["minimum_should_match"] = msm

            q["query"]["bool"].update(s)

        # allow more results than the default
        q["size"] = self.size

        return q

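    # A hedged sketch of the should-clause clamping (values hypothetical): with
    # should_match=2 but only a volume supplied, minimum_should_match is capped
    # at the single clause that was actually built:
    #
    #     q = DuplicateArticleQuery(issns=["1234-5678"], volume="5",
    #                               should_match=2).query()
    #     # q["query"]["bool"]["minimum_should_match"] == 1

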
def _human_sort(things, reverse=True):
    numeric = []
    non_numeric = []
    nmap = {}
    for v in things:
        try:
            # try to convert v to an int
            vint = int(v)

            # remember the original string (it may have leading 0s)
            try:
                nmap[vint].append(v)
            except KeyError:
                nmap[vint] = [v]
            numeric.append(vint)
        except (ValueError, TypeError):
            non_numeric.append(v)

    numeric.sort(reverse=reverse)
    non_numeric.sort(reverse=reverse)

    # convert the integers back to their string representation
    return reduce(lambda x, y: x + y, [nmap[n] for n in numeric], []) + non_numeric

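# A hedged worked example: numeric strings sort as integers (descending by
# default) ahead of the lexically sorted non-numeric values:
#
#     _human_sort(["10", "2", "Supplement", "1"])
#     # -> ["10", "2", "1", "Supplement"]

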
def _sort_articles(articles):
    # first extract the array we want to sort on
    # and make a map of that value to the article itself
    unsorted = []
    numbers = []
    imap = {}
    for art in articles:
        sp = art.get("bibjson.start_page", [None])[0]

        # can't sort anything that doesn't have a start page
        if sp is None:
            unsorted.append(art)
            continue

        # deal with start page clashes and record the start pages
        # to sort by
        if sp not in numbers:
            numbers.append(sp)
        if sp in imap:
            imap[sp].append(art)
        else:
            imap[sp] = [art]

    sorted_keys = _human_sort(numbers, reverse=False)

    s = []
    for n in sorted_keys:
        s += [x for x in imap[n]]
    s += [x for x in unsorted]

    return s

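# A hedged sketch of the expected input shape: each item is assumed to be a
# dict-like ES hit exposing a "bibjson.start_page" field list, e.g.
#
#     hits = [{"bibjson.start_page": ["10"]}, {"bibjson.start_page": ["2"]}, {}]
#     _sort_articles(hits)    # -> the "2" hit, the "10" hit, then the unpaged hit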