Coverage for portality/models/article.py: 81%
716 statements
coverage.py v6.4.2, created at 2022-11-09 16:22 +0000

import string
import warnings

from unidecode import unidecode
from functools import reduce
from copy import deepcopy
from datetime import datetime

from portality import datasets, constants
from portality.dao import DomainObject
from portality.models import Journal
from portality.models.v1.bibjson import GenericBibJSON  # NOTE that article specifically uses the v1 BibJSON
from portality.models.v1 import shared_structs
from portality.lib import normalise

class NoJournalException(Exception):
    pass


class NoValidOwnerException(Exception):
    pass

class Article(DomainObject):
    __type__ = "article"

    @classmethod
    def duplicates(cls, publisher_record_id=None, doi=None, fulltexts=None, title=None, volume=None, number=None, start=None, should_match=None, size=10):
        # some input sanitisation
        urls = fulltexts if isinstance(fulltexts, list) else [fulltexts] if isinstance(fulltexts, str) else []

        # make sure that we're dealing with the normal form of the identifiers
        norm_urls = []
        for url in urls:
            try:
                norm = normalise.normalise_url(url)
                norm_urls.append(norm)
            except ValueError:
                # use the non-normal form
                norm_urls.append(url)
        urls = norm_urls

        try:
            doi = normalise.normalise_doi(doi)
        except ValueError:
            # leave the doi as it is
            pass

        q = DuplicateArticleQuery(publisher_record_id=publisher_record_id,
                                  doi=doi,
                                  urls=urls,
                                  title=title,
                                  volume=volume,
                                  number=number,
                                  start=start,
                                  should_match=should_match,
                                  size=size)

        # res = cls.query(q=q.query())
        # return [cls(**hit.get("_source")) for hit in res.get("hits", {}).get("hits", [])]
        return cls.q2obj(q=q.query())
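
    # A minimal usage sketch for duplicates() (illustrative only; the DOI, URL
    # and title values below are invented):
    #
    #   candidates = Article.duplicates(doi="10.1234/example.doi",
    #                                   fulltexts=["http://example.com/article/1"],
    #                                   title="An Example Article Title",
    #                                   size=5)
    #   for candidate in candidates:
    #       print(candidate.id)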

    @classmethod
    def list_volumes(cls, issns):
        q = ArticleVolumesQuery(issns)
        result = cls.query(q=q.query())
        return _human_sort([t.get("key") for t in result.get("aggregations", {}).get("vols", {}).get("buckets", [])])

    @classmethod
    def list_volume_issues(cls, issns, volume):
        q = ArticleVolumesIssuesQuery(issns, volume)
        result = cls.query(q=q.query())
        return _human_sort([t.get("key") for t in result.get("aggregations", {}).get("issues", {}).get("buckets", [])])

    @classmethod
    def get_by_volume(cls, issns, volume):
        q = ArticleQuery(issns=issns, volume=volume)
        articles = cls.iterate(q.query(), page_size=1000)
        return articles

    @classmethod
    def find_by_issns(cls, issns):
        q = ArticleQuery(issns=issns)
        articles = cls.iterate(q.query(), page_size=1000)
        return articles

    @classmethod
    def count_by_issns(cls, issns):
        q = ArticleQuery(issns=issns)
        return cls.hit_count(q.query())

    @classmethod
    def delete_by_issns(cls, issns, snapshot=True):
        q = ArticleQuery(issns=issns)
        cls.delete_selected(query=q.query(), snapshot=snapshot)

    @classmethod
    def delete_selected(cls, query=None, owner=None, snapshot=True):
        if owner is not None:
            from portality.models import Journal
            issns = Journal.issns_by_owner(owner)
            q = ArticleQuery(issns=issns)
            query = q.query()

        if snapshot:
            articles = cls.iterate(query, page_size=1000)
            for article in articles:
                article.snapshot()
        return cls.delete_by_query(query)

    def bibjson(self, **kwargs):
        if "bibjson" not in self.data:
            self.data["bibjson"] = {}
        return ArticleBibJSON(self.data.get("bibjson"), **kwargs)

    def set_bibjson(self, bibjson):
        bibjson = bibjson.bibjson if isinstance(bibjson, ArticleBibJSON) else bibjson
        self.data["bibjson"] = bibjson

    def history(self):
        hs = self.data.get("history", [])
        tuples = []
        for h in hs:
            tuples.append((h.get("date"), ArticleBibJSON(h.get("bibjson"))))
        return tuples

    def snapshot(self):
        from portality.models import ArticleHistory

        snap = deepcopy(self.data)
        if "id" in snap:
            snap["about"] = snap["id"]
            del snap["id"]
        if "index" in snap:
            del snap["index"]
        if "last_updated" in snap:
            del snap["last_updated"]
        if "created_date" in snap:
            del snap["created_date"]

        hist = ArticleHistory(**snap)
        hist.save()
        return hist.id

    def add_history(self, bibjson, date=None):
        """Deprecated"""
        bibjson = bibjson.bibjson if isinstance(bibjson, ArticleBibJSON) else bibjson
        if date is None:
            date = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
        snobj = {"date": date, "bibjson": bibjson}
        if "history" not in self.data:
            self.data["history"] = []
        self.data["history"].append(snobj)

    def is_in_doaj(self):
        return self.data.get("admin", {}).get("in_doaj", False)

    def set_in_doaj(self, value):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["in_doaj"] = value

    def has_seal(self):
        return self.data.get("admin", {}).get("seal", False)

    def set_seal(self, value):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["seal"] = value

    def publisher_record_id(self):
        return self.data.get("admin", {}).get("publisher_record_id")

    def set_publisher_record_id(self, pri):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["publisher_record_id"] = pri

    def upload_id(self):
        return self.data.get("admin", {}).get("upload_id")

    def set_upload_id(self, uid):
        if "admin" not in self.data:
            self.data["admin"] = {}
        self.data["admin"]["upload_id"] = uid

    def get_normalised_doi(self):
        if self.data.get("index", {}).get("doi") is not None:
            return self.data["index"]["doi"]
        doi = self.bibjson().get_one_identifier(constants.IDENT_TYPE_DOI)
        if doi is None:
            return None
        try:
            return normalise.normalise_doi(doi)
        except ValueError:
            # can't be normalised, so we just return the doi as-is
            return doi

    def get_normalised_fulltext(self):
        if self.data.get("index", {}).get("fulltext") is not None:
            return self.data["index"]["fulltext"]
        fulltexts = self.bibjson().get_urls(constants.LINK_TYPE_FULLTEXT)
        if len(fulltexts) == 0:
            return None
        try:
            return normalise.normalise_url(fulltexts[0])
        except ValueError:
            # can't be normalised, so we just return the url as-is
            return fulltexts[0]

    def get_journal(self):
        """
        Get this article's associated journal
        :return: A Journal, or None if this is an orphan article
        """
        bibjson = self.bibjson()

        # first, get the ISSNs associated with the record
        pissns = bibjson.get_identifiers(bibjson.P_ISSN)
        eissns = bibjson.get_identifiers(bibjson.E_ISSN)
        allissns = list(set(pissns + eissns))

        # find a matching journal record from the index
        journal = None
        for issn in allissns:
            journals = Journal.find_by_issn(issn)
            if len(journals) > 0:
                # there should only ever be one, so take the first one
                journal = journals[0]
                break

        return journal

    def get_associated_journals(self):
        # find all matching journal records from the index
        allissns = self.bibjson().issns()
        return Journal.find_by_issn(allissns)

    def add_journal_metadata(self, j=None, reg=None):
        """
        this function makes sure the article is populated
        with all the relevant info from its owning parent object
        :param j: Pass in a Journal to bypass the (slow) locating step. MAKE SURE IT'S THE RIGHT ONE!
        """

        # Record the data that is copied into the article into the "reg"ister, in case the
        # caller needs to know exactly and only which information was copied
        if reg is None:
            reg = Journal()
        rbj = reg.bibjson()

        if j is None:
            journal = self.get_journal()
        else:
            journal = j

        # we were unable to find a journal
        if journal is None:
            raise NoJournalException("Unable to find a journal associated with this article")

        # if we get to here, we have a journal record we want to pull data from
        jbib = journal.bibjson()
        bibjson = self.bibjson()

        # tripwire to be tripped if the journal makes changes to the article
        trip = False

        if bibjson.subjects() != jbib.subjects():
            trip = True
            bibjson.set_subjects(jbib.subjects())
        rbj.set_subjects(jbib.subjects())

        if jbib.title is not None:
            if bibjson.journal_title != jbib.title:
                trip = True
                bibjson.journal_title = jbib.title
            rbj.title = jbib.title

        if len(jbib.language) > 0:
            jlang = jbib.language
            alang = bibjson.journal_language
            jlang.sort()
            alang.sort()
            if jlang != alang:
                bibjson.journal_language = jbib.language
                trip = True
            rbj.set_language(jbib.language)

        if jbib.country is not None:
            if jbib.country != bibjson.journal_country:
                bibjson.journal_country = jbib.country
                trip = True
            rbj.country = jbib.country

        if jbib.publisher:
            if jbib.publisher != bibjson.publisher:
                bibjson.publisher = jbib.publisher
                trip = True
            rbj.publisher = jbib.publisher

        # Copy the seal info, in_doaj status and the journal's ISSNs
        if journal.is_in_doaj() != self.is_in_doaj():
            self.set_in_doaj(journal.is_in_doaj())
            trip = True
        reg.set_in_doaj(journal.is_in_doaj())

        if journal.has_seal() != self.has_seal():
            self.set_seal(journal.has_seal())
            trip = True
        reg.set_seal(journal.has_seal())

        try:
            aissns = bibjson.journal_issns
            jissns = jbib.issns()
            aissns.sort()
            jissns.sort()
            if aissns != jissns:
                bibjson.journal_issns = jbib.issns()
                trip = True

            eissns = jbib.get_identifiers(jbib.E_ISSN)
            pissns = jbib.get_identifiers(jbib.P_ISSN)
            if eissns is not None and len(eissns) > 0:
                rbj.add_identifier(rbj.E_ISSN, eissns[0])
            if pissns is not None and len(pissns) > 0:
                rbj.add_identifier(rbj.P_ISSN, pissns[0])
        except KeyError:
            # No issns, don't worry about it for now
            pass

        return trip
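
    # An illustrative sketch of how a caller might use the tripwire and the
    # register passed through the reg parameter (variable names are invented):
    #
    #   reg = Journal()
    #   changed = article.add_journal_metadata(reg=reg)
    #   if changed:
    #       # reg now holds exactly the journal fields that were copied across
    #       article.save()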

    def merge(self, old, take_id=True):
        # this takes an old version of the article and brings
        # forward any useful information that is needed. The rules of merge are:
        # - ignore "index" (it gets regenerated on save)
        # - always take the "created_date"
        # - any top level field that does not exist in the current item (esp "id" and "history")
        # - in "admin", copy any field that does not already exist

        # first thing to do is create a snapshot of the old record
        old.snapshot()

        # now go on and do the merge

        # always take the created date
        self.set_created(old.created_date)

        # take the id
        if self.id is None or take_id:
            self.set_id(old.id)

        # take the history (deprecated)
        if len(self.data.get("history", [])) == 0:
            self.data["history"] = deepcopy(old.data.get("history", []))

        # take the bibjson
        if "bibjson" not in self.data:
            self.set_bibjson(deepcopy(old.bibjson()))

        # take the admin if there isn't one
        if "admin" not in self.data:
            self.data["admin"] = deepcopy(old.data.get("admin", {}))
        else:
            # otherwise, copy any admin keys that don't exist on the current item
            oa = old.data.get("admin", {})
            for key in oa:
                if key not in self.data["admin"]:
                    self.data["admin"][key] = deepcopy(oa[key])
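
    # Merge semantics at a glance (a hedged example; the field values are
    # invented): if the old article has {"admin": {"upload_id": "abc"}} and the
    # new one has {"admin": {"in_doaj": True}}, then after new.merge(old) the
    # new article carries both admin keys, takes the old record's id and
    # created_date, and a snapshot of the old version has been saved.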

    def _generate_index(self):
        # the index fields we are going to generate
        issns = []
        subjects = []
        schema_subjects = []
        schema_codes = []
        schema_codes_tree = []
        classification = []
        langs = []
        country = None
        publisher = []
        classification_paths = []
        unpunctitle = None
        asciiunpunctitle = None
        doi = None
        fulltext = None

        # the places we're going to get those fields from
        cbib = self.bibjson()
        jindex = self.data.get('index', {})
        hist = self.history()

        # get the issns out of the current bibjson
        issns += cbib.get_identifiers(cbib.P_ISSN)
        issns += cbib.get_identifiers(cbib.E_ISSN)

        # get the issn from the journal bibjson
        if isinstance(cbib.journal_issns, list):
            issns += cbib.journal_issns

        # de-duplicate the issns
        issns = list(set(issns))

        # now get the issns out of the historic records
        for date, hbib in hist:
            issns += hbib.get_identifiers(hbib.P_ISSN)
            issns += hbib.get_identifiers(hbib.E_ISSN)

        # get the subjects and concatenate them with their schemes from the current bibjson
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            subjects.append(term)
            schema_subjects.append(scheme + ":" + term)
            classification.append(term)
            if "code" in subs:
                schema_codes.append(scheme + ":" + subs.get("code"))

        # copy the languages
        if len(cbib.journal_language) > 0:
            langs = [datasets.name_for_lang(l) for l in cbib.journal_language]

        # copy the country
        if jindex.get('country'):
            country = jindex.get('country')
        elif cbib.journal_country:
            country = datasets.get_country_name(cbib.journal_country)

        # copy the publisher/provider
        if cbib.publisher:
            publisher.append(cbib.publisher)

        # deduplicate the lists
        issns = list(set(issns))
        subjects = list(set(subjects))
        schema_subjects = list(set(schema_subjects))
        classification = list(set(classification))
        publisher = list(set(publisher))
        langs = list(set(langs))
        schema_codes = list(set(schema_codes))

        # work out what the date of publication is
        date = cbib.get_publication_date()

        # calculate the classification paths
        from portality.lcc import lcc  # inline import since this hits the database
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            if scheme == "LCC":
                path = lcc.pathify(term)
                if path is not None:
                    classification_paths.append(path)

        # normalise the classification paths, so we only store the longest ones
        classification_paths = lcc.longest(classification_paths)
        schema_codes_tree = cbib.lcc_codes_full_list()

        # create an unpunctitle
        if cbib.title is not None:
            throwlist = string.punctuation + '\n\t'
            unpunctitle = "".join(c for c in cbib.title if c not in throwlist).strip()
            try:
                asciiunpunctitle = unidecode(unpunctitle)
            except Exception:
                asciiunpunctitle = unpunctitle

        # determine if the seal is applied
        has_seal = "Yes" if self.has_seal() else "No"

        # create a normalised version of the DOI for deduplication
        source_doi = cbib.get_one_identifier(constants.IDENT_TYPE_DOI)
        try:
            doi = normalise.normalise_doi(source_doi)
        except ValueError:
            # if we can't normalise the DOI, just store it as-is
            doi = source_doi

        # create a normalised version of the fulltext URL for deduplication
        fulltexts = cbib.get_urls(constants.LINK_TYPE_FULLTEXT)
        if len(fulltexts) > 0:
            source_fulltext = fulltexts[0]
            try:
                fulltext = normalise.normalise_url(source_fulltext)
            except ValueError:
                # if we can't normalise the fulltext, store it as-is
                fulltext = source_fulltext

        # build the index part of the object
        self.data["index"] = {}
        if len(issns) > 0:
            self.data["index"]["issn"] = issns
        if date != "":
            self.data["index"]["date"] = date
            self.data["index"]["date_toc_fv_month"] = date  # Duplicated so we can have year/month facets in fv2
        if len(subjects) > 0:
            self.data["index"]["subject"] = subjects
        if len(schema_subjects) > 0:
            self.data["index"]["schema_subject"] = schema_subjects
        if len(classification) > 0:
            self.data["index"]["classification"] = classification
        if len(publisher) > 0:
            self.data["index"]["publisher"] = publisher
        if len(langs) > 0:
            self.data["index"]["language"] = langs
        if country is not None:
            self.data["index"]["country"] = country
        if len(schema_codes) > 0:
            self.data["index"]["schema_code"] = schema_codes
        if len(classification_paths) > 0:
            self.data["index"]["classification_paths"] = classification_paths
        if unpunctitle is not None:
            self.data["index"]["unpunctitle"] = unpunctitle
        if asciiunpunctitle is not None:
            self.data["index"]["asciiunpunctitle"] = asciiunpunctitle
        if has_seal:
            self.data["index"]["has_seal"] = has_seal
        if doi is not None:
            self.data["index"]["doi"] = doi
        if fulltext is not None:
            self.data["index"]["fulltext"] = fulltext
        if len(schema_codes_tree) > 0:
            self.data["index"]["schema_codes_tree"] = schema_codes_tree
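
    # For orientation, a sketch of the index section this method produces (all
    # values invented; each key appears only when the source data is present):
    #
    #   {
    #       "issn": ["1234-5678", "9876-543X"],
    #       "date": "2022-01-01T00:00:00Z",
    #       "date_toc_fv_month": "2022-01-01T00:00:00Z",
    #       "subject": ["Medicine"],
    #       "schema_subject": ["LCC:Medicine"],
    #       "has_seal": "No",
    #       "doi": "10.1234/example",
    #       "fulltext": "http://example.com/article/1"
    #   }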

    def prep(self):
        self._generate_index()
        self.data['last_updated'] = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")

    def save(self, *args, **kwargs):
        self._generate_index()
        return super(Article, self).save(*args, **kwargs)

    def get_owner(self):
        b = self.bibjson()
        article_issns = b.get_identifiers(b.P_ISSN)
        article_issns += b.get_identifiers(b.E_ISSN)
        owners = []

        seen_journal_issns = {}
        for issn in article_issns:
            journals = Journal.find_by_issn(issn)
            if journals is not None and len(journals) > 0:
                for j in journals:
                    owners.append(j.owner)
                    if j.owner not in seen_journal_issns:
                        seen_journal_issns[j.owner] = []
                    seen_journal_issns[j.owner] += j.bibjson().issns()

        # deduplicate the list of owners
        owners = list(set(owners))

        # no owner means we can't confirm
        if len(owners) == 0:
            raise NoValidOwnerException

        # multiple owners means ownership of this article is confused
        if len(owners) > 1:
            raise NoValidOwnerException

        return owners[0]
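
    # A hedged usage sketch for get_owner(); both "no owner" and "multiple
    # owners" raise, so callers are expected to handle the exception:
    #
    #   try:
    #       owner = article.get_owner()
    #   except NoValidOwnerException:
    #       owner = None  # ownership could not be confirmed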

class ArticleBibJSON(GenericBibJSON):

    def __init__(self, bibjson=None, **kwargs):
        self._add_struct(shared_structs.SHARED_BIBJSON.get("structs", {}).get("bibjson"))
        self._add_struct(ARTICLE_BIBJSON_EXTENSION.get("structs", {}).get("bibjson"))
        super(ArticleBibJSON, self).__init__(bibjson, **kwargs)

    # article-specific simple getters and setters
    @property
    def year(self):
        return self._get_single("year")

    @year.setter
    def year(self, val):
        self._set_with_struct("year", val)

    @year.deleter
    def year(self):
        self._delete("year")

    @property
    def month(self):
        return self._get_single("month")

    @month.setter
    def month(self, val):
        self._set_with_struct("month", val)

    @month.deleter
    def month(self):
        self._delete("month")

    @property
    def start_page(self):
        return self._get_single("start_page")

    @start_page.setter
    def start_page(self, val):
        self._set_with_struct("start_page", val)

    @property
    def end_page(self):
        return self._get_single("end_page")

    @end_page.setter
    def end_page(self, val):
        self._set_with_struct("end_page", val)

    @property
    def abstract(self):
        return self._get_single("abstract")

    @abstract.setter
    def abstract(self, val):
        self._set_with_struct("abstract", val)

    # article-specific complex part getters and setters

    @property
    def volume(self):
        return self._get_single("journal.volume")

    @volume.setter
    def volume(self, value):
        self._set_with_struct("journal.volume", value)

    @property
    def number(self):
        return self._get_single("journal.number")

    @number.setter
    def number(self, value):
        self._set_with_struct("journal.number", value)

    @property
    def journal_title(self):
        return self._get_single("journal.title")

    @journal_title.setter
    def journal_title(self, title):
        self._set_with_struct("journal.title", title)

    @property
    def journal_language(self):
        return self._get_list("journal.language")

    @journal_language.setter
    def journal_language(self, lang):
        self._set_with_struct("journal.language", lang)

    @property
    def journal_country(self):
        return self._get_single("journal.country")

    @journal_country.setter
    def journal_country(self, country):
        self._set_single("journal.country", country)

    @property
    def journal_issns(self):
        return self._get_list("journal.issns")

    @journal_issns.setter
    def journal_issns(self, issns):
        self._set_with_struct("journal.issns", issns)

    @property
    def publisher(self):
        return self._get_single("journal.publisher")

    @publisher.setter
    def publisher(self, value):
        self._set_with_struct("journal.publisher", value)

    def add_author(self, name, affiliation=None, orcid_id=None):
        aobj = {"name": name}
        if affiliation is not None:
            aobj["affiliation"] = affiliation
        if orcid_id is not None:
            aobj["orcid_id"] = orcid_id
        self._add_to_list_with_struct("author", aobj)

    @property
    def author(self):
        return self._get_list("author")

    @author.setter
    def author(self, authors):
        self._set_with_struct("author", authors)

    def get_publication_date(self, date_format='%Y-%m-%dT%H:%M:%SZ'):
        # work out what the date of publication is
        date = ""
        if self.year is not None:
            if type(self.year) is str:  # It should be, if the mappings are correct. But len() needs a sequence.
                # fix 2 digit years
                if len(self.year) == 2:
                    try:
                        intyear = int(self.year)
                    except ValueError:
                        # if it's 2 chars long and the 2 chars don't make an integer,
                        # forget it
                        return date

                    # In the case of truncated years, assume it's this century if before the current year
                    if intyear <= int(str(datetime.utcnow().year)[:-2]):
                        self.year = "20" + self.year  # For readability over long-lasting code, I have refrained
                    else:                             # from using str(datetime.utcnow().year)[:2] here.
                        self.year = "19" + self.year  # But don't come crying to me 90-ish years from now.

                # if we still don't have a 4 digit year, forget it
                if len(self.year) != 4:
                    return date

            # build up our proposed datestamp
            date += str(self.year)
            if self.month is not None:
                try:
                    if type(self.month) is int:
                        if 1 <= int(self.month) <= 12:
                            month_number = self.month
                        else:
                            month_number = 1
                    elif len(self.month) <= 2:
                        if 1 <= int(self.month) <= 12:
                            month_number = self.month
                        else:
                            month_number = '1'
                    elif len(self.month) == 3:  # 'May' works with either case, obvz.
                        month_number = datetime.strptime(self.month, '%b').month
                    else:
                        month_number = datetime.strptime(self.month, '%B').month

                    # pad the month number to two digits. This accepts int or string
                    date += '-{:0>2}'.format(month_number)
                except Exception:
                    # If something goes wrong, just assume it's January
                    date += "-01"
            else:
                date += "-01"
            date += "-01T00:00:00Z"

        # attempt to confirm the format of our datestamp
        try:
            datecheck = datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ")
            date = datecheck.strftime(date_format)
        except Exception:
            return ""

        return date
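
    # Examples of the behaviour above (derived from the code; values invented):
    #   year="05", month="5"    ->  "2005-05-01T00:00:00Z"  (2-digit year promoted)
    #   year="99", month="May"  ->  "1999-05-01T00:00:00Z"  (month name parsed)
    #   year="abc"              ->  ""  (unparseable dates yield the empty string)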

    def remove_journal_metadata(self):
        self._delete("journal")

    def vancouver_citation(self):
        jtitle = self.journal_title
        year = self.year
        vol = self.volume
        iss = self.number
        start = self.start_page
        end = self.end_page

        citation = ""

        if year:
            citation += year + ";"

        if vol:
            citation += vol

        if iss:
            citation += "(" + iss + ")"

        if start or end:
            if citation != "":
                citation += ":"
            if start:
                citation += start
            if end:
                if start:
                    citation += "-"
                citation += end

        return jtitle.strip(), citation
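
    # Example output (invented values): for journal title "Example Journal",
    # year "2020", volume "4", issue "2" and pages 10 to 19, this returns
    # ("Example Journal", "2020;4(2):10-19"), leaving the caller to join the parts.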

    def lcc_codes_full_list(self):
        full_list = set()

        from portality.lcc import lcc  # inline import since this hits the database
        for subs in self.subjects():
            scheme = subs.get("scheme")
            if scheme != "LCC":
                continue
            code = subs.get("code")
            expanded = lcc.expand_codes(code)
            full_list.update(expanded)

        return ["LCC:" + x for x in full_list if x is not None]

ARTICLE_BIBJSON_EXTENSION = {
    "objects" : ["bibjson"],
    "structs" : {
        "bibjson" : {
            "fields" : {
                "year" : {"coerce" : "unicode"},
                "month" : {"coerce" : "unicode"},
                "start_page" : {"coerce" : "unicode"},
                "end_page" : {"coerce" : "unicode"},
                "abstract" : {"coerce" : "unicode"}
            },
            "lists" : {
                "author" : {"contains" : "object"}
            },
            "objects" : [
                "journal"
            ],

            "structs" : {
                "author" : {
                    "fields" : {
                        "name" : {"coerce" : "unicode"},
                        "affiliation" : {"coerce" : "unicode"},
                        "email" : {"coerce": "unicode"},
                        "orcid_id" : {"coerce" : "unicode"}
                    }
                },

                "journal" : {
                    "fields" : {
                        "volume" : {"coerce" : "unicode"},
                        "number" : {"coerce" : "unicode"},
                        "publisher" : {"coerce" : "unicode"},
                        "title" : {"coerce" : "unicode"},
                        "country" : {"coerce" : "unicode"}
                    },
                    "lists" : {
                        "language" : {"contains" : "field", "coerce" : "unicode"},
                        "issns" : {"contains" : "field", "coerce" : "unicode"}
                    }
                }
            }
        }
    }
}
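
# A sketch of an article bibjson document that fits the extension struct above
# (all field values invented):
#
#   {
#       "year": "2022", "month": "11",
#       "start_page": "1", "end_page": "10",
#       "abstract": "An example abstract.",
#       "author": [{"name": "A. Person", "affiliation": "Example University"}],
#       "journal": {
#           "volume": "4", "number": "2",
#           "title": "Example Journal", "publisher": "Example Press",
#           "country": "GB", "language": ["EN"], "issns": ["1234-5678"]
#       }
#   }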

##################################################

class ArticleQuery(object):
    base_query = {
        "track_total_hits" : True,
        "query" : {
            "bool" : {
                "must" : []
            }
        }
    }

    _issn_terms = { "terms" : {"index.issn.exact" : ["<list of issns here>"]} }
    _volume_term = { "term" : {"bibjson.journal.volume.exact" : "<volume here>"} }

    def __init__(self, issns=None, volume=None):
        self.issns = issns
        self.volume = volume

    def query(self):
        q = deepcopy(self.base_query)

        if self.issns is not None:
            iq = deepcopy(self._issn_terms)
            iq["terms"]["index.issn.exact"] = self.issns
            q["query"]["bool"]["must"].append(iq)

        if self.volume is not None:
            vq = deepcopy(self._volume_term)
            vq["term"]["bibjson.journal.volume.exact"] = self.volume
            q["query"]["bool"]["must"].append(vq)

        return q
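
# For reference, ArticleQuery(issns=["1234-5678"], volume="4").query() should
# produce an Elasticsearch body along these lines (a hedged reconstruction):
#
#   {
#       "track_total_hits": True,
#       "query": {"bool": {"must": [
#           {"terms": {"index.issn.exact": ["1234-5678"]}},
#           {"term": {"bibjson.journal.volume.exact": "4"}}
#       ]}}
#   }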

class ArticleVolumesQuery(object):
    base_query = {
        "track_total_hits": True,
        "query" : {
            "bool": {
                "filter": {
                    "terms" : {"index.issn.exact" : ["<list of issns here>"]}
                }
            }
        },
        "size" : 0,
        "aggs" : {
            "vols" : {
                "terms" : {
                    "field" : "bibjson.journal.volume.exact",
                    "order": {"_key" : "desc"},
                    "size" : 1000
                }
            }
        }
    }

    def __init__(self, issns=None):
        self.issns = issns

    def query(self):
        q = deepcopy(self.base_query)
        q["query"]["bool"]["filter"]["terms"]["index.issn.exact"] = self.issns
        return q

class ArticleVolumesIssuesQuery(object):
    base_query = {
        "track_total_hits": True,
        "query" : {
            "bool": {
                "filter": {
                    "bool": {
                        "must": [
                            {"terms" : {"index.issn.exact" : ["<list of issns here>"]}},
                            {"term" : {"bibjson.journal.volume.exact" : "<volume here>"}}
                        ]
                    }
                }
            }
        },
        "size" : 0,
        "aggs" : {
            "issues" : {
                "terms" : {
                    "field" : "bibjson.journal.number.exact",
936 "order": {"_key", "desc"},
937 "size" : 1000
938 }
939 }
940 }
941 }
943 def __init__(self, issns=None, volume=None):
944 self.issns = issns
945 self.volume = volume
947 def query(self):
948 q = deepcopy(self.base_query)
949 q["query"]["bool"]["filter"]["bool"]["must"][0]["terms"]["index.issn.exact"] = self.issns
950 q["query"]["bool"]["filter"]["bool"]["must"][1]["term"]["bibjson.journal.volume.exact"] = self.volume
951 return q

class DuplicateArticleQuery(object):
    base_query = {
        "track_total_hits" : True,
        "query": {
            "bool": {
                "must": []
            }
        },
        "sort": [{"last_updated": {"order": "desc"}}]
    }

    _should = {
        "should" : [],
        "minimum_should_match" : 2
    }

    _volume_term = {"term" : {"bibjson.journal.volume.exact" : "<volume>"}}
    _number_term = {"term" : {"bibjson.journal.number.exact" : "<issue number>"}}
    _start_term = {"term" : {"bibjson.start_page.exact" : "<start page>"}}
    _issn_terms = {"terms" : { "index.issn.exact" : ["<list of issns>"] }}
    _pubrec_term = {"term" : {"admin.publisher_record_id.exact" : "<publisher record id>"}}
    _identifier_term = {"term" : {"bibjson.identifier.id.exact" : "<issn here>"}}
    _doi_term = {"term" : {"index.doi.exact" : "<doi here>"}}
    _fulltext_terms = {"terms" : {"index.fulltext.exact" : ["<fulltext here>"]}}
    _url_terms = {"terms" : {"bibjson.link.url.exact" : ["<url here>"]}}  # used by the should_match branch below
    _fuzzy_title = {"fuzzy" : {"bibjson.title.exact" : "<title here>"}}

    def __init__(self, issns=None, publisher_record_id=None, doi=None, urls=None, title=None, volume=None, number=None, start=None, should_match=None, size=10):
        self.issns = issns if isinstance(issns, list) else []
        self.publisher_record_id = publisher_record_id
        self.doi = doi
        self.urls = urls if isinstance(urls, list) else [urls] if isinstance(urls, str) else []
        self.title = title
        self.volume = volume
        self.number = number
        self.start = start
        self.should_match = should_match
        self.size = size

    def query(self):
        # - MUST be from at least one of the ISSNs
        # - MUST have the publisher record id
        # - MUST have the doi unless should_match is set
        # - MUST have one of the fulltext urls unless should_match is set
        # - MUST fuzzy match the title
        # - SHOULD have <should_match> of: volume, issue, start page, fulltext url, doi

        q = deepcopy(self.base_query)
        if len(self.issns) > 0:
            it = deepcopy(self._issn_terms)
            it["terms"]["index.issn.exact"] = self.issns
            q["query"]["bool"]["must"].append(it)

        if self.publisher_record_id is not None:
            pr = deepcopy(self._pubrec_term)
            pr["term"]["admin.publisher_record_id.exact"] = self.publisher_record_id
            q["query"]["bool"]["must"].append(pr)

        if self.doi is not None and self.should_match is None:
            idt = deepcopy(self._doi_term)
            # idt["term"]["bibjson.identifier.id.exact"] = self.doi
            idt["term"]["index.doi.exact"] = self.doi
            q["query"]["bool"]["must"].append(idt)

        if len(self.urls) > 0 and self.should_match is None:
            uq = deepcopy(self._fulltext_terms)
            # uq["terms"]["bibjson.link.url.exact"] = self.urls
            uq["terms"]["index.fulltext.exact"] = self.urls
            q["query"]["bool"]["must"].append(uq)

        if self.title is not None:
            ft = deepcopy(self._fuzzy_title)
            ft["fuzzy"]["bibjson.title.exact"] = self.title
            q["query"]["bool"]["must"].append(ft)

        if self.should_match is not None:
            term_count = 0
            s = deepcopy(self._should)

            if self.volume is not None:
                term_count += 1
                vt = deepcopy(self._volume_term)
                vt["term"]["bibjson.journal.volume.exact"] = self.volume
                s["should"].append(vt)

            if self.number is not None:
                term_count += 1
                nt = deepcopy(self._number_term)
                nt["term"]["bibjson.journal.number.exact"] = self.number
                s["should"].append(nt)

            if self.start is not None:
                term_count += 1
                st = deepcopy(self._start_term)
                st["term"]["bibjson.start_page.exact"] = self.start
                s["should"].append(st)

            if len(self.urls) > 0:
                term_count += 1
                uq = deepcopy(self._url_terms)
                uq["terms"]["bibjson.link.url.exact"] = self.urls
                s["should"].append(uq)

            if self.doi is not None:
                term_count += 1
                idt = deepcopy(self._identifier_term)
                idt["term"]["bibjson.identifier.id.exact"] = self.doi
                s["should"].append(idt)

            msm = self.should_match
            if msm > term_count:
                msm = term_count
            s["minimum_should_match"] = msm

            q["query"]["bool"].update(s)

        # Allow more results than the default
        q["size"] = self.size

        return q
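
# A hedged example of the should_match branch: with should_match=2 and volume,
# number and start page supplied, the query gains a "should" clause containing
# three term matches and "minimum_should_match": 2, i.e. any two of
# volume/issue/start page (plus fulltext url and doi when present) are enough
# to flag a candidate duplicate.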

def _human_sort(things, reverse=True):
    numeric = []
    non_numeric = []
    nmap = {}
    for v in things:
        try:
            # try to convert v to an int
            vint = int(v)

            # remember the original string (it may have leading 0s)
            try:
                nmap[vint].append(v)
            except KeyError:
                nmap[vint] = [v]
            numeric.append(vint)
        except ValueError:
            non_numeric.append(v)

    numeric.sort(reverse=reverse)
    non_numeric.sort(reverse=reverse)

    # convert the integers back to their string representation
    return reduce(lambda x, y: x + y, [nmap[n] for n in numeric], []) + non_numeric

def _sort_articles(articles):
    # first extract the array we want to sort on
    # and make a map of that value to the issue itself
    unsorted = []
    numbers = []
    imap = {}
    for art in articles:
        sp = art.get("bibjson.start_page", [None])[0]

        # can't sort anything that doesn't have a start page
        if sp is None:
            unsorted.append(art)
            continue

        # deal with start page clashes and record the start pages to sort by
        if sp not in numbers:
            numbers.append(sp)
        if sp in imap:
            imap[sp].append(art)
        else:
            imap[sp] = [art]

    sorted_keys = _human_sort(numbers, reverse=False)

    s = []
    for n in sorted_keys:
        s += [x for x in imap[n]]
    s += [x for x in unsorted]

    return s
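
# Illustrative behaviour of _sort_articles (values invented): given articles
# whose "bibjson.start_page" values are ["100", "2", None], the result is
# ordered ["2", "100", <article with no start page>], since start pages are
# human-sorted ascending and unsortable articles are appended at the end.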