Coverage for portality/tasks/harvester_helpers/epmc/epmc_harvester.py: 87%
90 statements
« prev ^ index » next coverage.py v6.4.2, created at 2022-07-22 15:59 +0100
« prev ^ index » next coverage.py v6.4.2, created at 2022-07-22 15:59 +0100
1from portality.models.harvester import HarvesterPlugin
2from portality.tasks.harvester_helpers.epmc import client, queries
3from portality.lib import dates
4from portality.api.current.client import models as doaj
5from portality.core import app
6from datetime import datetime
7import time
class DefaultLogger:
    """Minimal in-memory logger used when the harvester is not given one.

    Every message is kept, in call order, in ``self._log`` as a dict with
    a ``timestamp`` and a ``message`` key.
    """

    def __init__(self):
        # accumulated log entries, oldest first
        self._log = []

    def log(self, msg):
        """Store ``msg`` together with the moment it was logged."""
        entry = {
            "timestamp": dates.now_with_microseconds(),
            "message" : msg
        }
        self._log.append(entry)
class EPMCHarvester(HarvesterPlugin):
    """Harvester plugin that reads article records from EPMC and converts
    them into DOAJ API client ``Article`` objects."""

    def __init__(self, logger=None):
        """
        :param logger: object exposing ``log(msg)``; a ``DefaultLogger`` is
            created when none is supplied
        """
        if logger is None:
            logger = DefaultLogger()
        self.logger = logger
        super().__init__()

    def _write_to_logger(self, msg):
        """Forward ``msg`` to the configured logger."""
        self.logger.log(msg)

    def get_name(self):
        """Return the identifier of this plugin."""
        return "epmc"

    def iterate(self, issn, since, to=None):
        """Generate ``(article, day)`` pairs for the given ISSN.

        :param issn: ISSN passed through to the EPMC query builder
        :param since: start of the harvest period, in a form accepted by ``dates.parse``
        :param to: end of the harvest period; ``dates.now()`` is used when omitted
        :return: generator of tuples of the crosswalked article and the start
            of the day range whose query produced it
        """
        # an open-ended harvest runs up to the present moment
        end = dates.now() if to is None else to

        # turn both ends of the period into datestamps
        start_stamp = dates.parse(since)
        end_stamp = dates.parse(end)

        # EPMC is queried one day at a time so that results come back in
        # (approximately) updated-date order.  Inside a single day there is no
        # ordering guarantee; the only way to improve on that would be a finer
        # request granularity, at the cost of a much larger number of requests.
        day_windows = dates.day_ranges(start_stamp, end_stamp)
        throttle = app.config.get("EPMC_HARVESTER_THROTTLE")

        previous_request = None
        for day, _window_end in day_windows:
            # leave at least `throttle` seconds between consecutive day requests
            if previous_request is not None and throttle is not None:
                elapsed = (datetime.utcnow() - previous_request).total_seconds()
                self._write_to_logger(
                    "Last day request at {x}, {y}s ago; throttle {z}s".format(
                        x=previous_request, y=elapsed, z=throttle))
                if elapsed < throttle:
                    pause = throttle - elapsed
                    self._write_to_logger("Throttling EPMC requests for {x}s".format(x=pause))
                    time.sleep(pause)

            # Query the OA articles for this ISSN on this specific day.  The
            # end of the window is deliberately not used: EPMC's granularity
            # would make us double count.
            # date_sort=True is a weak proxy for ordering by updated date (it
            # really sorts by publication date, which only partly coincides).
            query = queries.oa_issn_updated(issn, day, date_sort=True)
            epmc = client.EuropePMC(self.logger)
            # the paging requests made by the iterator are throttled too
            for record in epmc.complex_search_iterator(query, throttle=throttle):
                yield self.crosswalk(record), day

            previous_request = datetime.utcnow()

    def crosswalk(self, record):
        """Convert one EPMC ``record`` into a ``doaj.Article``.

        :param record: EPMC record exposing the attributes read below
            (``issn``, ``essn``, ``title``, ``doi``, ``authors``, ...)
        :return: the populated article object
        """
        article = doaj.Article()

        # FIXME: this is a hack.  To use DataObj this way, article.bibjson has
        # to be created with valid data (the struct declares required fields),
        # so it is seeded with the required data and the placeholder title is
        # then removed again so it is not populated when real validation runs.
        # The long-term fix is a review of the DataObj code; this will do for now.
        article.bibjson = {"title" : "", "identifier" : []}
        bj = article.bibjson
        bj._delete("title")
        bj.journal = {}
        journal = bj.journal

        # EPMC sometimes reports the same value as both issn and essn
        # (presumably the issn is treated as the essn when there is no print
        # issn).  When the two match, only the eissn is recorded.
        print_issn = record.issn
        electronic_issn = record.essn
        if print_issn is not None and print_issn != electronic_issn:
            article.add_identifier("pissn", print_issn)
        if electronic_issn is not None:
            article.add_identifier("eissn", electronic_issn)

        # straight field-to-field copies
        bj.title = record.title
        article.add_identifier("doi", record.doi)
        journal.volume = record.journal_volume
        journal.number = record.journal_issue
        journal.title = record.journal
        journal.language = record.language
        bj.year = record.year_of_publication
        bj.month = record.month_of_publication
        journal.start_page = record.start_page
        journal.end_page = record.end_page
        article.add_link("fulltext", record.get_first_fulltext_url())
        bj.abstract = record.abstract

        for author in record.authors:
            collective = author.get("collectiveName")
            first = author.get("firstName")
            last = author.get("lastName")

            # with no name parts at all, fall back to the record's author
            # string; if that is missing too there is nothing to add
            if collective is None and first is None and last is None:
                first = record.author_string
                if first is None:
                    continue

            # join the available parts with single spaces, in the order
            # collective name, first name, last name
            name = ""
            for part in (collective, first, last):
                if part is None:
                    continue
                if name != "":
                    name += " "
                name += part
            article.add_author(name=name)

        return article