Coverage for portality/tasks/harvester_helpers/epmc/client.py: 69%
131 statements
« prev ^ index » next coverage.py v6.4.2, created at 2022-09-05 21:15 +0100
« prev ^ index » next coverage.py v6.4.2, created at 2022-09-05 21:15 +0100
1from portality.core import app
2from portality.lib import httputil
3import urllib.request, urllib.parse, urllib.error, string
5from portality.lib import dates
6from portality.tasks.harvester_helpers.epmc import models
7from portality.tasks.harvester_helpers.epmc.queries import QueryBuilder
8from datetime import datetime
9import time
class DefaultLogger():
    """
    Fallback logger which keeps every message in memory, in the order received,
    together with the time at which it was logged.
    """

    def __init__(self):
        # accumulated log entries, oldest first
        self._log = []

    def log(self, msg):
        """
        Record a message along with the current timestamp

        :param msg: the message to record
        """
        entry = {"timestamp": dates.now_with_microseconds(), "message": msg}
        self._log.append(entry)
def quote(s, **kwargs):
    """
    URL-escape a value, on a best-effort basis.

    First tries ``urllib.parse.quote_plus``; if that fails, falls back to encoding the value as UTF-8 and
    passing the resulting bytes to ``urllib.parse.quote``.

    :param s: the value to escape
    :param kwargs: passed straight through to the urllib quoting function (e.g. ``safe``)
    :return: the escaped string, or None if the value could not be escaped by either route
    """
    # catch Exception rather than using a bare except, so that KeyboardInterrupt / SystemExit
    # are not silently swallowed while still treating any quoting failure as "could not escape"
    try:
        return urllib.parse.quote_plus(s, **kwargs)
    except Exception:
        pass

    try:
        utf = s.encode("utf-8")
        return urllib.parse.quote(utf, **kwargs)
    except Exception:
        return None
def check_epmc_version(resp_json):
    """
    Compare the API version reported in an EPMC response with the version configured in
    ``EPMC_TARGET_VERSION``, logging a warning if they differ or if the response has no ``version`` key.

    :param resp_json: the decoded JSON body of an EPMC response
    :return: None; the outcome is only reported via the application logger
    """
    # only the key lookup is expected to fail, so keep the try as narrow as that
    try:
        received_ver = resp_json['version']
    except KeyError:
        # Logger.warn is a deprecated alias; use Logger.warning
        app.logger.warning("Couldn't check EPMC API version; did not find 'version' key in response. Proceed with caution as the EPMC API may have changed.")
        return

    configured_ver = app.config.get("EPMC_TARGET_VERSION")
    if received_ver != configured_ver:
        app.logger.warning("Mismatching EPMC API version; recommend checking for changes. Expected '{0}' Found '{1}'".format(configured_ver, received_ver))
def to_keywords(s):
    """
    Reduce a string to a space-separated list of its words: every ASCII punctuation character is replaced
    by a space, and any run of whitespace (spaces, tabs, newlines) is collapsed to a single space.

    :param s: the string to convert
    :return: the normalised string, with no leading or trailing whitespace
    """
    # FIXME: this method does not strip stop words - investigations into that indicate that as a natural language
    # processing thing, the libraries required to do it (e.g. NLTK) are awkward and overblown for our purposes.

    # translate out all of the punctuation
    punctuation_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    raw = s.translate(punctuation_to_space)

    # normalise the spacing - split() with no argument splits on any whitespace and drops empty strings
    return " ".join(raw.split())
class EuropePMCException(Exception):
    """
    Error raised when a request to Europe PMC fails or its response cannot be used.

    The HTTP response associated with the failure, if one was supplied, is available as ``self.response``
    """

    def __init__(self, *args, **kwargs):
        """
        :param args: positional arguments for the base Exception
        :param httpresponse: (keyword) the HTTP response object associated with the error
        :param message: (keyword) the error message; it is placed ahead of any positional arguments
        """
        httpresponse = kwargs.pop("httpresponse", None)
        message = kwargs.pop("message", None)

        # a message supplied by keyword used to be silently discarded, leaving str(e) empty
        init_args = args if message is None else (message,) + args
        super(EuropePMCException, self).__init__(*init_args)

        # a response handed over positionally (anything exposing status_code) is also made
        # available on .response; the positional args themselves are left untouched
        if httpresponse is None and args and hasattr(args[0], "status_code"):
            httpresponse = args[0]
        self.response = httpresponse
class EPMCFullTextException(Exception):
    """
    Error relating to an EPMC full-text document; the raw document string is kept on ``self.raw``
    """

    def __init__(self, message, rawstring, *args, **kwargs):
        """
        :param message: the error message
        :param rawstring: the raw full-text string the error relates to
        :param args: further positional arguments for the base Exception
        :param kwargs: accepted but not used
        """
        self.raw = rawstring
        Exception.__init__(self, message, *args)
class EuropePMC(object):
    """
    Client for the Europe PMC REST API: metadata search (single page or iterated) and full-text retrieval.

    The API base URL is read from the ``EPMC_REST_API`` application config key.
    """

    def __init__(self, logger=None):
        """
        :param logger: object with a ``log(msg)`` method; a DefaultLogger is created if none is given
        """
        self.logger = DefaultLogger() if logger is None else logger

    def get_by_pmcid(self, pmcid, cursor=""):
        """
        Search on the PMCID field

        :return: (results, next_cursor)
        """
        return self.field_search("PMCID", pmcid, cursor=cursor)

    def get_by_pmid(self, pmid, cursor=""):
        """
        Search on the EXT_ID field

        :return: (results, next_cursor)
        """
        return self.field_search("EXT_ID", pmid, cursor=cursor)

    def get_by_doi(self, doi, cursor=""):
        """
        Search on the DOI field

        :return: (results, next_cursor)
        """
        return self.field_search("DOI", doi, cursor=cursor)

    def title_exact(self, title, cursor=""):
        """
        Search on the TITLE field using the title as given

        :return: (results, next_cursor)
        """
        return self.field_search("TITLE", title, cursor=cursor)

    def title_approximate(self, title, cursor=""):
        """
        Fuzzy search on the TITLE field, after reducing the title to keywords

        :return: (results, next_cursor)
        """
        nt = to_keywords(title)
        return self.field_search("TITLE", nt, fuzzy=True, cursor=cursor)

    def field_search(self, field, value, fuzzy=False, cursor="", page_size=25):
        """
        Fetch one page of results for a search on a single field

        :return: (results, next_cursor)
        """
        qb = QueryBuilder()
        qb.add_string_field(field, value, fuzzy)
        return self.query(qb.to_url_query_param(), cursor=cursor, page_size=page_size)

    def field_search_iterator(self, field, value, fuzzy=False, page_size=25, throttle=None):
        """
        Iterate over all results for a search on a single field

        :return: generator of results (see iterate)
        """
        qb = QueryBuilder()
        qb.add_string_field(field, value, fuzzy)
        return self.iterate(qb.to_url_query_param(), page_size=page_size, throttle=throttle)

    def complex_search(self, query_builder, cursor="", page_size=25):
        """
        Fetch one page of results for the query held by the supplied query builder

        :return: (results, next_cursor)
        """
        return self.query(query_builder.to_url_query_param(), cursor=cursor, page_size=page_size)

    def complex_search_iterator(self, query_builder, page_size=1000, throttle=None):
        """
        Iterate over all results for the query held by the supplied query builder

        :return: generator of results (see iterate)
        """
        return self.iterate(query_builder.to_url_query_param(), page_size=page_size, throttle=throttle)

    def _write_to_logger(self, msg):
        self.logger.log(msg)

    def iterate(self, query_string, page_size=1000, throttle=None):
        """
        Generator which yields every result for the query, requesting successive pages as needed.

        :param query_string: the query, as placed in the ``query`` URL parameter
        :param page_size: number of results to request per page
        :param throttle: minimum number of seconds to leave between requests; None for no throttling
        :return: yields the individual results from each page, in order
        """
        cursor = ""
        last = None
        while True:
            if last is not None and throttle is not None:
                diff = (datetime.utcnow() - last).total_seconds()
                self._write_to_logger("Last request at {x}, {y}s ago; throttle {z}s".format(x=last, y=diff, z=throttle))
                if diff < throttle:
                    waitfor = throttle - diff
                    self._write_to_logger("Throttling EPMC requests for {x}s".format(x=waitfor))
                    time.sleep(waitfor)

            # FIXME: note for the future: EPMC have also added a `nextPageUrl` field, which does the same as
            # our `url_from_query` method, and we may want to transition to using that going forward.  But for now
            # the method that we use is still supported
            results, cursor = self.query(query_string, cursor=cursor, page_size=page_size)
            last = datetime.utcnow()

            if not results:
                break
            for r in results:
                yield r

            # we break on the empty cursor at the end, because we want to process any results first
            if cursor is None or cursor == "":
                break

    def url_from_query(self, query_string, cursor, page_size):
        """
        Build the search URL for a query

        :param query_string: the query, placed as-is in the ``query`` URL parameter
        :param cursor: cursor mark for the page to request; the empty string for the first page
        :param page_size: number of results to request per page
        :return: the URL
        :raises EuropePMCException: if any of the parameters cannot be url escaped
        """
        # NOTE(review): the escaped query is only used to check that escaping is possible; the URL itself
        # is built from the raw query_string.  Presumably the query builder has already encoded it - confirm
        quoted = quote(query_string, safe="/")
        qsize = quote(str(page_size))
        qcursor = quote(str(cursor))

        if qsize is None or quoted is None or qcursor is None:
            raise EuropePMCException(None, "unable to url escape the string")

        url = app.config.get("EPMC_REST_API") + "search?query=" + query_string
        url += "&resulttype=core&format=json&pageSize=" + qsize

        if cursor != "":
            url += "&cursorMark=" + qcursor

        return url

    def query(self, query_string, cursor="", page_size=25):
        """
        Request a single page of search results

        :param query_string: the query, as placed in the ``query`` URL parameter
        :param cursor: cursor mark for the page to request; the empty string for the first page
        :param page_size: number of results to request per page
        :return: (results, next_cursor)
        :raises EuropePMCException: if there is no response, a non-200 response, or the body cannot be processed
        """
        url = self.url_from_query(query_string, cursor, page_size)
        self._write_to_logger("Requesting EPMC metadata from " + url)

        resp = httputil.get(url)
        if resp is None:
            # pass the message positionally so that it is carried on the exception
            raise EuropePMCException("could not get a response from EPMC")
        if resp.status_code != 200:
            # positional arg kept so the exception's args are unchanged; the keyword populates .response
            raise EuropePMCException(resp, httpresponse=resp)

        try:
            j = resp.json()
            check_epmc_version(j)
        except Exception as e:
            # chain the underlying error so that the real cause is not lost
            raise EuropePMCException("could not decode JSON from EPMC response") from e

        results = [models.EPMCMetadata(r) for r in j.get("resultList", {}).get("result", [])]
        next_cursor_mark = j.get("nextCursorMark", "")
        return results, next_cursor_mark

    def fulltext(self, pmcid):
        """
        Retrieve the full-text XML for the given PMCID

        :param pmcid: the PMCID, appended directly to the API base URL
        :return: EPMCFullText built from the response text
        :raises EuropePMCException: if there is no response or a non-200 response
        """
        url = app.config.get("EPMC_REST_API") + pmcid + "/fullTextXML"
        self._write_to_logger("Searching for Fulltext at " + url)

        resp = httputil.get(url)
        if resp is None:
            # pass the message positionally so that it is carried on the exception
            raise EuropePMCException("could not get a response for fulltext from EPMC")
        if resp.status_code != 200:
            # positional arg kept so the exception's args are unchanged; the keyword populates .response
            raise EuropePMCException(resp, httpresponse=resp)
        return EPMCFullText(resp.text)
class EPMCFullText(models.JATS):
    """
    Thin subclass of models.JATS, kept for backwards compatibility - don't add any methods here
    """