Coverage for portality/tasks/harvester_helpers/epmc/client.py: 69%

131 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-09-05 21:15 +0100

1from portality.core import app 

2from portality.lib import httputil 

3import urllib.request, urllib.parse, urllib.error, string 

4 

5from portality.lib import dates 

6from portality.tasks.harvester_helpers.epmc import models 

7from portality.tasks.harvester_helpers.epmc.queries import QueryBuilder 

8from datetime import datetime 

9import time 

10 

11 

class DefaultLogger:
    """
    Minimal in-memory logger, used by EuropePMC when the caller does not supply one.

    Entries are accumulated in ``self._log`` as dicts with "timestamp" and "message" keys.
    """

    def __init__(self):
        self._log = list()

    def log(self, msg):
        """
        Record a message, stamped with the value of dates.now_with_microseconds()

        :param msg: the message to record
        """
        entry = {"timestamp": dates.now_with_microseconds(), "message": msg}
        self._log.append(entry)

21 

def quote(s, **kwargs):
    """
    Best-effort URL-escaping of a value.

    First tries urllib.parse.quote_plus on the value as given; if that fails, falls back to
    UTF-8 encoding the value and passing the bytes to urllib.parse.quote.

    :param s: the value to escape (normally a str)
    :param kwargs: passed through to the urllib.parse quoting function (e.g. ``safe``)
    :return: the escaped string, or None if the value could not be escaped by either route
    """
    # `except Exception` rather than a bare `except`, so that KeyboardInterrupt / SystemExit
    # are not swallowed while still keeping this function best-effort for ordinary failures
    try:
        return urllib.parse.quote_plus(s, **kwargs)
    except Exception:
        pass

    try:
        utf = s.encode("utf-8")
        return urllib.parse.quote(utf, **kwargs)
    except Exception:
        # callers check for None to detect an un-escapable value
        return None

33 

34 

def check_epmc_version(resp_json):
    """
    Compare the API version reported in an EPMC response with the configured target version.

    Logs a warning via app.logger if the response's 'version' value differs from the
    EPMC_TARGET_VERSION config value, or if the response has no 'version' key at all.
    Never raises for a missing key; returns nothing.

    :param resp_json: the decoded JSON body of an EPMC response (expected to be dict-like)
    """
    # only the key lookup is guarded, so a KeyError from anywhere else is not misreported
    try:
        received_ver = resp_json['version']
    except KeyError:
        # Logger.warn is a deprecated alias; Logger.warning is the supported spelling
        app.logger.warning("Couldn't check EPMC API version; did not find 'version' key in response. Proceed with caution as the EPMC API may have changed.")
        return

    configured_ver = app.config.get("EPMC_TARGET_VERSION")
    if received_ver != configured_ver:
        app.logger.warning("Mismatching EPMC API version; recommend checking for changes. Expected '{0}' Found '{1}'".format(configured_ver, received_ver))

43 

44 

def to_keywords(s):
    """
    Reduce a string to space-separated keywords: every character in string.punctuation is
    replaced by a space, then runs of spaces are collapsed and leading/trailing spaces dropped.

    :param s: the string to normalise
    :return: the normalised string
    """
    # FIXME: this method does not strip stop words - investigations into that indicate that as a natural language
    # processing thing, the libraries required to do it (e.g. NLTK) are awkward and overblown for our purposes.

    # swap each punctuation character for a space
    punctuation = frozenset(string.punctuation)
    spaced = "".join(" " if char in punctuation else char for char in s)

    # splitting on a single space leaves empty strings wherever spaces were adjacent; discard
    # those and re-join to normalise the spacing
    words = filter(None, spaced.split(" "))
    return " ".join(words)

55 

56 

class EuropePMCException(Exception):
    """
    Error raised by the EPMC client.

    Accepts the usual positional Exception arguments plus two optional keyword arguments:

    :param httpresponse: the HTTP response associated with the failure; exposed as ``self.response``
        (None if not supplied)
    :param message: a human-readable message; placed first in the exception's args so that it
        appears in ``str(e)``

    Any other keyword arguments are ignored.
    """
    def __init__(self, *args, **kwargs):
        httpresponse = kwargs.pop("httpresponse", None)

        # `message` used to be silently discarded, which left exceptions raised with
        # EuropePMCException(message="...") carrying no text at all
        message = kwargs.pop("message", None)
        if message is not None:
            args = (message,) + args

        super(EuropePMCException, self).__init__(*args)
        self.response = httpresponse

64 

65 

class EPMCFullTextException(Exception):
    """
    Error relating to an EPMC full-text document.

    :param message: the error message; becomes the first of the exception's args
    :param rawstring: the raw text associated with the error; exposed as ``self.raw``
    :param args: further positional arguments, passed on to Exception after the message
    :param kwargs: accepted but not used
    """
    def __init__(self, message, rawstring, *args, **kwargs):
        self.raw = rawstring
        super().__init__(message, *args)

70 

71 

class EuropePMC(object):
    """
    Client for the Europe PMC REST API: metadata searches (a page at a time, or as an iterator
    over all pages) and full-text retrieval.

    The API base URL is read from the EPMC_REST_API app config value for each request.
    """

    def __init__(self, logger=None):
        """
        :param logger: object with a ``log(msg)`` method; a DefaultLogger is created if None
        """
        self.logger = DefaultLogger() if logger is None else logger

    def get_by_pmcid(self, pmcid, cursor=""):
        """
        Search on the PMCID field

        :return: (results, next_cursor)
        """
        return self.field_search("PMCID", pmcid, cursor=cursor)

    def get_by_pmid(self, pmid, cursor=""):
        """
        Search on the EXT_ID field

        :return: (results, next_cursor)
        """
        return self.field_search("EXT_ID", pmid, cursor=cursor)

    def get_by_doi(self, doi, cursor=""):
        """
        Search on the DOI field

        :return: (results, next_cursor)
        """
        return self.field_search("DOI", doi, cursor=cursor)

    def title_exact(self, title, cursor=""):
        """
        Search on the TITLE field with the title exactly as given

        :return: (results, next_cursor)
        """
        return self.field_search("TITLE", title, cursor=cursor)

    def title_approximate(self, title, cursor=""):
        """
        Fuzzy search on the TITLE field, after reducing the title to keywords with to_keywords

        :return: (results, next_cursor)
        """
        nt = to_keywords(title)
        return self.field_search("TITLE", nt, fuzzy=True, cursor=cursor)

    def field_search(self, field, value, fuzzy=False, cursor="", page_size=25):
        """
        Request a single page of results for a search on one field

        :param field: name of the field to search on
        :param value: value to search for
        :param fuzzy: passed to QueryBuilder.add_string_field
        :param cursor: cursor mark of the page to request; "" for the first page
        :param page_size: number of results to request per page
        :return: (results, next_cursor)
        """
        qb = QueryBuilder()
        qb.add_string_field(field, value, fuzzy)
        return self.query(qb.to_url_query_param(), cursor=cursor, page_size=page_size)

    def field_search_iterator(self, field, value, fuzzy=False, page_size=25, throttle=None):
        """
        As field_search, but returns an iterator over the results from every page

        :param throttle: minimum number of seconds between requests, or None for no throttling
        :return: generator yielding individual results
        """
        qb = QueryBuilder()
        qb.add_string_field(field, value, fuzzy)
        return self.iterate(qb.to_url_query_param(), page_size=page_size, throttle=throttle)

    def complex_search(self, query_builder, cursor="", page_size=25):
        """
        Request a single page of results for the query held by a query builder

        :param query_builder: object providing ``to_url_query_param()``
        :return: (results, next_cursor)
        """
        return self.query(query_builder.to_url_query_param(), cursor=cursor, page_size=page_size)

    def complex_search_iterator(self, query_builder, page_size=1000, throttle=None):
        """
        As complex_search, but returns an iterator over the results from every page

        :param throttle: minimum number of seconds between requests, or None for no throttling
        :return: generator yielding individual results
        """
        return self.iterate(query_builder.to_url_query_param(), page_size=page_size, throttle=throttle)

    def _write_to_logger(self, msg):
        """Send a message to this client's logger"""
        self.logger.log(msg)

    def iterate(self, query_string, page_size=1000, throttle=None):
        """
        Generator over all the results of a query, requesting pages one at a time and following
        the cursor returned with each page.

        Stops when a page comes back with no results, or after processing a page whose next
        cursor is None or empty.

        :param query_string: the query, in the form expected by self.query
        :param page_size: number of results to request per page
        :param throttle: minimum number of seconds between requests, or None for no throttling
        :return: generator yielding individual results
        """
        cursor = ""
        last = None  # time the previous request completed; None until the first request is made
        while True:
            if last is not None and throttle is not None:
                diff = (datetime.utcnow() - last).total_seconds()
                self._write_to_logger("Last request at {x}, {y}s ago; throttle {z}s".format(x=last, y=diff, z=throttle))
                if diff < throttle:
                    waitfor = throttle - diff
                    self._write_to_logger("Throttling EPMC requests for {x}s".format(x=waitfor))
                    time.sleep(waitfor)

            # FIXME: note for the future: EPMC have also added a `nextPageUrl` field, which does the same as
            # our `url_from_query` method, and we may want to transition to using that going forward. But for now
            # the method that we use is still supported
            results, cursor = self.query(query_string, cursor=cursor, page_size=page_size)
            last = datetime.utcnow()
            if not results:
                break
            for r in results:
                yield r

            # we break on the empty cursor at the end, because we want to process any results first
            if cursor is None or cursor == "":
                break

    def url_from_query(self, query_string, cursor, page_size):
        """
        Build the search URL for one page of a query

        :param query_string: the query to place in the ``query`` URL parameter
        :param cursor: cursor mark for the page; omitted from the URL when ""
        :param page_size: value for the ``pageSize`` URL parameter
        :return: the URL as a string
        :raises EuropePMCException: if any of the three values cannot be url escaped
        """
        quoted = quote(query_string, safe="/")
        qsize = quote(str(page_size))
        qcursor = quote(str(cursor))

        if qsize is None or quoted is None or qcursor is None:
            raise EuropePMCException(None, "unable to url escape the string")

        # NOTE(review): the URL is built from the raw query_string, not `quoted`, which is only
        # used for the check above. Presumably the query builder's to_url_query_param() output is
        # already url encoded, in which case substituting `quoted` would double-encode - confirm
        # before changing.
        url = app.config.get("EPMC_REST_API") + "search?query=" + query_string
        url += "&resulttype=core&format=json&pageSize=" + qsize

        if cursor != "":
            url += "&cursorMark=" + qcursor

        return url

    def query(self, query_string, cursor="", page_size=25):
        """
        Request one page of search results

        :param query_string: the query, as passed to url_from_query
        :param cursor: cursor mark of the page to request; "" for the first page
        :param page_size: number of results to request
        :return: (results, next_cursor) - a list of models.EPMCMetadata objects, and the response's
            nextCursorMark value ("" if the response does not have one)
        :raises EuropePMCException: if there is no response, the status code is not 200, or the
            body cannot be decoded as JSON.  For a non-200 status the response is available as
            ``response`` on the exception.
        """
        url = self.url_from_query(query_string, cursor, page_size)
        self._write_to_logger("Requesting EPMC metadata from " + url)

        resp = httputil.get(url)
        if resp is None:
            # message is passed positionally so that it ends up in the exception's args
            raise EuropePMCException("could not get a response from EPMC")
        if resp.status_code != 200:
            # resp remains the positional argument (so e.args is as it always was), and is now
            # also supplied as httpresponse so that e.response is actually populated
            raise EuropePMCException(resp, httpresponse=resp)

        try:
            j = resp.json()
            check_epmc_version(j)
        except Exception as e:
            raise EuropePMCException("could not decode JSON from EPMC response") from e

        results = [models.EPMCMetadata(r) for r in j.get("resultList", {}).get("result", [])]
        next_cursor_mark = j.get("nextCursorMark", "")
        return results, next_cursor_mark

    def fulltext(self, pmcid):
        """
        Retrieve the full-text XML for an article

        :param pmcid: identifier of the article; appended directly to the API base URL
        :return: EPMCFullText object constructed from the response text
        :raises EuropePMCException: if there is no response or the status code is not 200.  For a
            non-200 status the response is available as ``response`` on the exception.
        """
        url = app.config.get("EPMC_REST_API") + pmcid + "/fullTextXML"
        self._write_to_logger("Searching for Fulltext at " + url)
        resp = httputil.get(url)
        if resp is None:
            # message is passed positionally so that it ends up in the exception's args
            raise EuropePMCException("could not get a response for fulltext from EPMC")
        if resp.status_code != 200:
            # see query(): keep e.args unchanged while also populating e.response
            raise EuropePMCException(resp, httpresponse=resp)
        return EPMCFullText(resp.text)

190 

191 

class EPMCFullText(models.JATS):
    """
    Thin subclass of models.JATS, kept for backwards compatibility - don't add any methods here
    """
196 pass