Coverage for portality/api/current/discovery.py: 97%
209 statements
« prev ^ index » next coverage.py v6.4.2, created at 2022-07-22 15:59 +0100
« prev ^ index » next coverage.py v6.4.2, created at 2022-07-22 15:59 +0100
1# ~~APISearch:Feature->API:Feature~~
2from portality.api.common import Api
3from portality import util
4from portality.core import app
5from portality.lib import dates
6from portality import models
7import esprit
8import re, json, uuid, os
9from copy import deepcopy
10from flask import url_for
11from portality.ui.messages import Messages
12from portality.bll.doaj import DOAJ
class DiscoveryException(Exception):
    """Raised when a discovery API request cannot be sanitised or executed."""
class SearchResult(object):
    """Container exposing a search response body through its ``data`` attribute."""

    def __init__(self, raw=None):
        # a fresh dict per instance when nothing is supplied
        if raw is None:
            raw = {}
        self.data = raw
def query_substitute(query, substitutions):
    """
    Rewrite field names in a Lucene-style query string.

    The query is split around every colon that is not preceded by a backslash.
    The token immediately before each such colon is treated as a field name
    and, if it is a key of ``substitutions``, replaced by the mapped value.

    :param query: the query string to rewrite
    :param substitutions: dict of field name -> replacement field name; keys
        containing ":" are matched against their escaped form ("\\:") in the query
    :return: the query with substitutions applied; returned unchanged when
        there are no substitutions
    """
    if not substitutions:
        return query

    # a literal colon inside a field name appears in the query as "\:", so the
    # keys are escaped the same way before comparing
    escsubs = {k.replace(":", "\\:"): v for k, v in substitutions.items()}

    def rep(match):
        # swap the field name if we have a substitution for it, and put back
        # the colon that the pattern consumed
        field = match.group(1)
        return escsubs.get(field, field) + ":"

    # an unescaped colon, captured together with the character before it
    split_rx = r"([^\\]:)"
    # the run of non-whitespace, non-operator characters directly before a
    # trailing colon, i.e. the field name
    field_rx = r'([^\s\+\-\(\)"]+?):$'

    # with one capturing group re.split yields text, sep, text, ..., text
    bits = re.split(split_rx, query)

    # stitch each text chunk back onto the separator that followed it, so that
    # every segment except the last ends with its colon
    segs = [bits[i] + bits[i + 1] for i in range(0, len(bits) - 1, 2)]
    segs.append(bits[-1])

    # Every segment keeps its own colon, so the pieces are simply concatenated.
    # (Joining on ":" would double the colon of any segment where no field
    # name could be extracted, e.g. "foo :bar".)
    return "".join(
        re.sub(field_rx, rep, seg) if seg.endswith(":") else seg
        for seg in segs
    )
def allowed(query, wildcards=False, fuzzy=False):
    """
    Check a query string for Lucene features that are switched off.

    :param query: the query string to inspect
    :param wildcards: if False, reject queries containing an unescaped "?" or "*"
    :param fuzzy: if False, reject queries containing an unescaped "~"
    :return: True if the query uses no disallowed feature, False otherwise
    """
    # NOTE(review): both patterns need at least two characters in front of the
    # operator (".+" followed by one non-backslash character), so an operator
    # in the first or second position (e.g. "*" or "a*") is not caught.
    # Confirm whether that is intended before tightening.
    if not wildcards:
        wildcard_rx = r"(.+[^\\][\?\*]+.*)"
        if re.search(wildcard_rx, query):
            return False

    if not fuzzy:
        # this covers both fuzzy searching and proximity searching
        fuzzy_rx = r"(.+[^\\]~[0-9]{0,1}[\.]{0,1}[0-9]{0,1})"
        if re.search(fuzzy_rx, query):
            return False

    return True
def escape(query):
    """
    Escape forward slashes and non-field colons in a query string.

    Every "/" that is not already preceded by a backslash becomes "\\/",
    including one at the very start of the query. Colons are then escaped as
    "\\:" unless they are taken to follow a field name: the first colon, and
    the first colon after a segment containing " AND " or " OR ".

    :param query: the query string to escape
    :return: the escaped query string
    """
    # The lookbehind consumes no characters, so neighbouring slashes ("//")
    # and a slash in first position are all handled in a single pass.
    # NOTE: as before, a slash preceded by an escaped backslash ("\\/") is
    # treated as already escaped.
    query = re.sub(r"(?<!\\)/", lambda m: "\\/", query)

    # We need to escape every colon that is not after a keyword and is not
    # already escaped. Appending a backslash to the piece in front of a colon
    # escapes that colon once the pieces are joined back together; the first
    # piece (the field name before the first colon) is never touched.
    parts = query.split(":")
    for i in range(1, len(parts) - 1):
        part = parts[i]
        starts_new_clause = " AND " in part or " OR " in part
        if not part.endswith("\\") and not starts_new_clause:
            parts[i] = part + "\\"
    return ":".join(parts)
def _load_swag(filename):
    """Read and JSON-decode one swagger template from <BASE_FILE_PATH>/api/current."""
    swag_path = os.path.join(app.config['BASE_FILE_PATH'], 'api', 'current', filename)
    return json.loads(util.load_file(swag_path))


# ~~->Swagger:Feature~~
# ~~->API:Documentation~~
# Swagger templates for the three discovery endpoints, loaded once at import time
DISCOVERY_API_SWAG = {
    'application': _load_swag('discovery_api_application_swag.json'),
    'journal': _load_swag('discovery_api_journal_swag.json'),
    'article': _load_swag('discovery_api_article_swag.json')
}
# kept as a string because it is spliced into the swagger descriptions
max_page_size = str(app.config.get("DISCOVERY_MAX_PAGE_SIZE", 100))
class DiscoveryApi(Api):
    """
    Discovery (search) API over articles, journals and applications.

    Provides the swagger documentation for each endpoint, and the ``search``
    and ``scroll`` entry points which sanitise the incoming parameters, build
    the query and hand it to the query service.
    """

    # the index types that search() and scroll() accept
    _INDEX_TYPES = ('article', 'journal', 'application')

    @staticmethod
    def _swag_with_page_limit(index_type, param_index):
        """
        Return a private copy of a swagger template with the page size limit
        appended to the description of one of its parameters.

        The module-level template is copied *before* it is annotated, so that
        repeated calls do not keep appending the same sentence to the shared
        template.

        :param index_type: key into DISCOVERY_API_SWAG
        :param param_index: position of the parameter whose description is extended
        :return: a deep copy of the template, safe for the caller to modify
        """
        swag = deepcopy(DISCOVERY_API_SWAG[index_type])
        param = swag["parameters"][param_index]
        param["description"] = ''.join([param["description"], " The page size limit is ", max_page_size])
        return swag

    @staticmethod
    def get_application_swag():
        """Swagger documentation for the application search endpoint."""
        return DiscoveryApi._swag_with_page_limit('application', 3)

    @staticmethod
    def get_journal_swag():
        """Swagger documentation for the journal search endpoint."""
        return DiscoveryApi._swag_with_page_limit('journal', 2)

    @staticmethod
    def get_article_swag():
        """Swagger documentation for the article search endpoint."""
        return DiscoveryApi._swag_with_page_limit('article', 2)

    @classmethod
    def _sanitise(cls, q, page, page_size, sort, search_subs, sort_subs, bulk):
        """
        Validate and normalise the raw request parameters.

        :param q: query string, or None for no query
        :param page: requested page number; values below 1 become 1
        :param page_size: requested page size; capped at the configured
            maximum, and set to 10 if below 1
        :param sort: "field" or "field:asc" / "field:desc", or None
        :param search_subs: field name substitutions applied to the query
        :param sort_subs: field name substitutions applied to the sort field
        :param bulk: if True the bulk page size limit is used instead of the
            normal one
        :return: tuple (q, page, fro, page_size, sortby, sortdir)
        :raises DiscoveryException: on disallowed query features, a "from"
            position at or beyond the maximum number of records, or a
            malformed sort parameter
        """
        if q is not None:
            if not allowed(q):
                raise DiscoveryException("Query contains disallowed Lucene features")

            q = query_substitute(q, search_subs)
            q = escape(q)

        # sanitise the page size information
        if page < 1:
            page = 1

        if bulk:
            page_size_limit = app.config.get("DISCOVERY_BULK_PAGE_SIZE", 1000)
        else:
            page_size_limit = app.config.get("DISCOVERY_MAX_PAGE_SIZE", 100)
        if page_size > page_size_limit:
            page_size = page_size_limit
        elif page_size < 1:
            page_size = 10

        # calculate the position of the from cursor in the document set
        fro = (page - 1) * page_size

        # Refuse to page deeper than the configured number of records. This
        # applies whether or not bulk is set; scroll() always asks for page 1,
        # so its from position is 0.
        max_records = app.config.get("DISCOVERY_MAX_RECORDS_SIZE", 1000)
        if fro >= max_records:
            base_url = app.config.get("BASE_URL")
            message = Messages.PREVENT_DEEP_PAGING_IN_API.format(
                max_records=max_records,
                data_dump_url=base_url + url_for("doaj.public_data_dump"),
                oai_journal_url=base_url + url_for("oaipmh.oaipmh"),
                oai_article_url=base_url + url_for("oaipmh.oaipmh", specified="article")
            )
            raise DiscoveryException(message)

        # interpret the sort field into the form required by the query
        sortby = None
        sortdir = None
        if sort is not None:
            bits = sort.split(":")
            if len(bits) > 2:
                raise DiscoveryException("Malformed sort parameter")
            if len(bits) == 2:
                if bits[1] not in ("asc", "desc"):
                    raise DiscoveryException("Sort direction must be 'asc' or 'desc'")
                sortdir = bits[1]
            sortby = sort_subs.get(bits[0], bits[0])

        return q, page, fro, page_size, sortby, sortdir

    @classmethod
    def _make_query(cls, q, page, page_size, sort, index_type, bulk):
        """
        Build the raw query for the given index type.

        :return: tuple (raw_query, page, page_size), where page and page_size
            are the sanitised values
        :raises DiscoveryException: if the parameters fail sanitisation
        """
        # config keys holding the (search, sort) substitutions per index type;
        # anything that is not an article or journal uses the application ones
        subs_keys = {
            'article': ("DISCOVERY_ARTICLE_SEARCH_SUBS", "DISCOVERY_ARTICLE_SORT_SUBS"),
            'journal': ("DISCOVERY_JOURNAL_SEARCH_SUBS", "DISCOVERY_JOURNAL_SORT_SUBS"),
        }
        search_key, sort_key = subs_keys.get(
            index_type,
            ("DISCOVERY_APPLICATION_SEARCH_SUBS", "DISCOVERY_APPLICATION_SORT_SUBS")
        )
        search_subs = app.config.get(search_key, {})
        sort_subs = app.config.get(sort_key, {})

        # sanitise and prep the inputs
        q, page, fro, page_size, sortby, sortdir = cls._sanitise(q, page, page_size, sort, search_subs, sort_subs, bulk)

        search_query = SearchQuery(q, fro, page_size, sortby, sortdir)
        raw_query = search_query.query()
        return raw_query, page, page_size

    @staticmethod
    def _calc_pagination(total, page_size, requested_page):
        """
        Calculate pagination for API results like # of pages and the last page.

        Modified from https://github.com/Pylons/paginate/blob/master/paginate/__init__.py#L260 ,
        a pagination library. (__init__.py, Page.__init__)

        :param total: total number of hits
        :param page_size: number of results per page
        :param requested_page: the page being returned
        :return: tuple (page_count, previous_page, next_page, last_page);
            previous_page / next_page are None when there is no such page
        """
        FIRST_PAGE = 1

        # an empty result set is reported as a single page with no neighbours
        if total == 0:
            return 1, None, None, 1

        page_count = ((total - 1) // page_size) + 1
        last_page = FIRST_PAGE + page_count - 1

        # Links to previous and next page
        previous_page = requested_page - 1 if requested_page > FIRST_PAGE else None
        next_page = requested_page + 1 if requested_page < last_page else None

        return page_count, previous_page, next_page, last_page

    @classmethod
    def _make_response(cls, endpoint, res, q, page, page_size, sort, obs):
        """
        Wrap the results in the response envelope.

        :param endpoint: name of the view (within the current API blueprint)
            used to build the prev / next / last links
        :param res: the raw result; the total is read from hits.total.value
        :param q: the query string as supplied by the caller
        :param page: the page being returned
        :param page_size: the page size used
        :param sort: the sort parameter as supplied by the caller, or None
        :param obs: the result objects for this page
        :return: a SearchResult
        """
        total = res.get("hits", {}).get("total", {}).get('value', 0)

        page_count, previous_page, next_page, last_page = cls._calc_pagination(total, page_size, page)

        def page_url(page_number):
            # absolute URL of the same search at a different page
            route = app.config['API_CURRENT_BLUEPRINT_NAME'] + '.' + endpoint
            return app.config['BASE_URL'] + url_for(route, search_query=q, page=page_number, pageSize=page_size, sort=sort)

        # build the response object
        result = {
            "total": total,
            "page": page,
            "pageSize": page_size,
            "timestamp": dates.now_with_microseconds(),
            "query": q,
            "results": obs
        }

        if previous_page is not None:
            result["prev"] = page_url(previous_page)

        if next_page is not None:
            result["next"] = page_url(next_page)

        if last_page is not None:
            result["last"] = page_url(last_page)

        if sort is not None:
            result["sort"] = sort

        return SearchResult(result)

    @classmethod
    def _check_index_type(cls, index_type):
        """Raise DiscoveryException unless index_type is one we can search."""
        if index_type not in cls._INDEX_TYPES:
            raise DiscoveryException("There was an error executing your query for {0}. Unknown type.)".format(index_type))

    @classmethod
    def search(cls, index_type, account, q, page, page_size, sort=None):
        """
        Run a paged search and return one page of results.

        :param index_type: 'article', 'journal' or 'application'
        :param account: passed through to the query service
        :param q: query string, or None
        :param page: requested page number
        :param page_size: requested page size
        :param sort: optional sort parameter, "field" or "field:asc|desc"
        :return: a SearchResult
        :raises DiscoveryException: for an unknown index type, parameters that
            fail sanitisation, or any error raised by the query service
        """
        cls._check_index_type(index_type)

        if index_type == 'article':
            endpoint = 'search_articles'
            klass = models.Article  # ~~->Article:Model~~
        elif index_type == 'journal':
            endpoint = 'search_journals'
            klass = models.Journal  # ~~->Journal:Model~~
        else:
            endpoint = 'search_applications'
            klass = models.Suggestion  # ~~->Application:Model~~

        raw_query, page, page_size = cls._make_query(q, page, page_size, sort, index_type, False)

        # execute the query against the requested index type
        # ~~->Query:Service~~
        query_service = DOAJ.queryService()
        try:
            res = query_service.search('api_query', index_type, raw_query, account, None)
        except Exception as e:
            # log the detail against a reference, and give the caller only the
            # reference
            magic = uuid.uuid1()
            msg = e.error if hasattr(e, "error") else e.message if hasattr(e, "message") else str(e)
            app.logger.error(u"Error executing discovery query search for {i}: {x} (ref: {y})".format(i=index_type, x=msg, y=magic))
            raise DiscoveryException("There was an error executing your query (ref: {y})".format(y=magic)) from e

        obs = [klass(**raw) for raw in esprit.raw.unpack_json_result(res)]
        return cls._make_response(endpoint, res, q, page, page_size, sort, obs)

    @classmethod
    def scroll(cls, index_type, account, q, page_size, sort=None, scan=False):
        """
        Generator yielding every result the query service's scroll produces.

        Because this is a generator, validation errors are raised when
        iteration starts rather than when scroll() is called.

        :param index_type: 'article', 'journal' or 'application'
        :param account: passed through to the query service
        :param q: query string, or None
        :param page_size: requested page size, capped at the bulk limit
        :param sort: optional sort parameter, "field" or "field:asc|desc"
        :param scan: passed through to the query service's scroll
        :raises DiscoveryException: for an unknown index type or parameters
            that fail sanitisation
        """
        cls._check_index_type(index_type)

        page = 1  # Not used in scroll
        raw_query, page, page_size = cls._make_query(q, page, page_size, sort, index_type, True)

        # execute the query against the requested index type
        query_service = DOAJ.queryService()
        yield from query_service.scroll('api_query', index_type, raw_query, account, page_size, scan=scan)
class SearchQuery(object):
    """
    ~~->Search:Query~~
    ~~Search:Query->Elasticsearch:Technology~~

    Builds the query body for a discovery search from a query string,
    a paging window and an optional sort.
    """
    def __init__(self, qs, fro, psize, sortby=None, sortdir=None):
        self.qs = qs
        self.fro = fro
        self.psize = psize
        self.sortby = sortby
        # ascending unless told otherwise
        self.sortdir = "asc" if sortdir is None else sortdir

    def query(self):
        """Return the query body as a dict."""
        if self.qs is None:
            # no query string: match everything
            criteria = {"match_all": {}}
        else:
            criteria = {
                "query_string": {
                    "query": self.qs,
                    "default_operator": "AND"
                }
            }

        body = {
            "track_total_hits": True,
            "from": self.fro,
            "size": self.psize,
            "query": criteria
        }

        if self.sortby is not None:
            body["sort"] = [{self.sortby: {"order": self.sortdir, "mode": "min"}}]

        return body