Coverage for portality/api/current/discovery.py: 97%

209 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-07-22 15:59 +0100

1# ~~APISearch:Feature->API:Feature~~ 

2from portality.api.common import Api 

3from portality import util 

4from portality.core import app 

5from portality.lib import dates 

6from portality import models 

7import esprit 

8import re, json, uuid, os 

9from copy import deepcopy 

10from flask import url_for 

11from portality.ui.messages import Messages 

12from portality.bll.doaj import DOAJ 

13 

14 

class DiscoveryException(Exception):
    """Error raised by the discovery API when a search request cannot be served."""

17 

18 

class SearchResult(object):
    """Simple holder for the payload of a discovery search response."""

    def __init__(self, raw=None):
        """
        :param raw: the response payload to wrap; when None a new empty dict is used
        """
        if raw is None:
            raw = {}
        self.data = raw

22 

23 

def query_substitute(query, substitutions):
    """
    Replace field names in a query string according to a substitution table.

    A field name is the run of characters (excluding whitespace, ``+``, ``-``,
    brackets and double quotes) that sits directly before an unescaped colon.
    Colons preceded by a backslash are not treated as field separators.

    :param query: the query string to rewrite
    :param substitutions: dict mapping a field name to the name that should
        replace it; colons inside the keys are written unescaped
    :return: the query string with every matching field name replaced; the
        query is returned untouched if there are no substitutions
    """
    if not substitutions:
        return query

    # A colon that is part of a field name appears escaped ("\:") in the query,
    # so escape the colons in the keys to make them directly comparable
    escsubs = {k.replace(":", "\\:"): v for k, v in substitutions.items()}

    def rep(match):
        # swap in the substitution if there is one, otherwise keep the field as-is
        field = match.group(1)
        return escsubs.get(field, field)

    # Raw strings: the patterns are exactly those used before, without relying
    # on Python passing unknown escapes (\s, \+, \( ...) through unchanged
    split_rx = r"([^\\]:)"
    field_rx = r'([^\s\+\-\(\)"]+?):$'

    # split the query around any unescaped colons
    bits = re.split(split_rx, query)

    # Stitch each text section back onto the separator that followed it.
    # Splitting on a pattern with one capturing group always gives an odd
    # number of items (text, sep, text, ..., text), so the final item is the
    # trailing text with no separator after it.
    segs = [bits[i] + bits[i + 1] for i in range(0, len(bits) - 1, 2)]
    segs.append(bits[-1])

    # The substitution consumes the trailing colon of each segment; the join
    # below puts the colons back between the segments
    subs = [re.sub(field_rx, rep, seg) if seg.endswith(":") else seg for seg in segs]

    return ":".join(subs)

62 

63 

def allowed(query, wildcards=False, fuzzy=False):
    """
    Check a query string for operators that the caller has not permitted.

    :param query: the query string to inspect
    :param wildcards: if False, an unescaped ``?`` or ``*`` makes the query disallowed
    :param fuzzy: if False, an unescaped ``~`` (fuzzy or proximity search) makes
        the query disallowed
    :return: True if the query may be run, False otherwise
    """
    # NOTE(review): both patterns need at least two characters in front of the
    # operator, so an operator at the very start of the query, or after a
    # single character (e.g. "a*"), is not detected. Behaviour kept as-is;
    # confirm whether that is intended.
    if not wildcards:
        rx = r"(.+[^\\][\?\*]+.*)"
        if re.search(rx, query):
            return False

    if not fuzzy:
        # this covers both fuzzy searching and proximity searching
        rx = r"(.+[^\\]~[0-9]{0,1}[\.]{0,1}[0-9]{0,1})"
        if re.search(rx, query):
            return False

    return True

77 

78 

def escape(query):
    """
    Backslash-escape the forward slashes and the non-field colons in a query.

    * Every ``/`` that is not already preceded by a backslash becomes ``\\/``,
      including one at the very start of the query and runs such as ``//``.
    * Every ``:`` is escaped unless it is taken to follow a field name (the
      first colon in the query, and any colon whose preceding section
      contains `` AND `` or `` OR ``) or it is already escaped.

    :param query: the query string to escape
    :return: the escaped query string
    """
    # The lookbehind is zero-width, so it does not consume the character before
    # the slash. That lets a single pass handle neighbouring slashes and a
    # leading slash, which a two-character pattern such as "[^\\]/" cannot do.
    query = re.sub(r"(?<!\\)/", r"\\/", query)

    # Splitting on ":" leaves each section followed by a colon (apart from the
    # last). Section 0 is followed by the first colon, which is assumed to
    # follow a field name, so it is never escaped.
    parts = query.split(":")
    for i in range(1, len(parts) - 1):
        part = parts[i]
        already_escaped = part.endswith("\\")
        starts_new_clause = " AND " in part or " OR " in part
        if not already_escaped and not starts_new_clause:
            # the backslash lands directly in front of the colon restored by the join
            parts[i] = part + "\\"

    return ":".join(parts)

111 

112 

113# ~~->Swagger:Feature~~ 

114# ~~->API:Documentation~~ 

# Swagger templates for the three discovery endpoints, keyed by record type.
# Each one is read from its JSON file under <BASE_FILE_PATH>/api/current when
# this module is imported.
DISCOVERY_API_SWAG = {
    record_type: json.loads(
        util.load_file(os.path.join(app.config['BASE_FILE_PATH'], 'api', 'current', swag_file))
    )
    for record_type, swag_file in (
        ('application', 'discovery_api_application_swag.json'),
        ('journal', 'discovery_api_journal_swag.json'),
        ('article', 'discovery_api_article_swag.json'),
    )
}

# Configured maximum page size, held as a string so it can be spliced into the
# swagger descriptions
max_page_size = str(app.config.get("DISCOVERY_MAX_PAGE_SIZE", 100))

121 

122 

class DiscoveryApi(Api):
    """
    Search API over the article, journal and application indexes.

    ``search`` returns one page of results wrapped in a ``SearchResult``;
    ``scroll`` is a generator over every result of a query. The remaining
    methods are helpers which sanitise the request parameters, build the query
    body and assemble the response.
    """

    @staticmethod
    def _swag_with_page_limit(record_type, param_index):
        """
        Build the swagger definition for one record type.

        The module-level template is copied before it is modified. Editing the
        template in place would append the page size sentence again on every
        call, so the description would grow for as long as the process runs.

        :param record_type: key into DISCOVERY_API_SWAG
        :param param_index: index, in the template's "parameters" list, of the
            parameter whose description gets the page size limit appended
        :return: a new dict which the caller is free to modify
        """
        swag = deepcopy(DISCOVERY_API_SWAG[record_type])
        param = swag["parameters"][param_index]
        param["description"] = ''.join([param["description"], " The page size limit is ", max_page_size])
        return swag

    @staticmethod
    def get_application_swag():
        """Return the swagger definition for the application search endpoint."""
        return DiscoveryApi._swag_with_page_limit('application', 3)

    @staticmethod
    def get_journal_swag():
        """Return the swagger definition for the journal search endpoint."""
        return DiscoveryApi._swag_with_page_limit('journal', 2)

    @staticmethod
    def get_article_swag():
        """Return the swagger definition for the article search endpoint."""
        return DiscoveryApi._swag_with_page_limit('article', 2)

    @classmethod
    def _sanitise(cls, q, page, page_size, sort, search_subs, sort_subs, bulk):
        """
        Validate and normalise the raw request parameters.

        :param q: the query string, or None for no query
        :param page: requested page number; values below 1 become 1
        :param page_size: requested page size; capped at the configured maximum,
            and values below 1 become 10
        :param sort: None, "field" or "field:direction" (direction is "asc" or "desc")
        :param search_subs: field name substitutions to apply to the query string
        :param sort_subs: field name substitutions to apply to the sort field
        :param bulk: if True the cap is DISCOVERY_BULK_PAGE_SIZE rather than
            DISCOVERY_MAX_PAGE_SIZE
        :return: tuple (q, page, fro, page_size, sortby, sortdir), where fro is
            the offset of the first record of the page
        :raises DiscoveryException: if the query uses disallowed features, the
            offset is at or beyond DISCOVERY_MAX_RECORDS_SIZE, or the sort
            parameter is malformed
        """
        if q is not None:
            if not allowed(q):
                raise DiscoveryException("Query contains disallowed Lucene features")

            q = query_substitute(q, search_subs)
            q = escape(q)

        # sanitise the page size information
        if page < 1:
            page = 1

        # named so as not to shadow the module-level max_page_size string
        if bulk:
            page_size_limit = app.config.get("DISCOVERY_BULK_PAGE_SIZE", 1000)
        else:
            page_size_limit = app.config.get("DISCOVERY_MAX_PAGE_SIZE", 100)

        if page_size > page_size_limit:
            page_size = page_size_limit
        elif page_size < 1:
            page_size = 10

        # calculate the position of the from cursor in the document set
        fro = (page - 1) * page_size

        # Refuse to page beyond the configured maximum number of records.
        # scroll() always passes page 1, so it never trips this check.
        max_records = app.config.get("DISCOVERY_MAX_RECORDS_SIZE", 1000)
        if fro >= max_records:
            message = Messages.PREVENT_DEEP_PAGING_IN_API.format(
                max_records=max_records,
                data_dump_url=app.config.get("BASE_URL") + url_for("doaj.public_data_dump"),
                oai_journal_url=app.config.get("BASE_URL") + url_for("oaipmh.oaipmh"),
                oai_article_url=app.config.get("BASE_URL") + url_for("oaipmh.oaipmh", specified="article")
            )
            raise DiscoveryException(message)

        # interpret the sort field into the form required by the query
        sortby = None
        sortdir = None
        if sort is not None:
            if ":" in sort:
                bits = sort.split(":")
                if len(bits) != 2:
                    raise DiscoveryException("Malformed sort parameter")

                sortby, direction = bits
                if direction not in ["asc", "desc"]:
                    raise DiscoveryException("Sort direction must be 'asc' or 'desc'")
                sortdir = direction
            else:
                sortby = sort

            if sortby in sort_subs:
                sortby = sort_subs[sortby]

        return q, page, fro, page_size, sortby, sortdir

    @classmethod
    def _make_query(cls, q, page, page_size, sort, index_type, bulk):
        """
        Build the query body for a request.

        :param index_type: 'article' or 'journal'; any other value selects the
            application substitution tables
        :param bulk: passed through to _sanitise to select the page size cap
        :return: tuple (raw_query, page, page_size) where raw_query is the dict
            produced by SearchQuery.query() and page / page_size are the
            sanitised values
        :raises DiscoveryException: propagated from _sanitise
        """
        if index_type == 'article':
            search_subs = app.config.get("DISCOVERY_ARTICLE_SEARCH_SUBS", {})
            sort_subs = app.config.get("DISCOVERY_ARTICLE_SORT_SUBS", {})
        elif index_type == 'journal':
            search_subs = app.config.get("DISCOVERY_JOURNAL_SEARCH_SUBS", {})
            sort_subs = app.config.get("DISCOVERY_JOURNAL_SORT_SUBS", {})
        else:
            search_subs = app.config.get("DISCOVERY_APPLICATION_SEARCH_SUBS", {})
            sort_subs = app.config.get("DISCOVERY_APPLICATION_SORT_SUBS", {})

        # sanitise and prep the inputs
        q, page, fro, page_size, sortby, sortdir = cls._sanitise(q, page, page_size, sort, search_subs, sort_subs, bulk)

        search_query = SearchQuery(q, fro, page_size, sortby, sortdir)
        raw_query = search_query.query()
        return raw_query, page, page_size

    @staticmethod
    def _calc_pagination(total, page_size, requested_page):
        """
        Calculate pagination for API results like # of pages and the last page.

        Modified from https://github.com/Pylons/paginate/blob/master/paginate/__init__.py#L260 ,
        a pagination library. (__init__.py, Page.__init__)

        :param total: total number of matching records
        :param page_size: number of records per page
        :param requested_page: the page being returned
        :return: tuple (page_count, previous_page, next_page, last_page);
            previous_page / next_page are None when there is no such page.
            An empty result set is reported as a single page.
        """
        FIRST_PAGE = 1

        if total == 0:
            return 1, None, None, 1

        # ceiling division: a partially filled final page still counts as a page
        page_count = ((total - 1) // page_size) + 1
        last_page = FIRST_PAGE + page_count - 1

        # Links to previous and next page
        previous_page = requested_page - 1 if requested_page > FIRST_PAGE else None
        next_page = requested_page + 1 if requested_page < last_page else None

        return page_count, previous_page, next_page, last_page

    @classmethod
    def _make_response(cls, endpoint, res, q, page, page_size, sort, obs):
        """
        Assemble the API response for one page of results.

        :param endpoint: name of the API view, within the current API blueprint,
            used to build the pagination links
        :param res: the raw search response; the total is read from
            res["hits"]["total"]["value"], defaulting to 0
        :param q: the query string as supplied by the caller
        :param page: the page number being returned
        :param page_size: the page size used
        :param sort: the sort parameter as supplied by the caller, or None
        :param obs: the list of result objects for this page
        :return: SearchResult wrapping the response dict
        """
        total = res.get("hits", {}).get("total", {}).get('value', 0)

        page_count, previous_page, next_page, last_page = cls._calc_pagination(total, page_size, page)

        # build the response object
        result = {
            "total": total,
            "page": page,
            "pageSize": page_size,
            "timestamp": dates.now_with_microseconds(),
            "query": q,
            "results": obs
        }

        def page_url(page_number):
            # absolute URL of the same search at a different page
            route = app.config['API_CURRENT_BLUEPRINT_NAME'] + '.' + endpoint
            return app.config['BASE_URL'] + url_for(route, search_query=q, page=page_number, pageSize=page_size, sort=sort)

        if previous_page is not None:
            result["prev"] = page_url(previous_page)

        if next_page is not None:
            result["next"] = page_url(next_page)

        if last_page is not None:
            result["last"] = page_url(last_page)

        if sort is not None:
            result["sort"] = sort

        return SearchResult(result)

    @classmethod
    def search(cls, index_type, account, q, page, page_size, sort=None):
        """
        Run a paged search and return one page of results.

        :param index_type: 'article', 'journal' or 'application'
        :param account: passed through to the query service
        :param q: the query string, or None
        :param page: the page number to return
        :param page_size: the number of results per page
        :param sort: optional sort parameter, "field" or "field:direction"
        :return: SearchResult for the requested page
        :raises DiscoveryException: for an unknown index_type, invalid
            parameters, or any failure while executing the query. In the last
            case the underlying error is logged with a reference id and only
            the reference id is given in the exception message.
        """
        if index_type not in ['article', 'journal', 'application']:
            raise DiscoveryException("There was an error executing your query for {0}. Unknown type.)".format(index_type))

        if index_type == 'article':
            endpoint = 'search_articles'
            klass = models.Article  # ~~->Article:Model~~
        elif index_type == 'journal':
            endpoint = 'search_journals'
            klass = models.Journal  # ~~->Journal:Model~~
        else:
            endpoint = 'search_applications'
            klass = models.Suggestion  # ~~->Application:Model~~

        raw_query, page, page_size = cls._make_query(q, page, page_size, sort, index_type, False)

        # execute the query against the requested index
        # ~~->Query:Service~~
        query_service = DOAJ.queryService()
        try:
            res = query_service.search('api_query', index_type, raw_query, account, None)
        except Exception as e:
            # the reference id ties the generic message given to the caller to the log entry
            magic = uuid.uuid1()
            msg = e.error if hasattr(e, "error") else e.message if hasattr(e, "message") else str(e)
            app.logger.error(u"Error executing discovery query search for {i}: {x} (ref: {y})".format(i=index_type, x=msg, y=magic))
            raise DiscoveryException("There was an error executing your query (ref: {y})".format(y=magic))

        # presumably unpack_json_result yields one raw record per hit - confirm against esprit
        obs = [klass(**raw) for raw in esprit.raw.unpack_json_result(res)]
        return cls._make_response(endpoint, res, q, page, page_size, sort, obs)

    @classmethod
    def scroll(cls, index_type, account, q, page_size, sort=None, scan=False):
        """
        Iterate over all the results of a query.

        This is a generator, so nothing (including the validation of
        index_type) runs until the first item is requested.

        :param index_type: 'article', 'journal' or 'application'
        :param account: passed through to the query service
        :param q: the query string, or None
        :param page_size: batch size, capped at the bulk page size limit
        :param sort: optional sort parameter, "field" or "field:direction"
        :param scan: passed through to the query service's scroll
        :return: generator of the items yielded by the query service
        :raises DiscoveryException: for an unknown index_type or invalid parameters
        """
        if index_type not in ['article', 'journal', 'application']:
            raise DiscoveryException("There was an error executing your query for {0}. Unknown type.)".format(index_type))

        page = 1  # Not used in scroll
        raw_query, page, page_size = cls._make_query(q, page, page_size, sort, index_type, True)

        # execute the query against the requested index
        query_service = DOAJ.queryService()
        for result in query_service.scroll('api_query', index_type, raw_query, account, page_size, scan=scan):
            yield result

330 

331 

class SearchQuery(object):
    """
    Builder for the query body sent to the search index.

    ~~->Search:Query~~
    ~~Search:Query->Elasticsearch:Technology~~
    """
    def __init__(self, qs, fro, psize, sortby=None, sortdir=None):
        """
        :param qs: the query string, or None to match every document
        :param fro: offset of the first result to return
        :param psize: number of results to return
        :param sortby: field to sort on, or None for no explicit sort
        :param sortdir: sort direction; "asc" is used when None
        """
        if sortdir is None:
            sortdir = "asc"

        self.qs = qs
        self.fro = fro
        self.psize = psize
        self.sortby = sortby
        self.sortdir = sortdir

    def query(self):
        """Return the query body as a dict."""
        if self.qs is None:
            clause = {"match_all": {}}
        else:
            clause = {
                "query_string": {
                    "query": self.qs,
                    "default_operator": "AND"
                }
            }

        body = {
            "track_total_hits" : True,
            "from": self.fro,
            "size": self.psize,
            "query": clause
        }

        if self.sortby is not None:
            ordering = {"order": self.sortdir, "mode": "min"}
            body["sort"] = [{self.sortby: ordering}]

        return body

362 

363 return q