Coverage for portality/api/current/discovery.py: 97%

1# ~~APISearch:Feature->API:Feature~~

2from portality.api.common import Api

3from portality import util

4from portality.core import app

5from portality.lib import dates

6from portality import models

7import esprit

8import re, json, uuid, os

9from copy import deepcopy

10from flask import url_for

11from portality.ui.messages import Messages

12from portality.bll.doaj import DOAJ

class DiscoveryException(Exception):
    """Raised when a discovery (search/scroll) request is invalid or cannot be executed."""

class SearchResult(object):
    """Thin wrapper holding the raw search response dictionary in ``data``."""

    def __init__(self, raw=None):
        # Only None triggers the default; any other value (even an empty dict)
        # is stored as given, without copying.
        if raw is None:
            raw = {}
        self.data = raw

def query_substitute(query, substitutions):
    """
    Rewrite field names in a Lucene-style query string.

    The query is split around every colon that is not preceded by a backslash.
    The token immediately before each such colon is treated as a field name and,
    if it is a key of ``substitutions``, is replaced by the mapped value.

    :param query: the query string to rewrite
    :param substitutions: dict mapping field names as they appear in the query to
        the names that should replace them.  A colon inside a key is matched
        against its backslash-escaped form (``\\:``) in the query.
    :return: the query with field names substituted; the query unchanged when
        there are no substitutions
    """
    if not substitutions:
        return query

    # apply the regex escapes to the substitutions, so we know they
    # are ready to be matched
    escsubs = {k.replace(":", "\\:"): v for k, v in substitutions.items()}

    # returns the substitution for the matched field if there is one,
    # otherwise the field name as it was
    def rep(match):
        field = match.group(1)
        return escsubs.get(field, field)

    # the regular expressions for splitting and then extracting the field to
    # be substituted (raw strings, so the backslashes reach the regex engine
    # without relying on Python passing unknown string escapes through)
    split_rx = r"([^\\]:)"
    field_rx = r'([^\s\+\-"]+?):$'

    # split the query around any unescaped colons
    bits = re.split(split_rx, query)

    # stitch back together the split sections and the separators.  re.split
    # with a single capture group always yields text, separator, text, ... so
    # the list has odd length and its final element is the trailing text.
    segs = [bits[i] + bits[i + 1] for i in range(0, len(bits) - 1, 2)] + [bits[-1]]

    # substitute the fields as required; field_rx consumes the trailing colon,
    # which the join below puts back
    subs = [re.sub(field_rx, rep, seg) if seg.endswith(":") else seg for seg in segs]

    return ":".join(subs)

def allowed(query, wildcards=False, fuzzy=False):
    """
    Check a query string for Lucene features that have not been enabled.

    :param query: the query string to check
    :param wildcards: if False, a query containing an unescaped ``?`` or ``*`` is rejected
    :param fuzzy: if False, a query containing an unescaped ``~`` (fuzzy or
        proximity search) is rejected
    :return: True if the query passes the enabled checks, False otherwise
    """
    # NOTE(review): both patterns require at least two characters before the
    # operator (".+" followed by a non-backslash character), so a query such as
    # "a*" or "*" is not rejected - confirm whether that is intended.
    if not wildcards:
        wildcard_rx = r"(.+[^\\][\?\*]+.*)"
        if re.search(wildcard_rx, query):
            return False

    if not fuzzy:
        # this covers both fuzzy searching and proximity searching
        fuzzy_rx = r"(.+[^\\]~[0-9]{0,1}[\.]{0,1}[0-9]{0,1})"
        if re.search(fuzzy_rx, query):
            return False

    return True

def escape(query):
    """
    Escape forward slashes and non-field colons in a query string.

    Every ``/`` that is not already preceded by a backslash becomes ``\\/``,
    including one at the very start of the query.  Colons that do not follow a
    field name are then backslash-escaped (see ``colon_escaper`` below).

    :param query: the query string to escape
    :return: the escaped query string
    """

    def colon_escaper(q):
        # we need to escape every colon that is not after a keyword and is not already escaped.
        # colons after keywords are: the first one and every first one after AND or OR
        parts = q.split(":")
        # parts[0] precedes the first colon (the first keyword) and the final
        # part is followed by no colon at all, so neither is touched
        for i in range(1, len(parts) - 1):
            part = parts[i]
            if not part.endswith('\\') and ' AND ' not in part and ' OR ' not in part:
                # the trailing backslash escapes the colon that join() re-inserts
                parts[i] = part + "\\"
        return ":".join(parts)

    # A negative lookbehind finds each unescaped "/" without consuming the
    # character before it, so neighbouring slashes ("//") and a slash at the
    # start of the query are all handled in a single pass.
    query = re.sub(r"(?<!\\)/", lambda _match: "\\/", query)

    return colon_escaper(query)

111

112

# ~~->Swagger:Feature~~
# ~~->API:Documentation~~
# Swagger definitions for each searchable type, parsed at import time from the
# JSON files under <BASE_FILE_PATH>/api/current.
DISCOVERY_API_SWAG = {
    index_type: json.loads(util.load_file(os.path.join(app.config['BASE_FILE_PATH'], 'api', 'current', swag_file)))
    for index_type, swag_file in (
        ('application', 'discovery_api_application_swag.json'),
        ('journal', 'discovery_api_journal_swag.json'),
        ('article', 'discovery_api_article_swag.json'),
    )
}

# Kept as a string because it is concatenated into the swagger descriptions
max_page_size = str(app.config.get("DISCOVERY_MAX_PAGE_SIZE", 100))

121

122

class DiscoveryApi(Api):
    """
    Search and scroll operations over articles, journals and applications,
    plus access to the swagger documentation for those operations.
    """

    @staticmethod
    def _swag_with_page_limit(index_type, param_index):
        """
        Return a copy of the swagger definition for ``index_type`` with the page
        size limit appended to the description of the parameter at ``param_index``.

        The module-level definition is copied before it is amended, so repeated
        calls do not accumulate the page size text in the shared definition.
        """
        swag = deepcopy(DISCOVERY_API_SWAG[index_type])
        param = swag["parameters"][param_index]
        param["description"] = ''.join([param["description"], " The page size limit is ", max_page_size])
        return swag

    @staticmethod
    def get_application_swag():
        """Swagger definition for the application search, with the page size limit noted."""
        return DiscoveryApi._swag_with_page_limit('application', 3)

    @staticmethod
    def get_journal_swag():
        """Swagger definition for the journal search, with the page size limit noted."""
        return DiscoveryApi._swag_with_page_limit('journal', 2)

    @staticmethod
    def get_article_swag():
        """Swagger definition for the article search, with the page size limit noted."""
        return DiscoveryApi._swag_with_page_limit('article', 2)

    @classmethod
    def _sanitise(cls, q, page, page_size, sort, search_subs, sort_subs, bulk):
        """
        Validate and normalise the raw search parameters.

        :param q: the query string, or None for no query
        :param page: requested page number; values below 1 become 1
        :param page_size: requested page size; capped at the configured maximum,
            values below 1 become 10
        :param sort: sort specification, either "field" or "field:asc|desc", or None
        :param search_subs: field name substitutions to apply to the query
        :param sort_subs: field name substitutions to apply to the sort field
        :param bulk: if True the bulk page size limit applies instead of the normal one
        :return: tuple of (q, page, fro, page_size, sortby, sortdir)
        :raises DiscoveryException: if the query uses disallowed Lucene features,
            the requested offset is at or beyond the maximum number of records,
            or the sort parameter is malformed
        """
        if q is not None:
            if not allowed(q):
                raise DiscoveryException("Query contains disallowed Lucene features")

            q = query_substitute(q, search_subs)
            q = escape(q)

        # sanitise the page size information
        if page < 1:
            page = 1

        if bulk:
            page_size_limit = app.config.get("DISCOVERY_BULK_PAGE_SIZE", 1000)
        else:
            page_size_limit = app.config.get("DISCOVERY_MAX_PAGE_SIZE", 100)
        if page_size > page_size_limit:
            page_size = page_size_limit
        elif page_size < 1:
            page_size = 10

        # calculate the position of the from cursor in the document set
        fro = (page - 1) * page_size

        # If fro has reached the max allowed, throw an error to prevent deep paging.
        # (bulk only changes the page size limit above; it is not consulted here)
        max_records = app.config.get("DISCOVERY_MAX_RECORDS_SIZE", 1000)
        if fro >= max_records:
            message = Messages.PREVENT_DEEP_PAGING_IN_API.format(
                max_records=max_records,
                data_dump_url=app.config.get("BASE_URL") + url_for("doaj.public_data_dump"),
                oai_journal_url=app.config.get("BASE_URL") + url_for("oaipmh.oaipmh"),
                oai_article_url=app.config.get("BASE_URL") + url_for("oaipmh.oaipmh", specified="article")
            )
            raise DiscoveryException(message)

        # interpret the sort field into the form required by the query
        sortby = None
        sortdir = None
        if sort is not None:
            if ":" in sort:
                bits = sort.split(":")
                if len(bits) != 2:
                    raise DiscoveryException("Malformed sort parameter")

                sortby, direction = bits
                if direction not in ["asc", "desc"]:
                    raise DiscoveryException("Sort direction must be 'asc' or 'desc'")
                sortdir = direction
            else:
                sortby = sort

            # map the public sort field name onto the indexed field, if there is a mapping
            if sortby in sort_subs:
                sortby = sort_subs[sortby]

        return q, page, fro, page_size, sortby, sortdir

    @classmethod
    def _make_query(cls, q, page, page_size, sort, index_type, bulk):
        """
        Build the raw query body for the given index type.

        Any ``index_type`` other than 'article' or 'journal' uses the
        application substitutions.

        :return: tuple of (raw_query, page, page_size), where page and page_size
            are the sanitised values
        :raises DiscoveryException: if the parameters fail sanitisation
        """
        if index_type == 'article':
            search_subs = app.config.get("DISCOVERY_ARTICLE_SEARCH_SUBS", {})
            sort_subs = app.config.get("DISCOVERY_ARTICLE_SORT_SUBS", {})
        elif index_type == 'journal':
            search_subs = app.config.get("DISCOVERY_JOURNAL_SEARCH_SUBS", {})
            sort_subs = app.config.get("DISCOVERY_JOURNAL_SORT_SUBS", {})
        else:
            search_subs = app.config.get("DISCOVERY_APPLICATION_SEARCH_SUBS", {})
            sort_subs = app.config.get("DISCOVERY_APPLICATION_SORT_SUBS", {})

        # sanitise and prep the inputs
        q, page, fro, page_size, sortby, sortdir = cls._sanitise(q, page, page_size, sort, search_subs, sort_subs, bulk)

        search_query = SearchQuery(q, fro, page_size, sortby, sortdir)
        raw_query = search_query.query()
        return raw_query, page, page_size

    @staticmethod
    def _calc_pagination(total, page_size, requested_page):
        """
        Calculate pagination for API results like # of pages and the last page.

        Modified from https://github.com/Pylons/paginate/blob/master/paginate/__init__.py#L260 ,
        a pagination library. (__init__.py, Page.__init__)

        :param total: total number of hits
        :param page_size: number of results per page
        :param requested_page: the page being returned
        :return: tuple of (page_count, previous_page, next_page, last_page);
            previous_page / next_page are None when there is no such page
        """
        FIRST_PAGE = 1

        if total == 0:
            return 1, None, None, 1

        page_count = ((total - 1) // page_size) + 1
        last_page = FIRST_PAGE + page_count - 1

        # Links to previous and next page
        previous_page = requested_page - 1 if requested_page > FIRST_PAGE else None
        next_page = requested_page + 1 if requested_page < last_page else None

        return page_count, previous_page, next_page, last_page

    @classmethod
    def _make_response(cls, endpoint, res, q, page, page_size, sort, obs):
        """
        Assemble the SearchResult for a completed search.

        :param endpoint: name of the API endpoint, used to build the paging links
        :param res: the raw search response; the total is read from hits.total.value
        :param q: the query string as supplied by the caller
        :param page: the page returned
        :param page_size: the page size used
        :param sort: the sort specification as supplied by the caller, or None
        :param obs: the result objects for this page
        :return: SearchResult wrapping the response dictionary
        """
        total = res.get("hits", {}).get("total", {}).get('value', 0)

        page_count, previous_page, next_page, last_page = cls._calc_pagination(total, page_size, page)

        def page_url(page_number):
            # absolute URL of the same search at a different page
            return app.config['BASE_URL'] + url_for(app.config['API_CURRENT_BLUEPRINT_NAME'] + '.' + endpoint, search_query=q, page=page_number, pageSize=page_size, sort=sort)

        # build the response object
        result = {
            "total": total,
            "page": page,
            "pageSize": page_size,
            "timestamp": dates.now_with_microseconds(),
            "query": q,
            "results": obs
        }

        if previous_page is not None:
            result["prev"] = page_url(previous_page)

        if next_page is not None:
            result["next"] = page_url(next_page)

        if last_page is not None:
            result["last"] = page_url(last_page)

        if sort is not None:
            result["sort"] = sort

        return SearchResult(result)

    @classmethod
    def search(cls, index_type, account, q, page, page_size, sort=None):
        """
        Run a paged search against one of the supported index types.

        :param index_type: one of 'article', 'journal' or 'application'
        :param account: the account on whose behalf the query is run; passed to the query service
        :param q: the query string, or None
        :param page: requested page number
        :param page_size: requested page size
        :param sort: optional sort specification
        :return: SearchResult for the requested page
        :raises DiscoveryException: if the index type is unknown, the parameters
            are invalid, or the query service fails
        """
        if index_type not in ['article', 'journal', 'application']:
            raise DiscoveryException("There was an error executing your query for {0}. Unknown type.)".format(index_type))

        if index_type == 'article':
            endpoint = 'search_articles'
            klass = models.Article  # ~~->Article:Model~~
        elif index_type == 'journal':
            endpoint = 'search_journals'
            klass = models.Journal  # ~~->Journal:Model~~
        else:
            endpoint = 'search_applications'
            klass = models.Suggestion  # ~~->Application:Model~~

        raw_query, page, page_size = cls._make_query(q, page, page_size, sort, index_type, False)

        # execute the query against the selected index type
        # ~~->Query:Service~~
        query_service = DOAJ.queryService()
        try:
            res = query_service.search('api_query', index_type, raw_query, account, None)
        except Exception as e:
            # log the detail against a reference, and only give the caller the reference
            magic = uuid.uuid1()
            msg = e.error if hasattr(e, "error") else e.message if hasattr(e, "message") else str(e)
            app.logger.error(u"Error executing discovery query search for {i}: {x} (ref: {y})".format(i=index_type, x=msg, y=magic))
            raise DiscoveryException("There was an error executing your query (ref: {y})".format(y=magic))

        obs = [klass(**raw) for raw in esprit.raw.unpack_json_result(res)]
        return cls._make_response(endpoint, res, q, page, page_size, sort, obs)

    @classmethod
    def scroll(cls, index_type, account, q, page_size, sort=None, scan=False):
        """
        Iterate over every result for a query using the query service's scroll.

        This is a generator, so validation errors are raised when iteration
        starts rather than when the method is called.

        :param index_type: one of 'article', 'journal' or 'application'
        :param account: the account on whose behalf the query is run; passed to the query service
        :param q: the query string, or None
        :param page_size: requested page size, subject to the bulk page size limit
        :param sort: optional sort specification
        :param scan: passed through to the query service's scroll
        :raises DiscoveryException: if the index type is unknown or the parameters are invalid
        """
        if index_type not in ['article', 'journal', 'application']:
            raise DiscoveryException("There was an error executing your query for {0}. Unknown type.)".format(index_type))

        page = 1  # Not used in scroll
        raw_query, page, page_size = cls._make_query(q, page, page_size, sort, index_type, True)

        # execute the query against the selected index type
        query_service = DOAJ.queryService()
        for result in query_service.scroll('api_query', index_type, raw_query, account, page_size, scan=scan):
            yield result

330

331

class SearchQuery(object):
    """
    ~~->Search:Query~~
    ~~Search:Query->Elasticsearch:Technology~~

    Builds the search request body from a query string, paging offsets and an
    optional sort.
    """
    def __init__(self, qs, fro, psize, sortby=None, sortdir=None):
        self.qs = qs
        self.fro = fro
        self.psize = psize
        self.sortby = sortby
        # the sort direction defaults to ascending when none is given
        self.sortdir = "asc" if sortdir is None else sortdir

    def query(self):
        """Return the request body as a dictionary."""
        if self.qs is None:
            # no query string means match everything
            matcher = {"match_all": {}}
        else:
            matcher = {
                "query_string": {
                    "query": self.qs,
                    "default_operator": "AND"
                }
            }

        body = {
            "track_total_hits": True,
            "from": self.fro,
            "size": self.psize,
            "query": matcher
        }

        if self.sortby is not None:
            body["sort"] = [{self.sortby: {"order": self.sortdir, "mode": "min"}}]

        return body