Coverage for portality / bll / services / journal.py: 87%
207 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-05 00:09 +0100
1import csv
2import logging
3import random
4import re
5import string
6from datetime import datetime
7from datetime import timedelta
8import os
9import shutil
10from collections import defaultdict
12from portality import lock
13from portality import models, constants
14from portality.bll import exceptions
15from portality.bll.doaj import DOAJ
16from portality.core import app
17from portality.crosswalks.journal_questions import Journal2QuestionXwalk
18from portality.lib import dates
19from portality.lib.argvalidate import argvalidate
20from portality.lib.dates import FMT_DATETIME_SHORT
21from portality.store import StoreException
22from portality.store import StoreFactory, prune_container
23from portality.ui.messages import Messages
24from portality.util import no_op
class JournalService(object):
    """
    Business-logic service for journal operations.
    ~~Journal:Service~~
    """
    def journal_2_application(self, journal, account=None, keep_editors=False):
        """
        Function to convert a given journal into an application object.

        Provide the journal, and it will be converted
        in-memory to the application object (currently a Suggestion). The new application
        WILL NOT be saved by this method.

        If an account is provided, this will validate that the account holder is
        allowed to make this conversion.

        :param journal: a journal to convert
        :param account: an account doing the action - optional, if specified the application will only be created if the account is allowed to
        :param keep_editors: if True, carry the journal's editor and editor group over to the application
        :return: Suggestion object
        """
        # first validate the incoming arguments to ensure that we've got the right thing
        argvalidate("journal_2_application", [
            {"arg": journal, "instance" : models.Journal, "allow_none" : False, "arg_name" : "journal"},
            {"arg" : account, "instance" : models.Account, "arg_name" : "account"}
        ], exceptions.ArgumentException)

        if app.logger.isEnabledFor(logging.DEBUG): app.logger.debug("Entering journal_2_application")

        # ~~-> AuthNZ:Service~~
        authService = DOAJ.authorisationService()

        # if an account is specified, check that it is allowed to perform this action
        if account is not None:
            try:
                authService.can_create_update_request(account, journal)  # throws exception if not allowed
            except exceptions.AuthoriseException as e:
                msg = "Account {x} is not permitted to create an update request on journal {y}".format(x=account.id, y=journal.id)
                app.logger.info(msg)
                e.args += (msg,)
                raise

        # copy all the relevant information from the journal to the application
        bj = journal.bibjson()
        notes = journal.notes

        application = models.Suggestion()  # ~~-> Application:Model~~
        application.set_application_status(constants.APPLICATION_STATUS_UPDATE_REQUEST)
        application.set_current_journal(journal.id)
        # idiomatic truthiness test (was `keep_editors is True`) - bool callers behave identically
        if keep_editors:
            if journal.editor is not None:
                application.set_editor(journal.editor)
            if journal.editor_group is not None:
                application.set_editor_group(journal.editor_group)
        for n in notes:
            # NOTE: we keep the same id for notes between journal and application, since ids only matter within
            # the scope of a record there are no id clashes, and at the same time it may be useful in future to
            # check the origin of some journal notes by comparing ids to application notes.
            application.add_note_by_dict(n)
        application.set_owner(journal.owner)
        application.set_bibjson(bj)
        application.date_applied = dates.now_str()

        if app.logger.isEnabledFor(logging.DEBUG): app.logger.debug("Completed journal_2_application; return application object")
        return application
92 def journal(self, journal_id, lock_journal=False, lock_account=None, lock_timeout=None):
93 """
94 Function to retrieve a journal by its id, and to optionally lock the resource
96 May raise a Locked exception, if a lock is requested but can't be obtained.
98 :param journal_id: the id of the journal
99 :param: lock_journal: should we lock the resource on retrieval
100 :param: lock_account: which account is doing the locking? Must be present if lock_journal=True
101 :param: lock_timeout: how long to lock the resource for. May be none, in which case it will default
102 :return: Tuple of (Journal Object, Lock Object)
103 """
104 # first validate the incoming arguments to ensure that we've got the right thing
105 argvalidate("journal", [
106 {"arg": journal_id, "allow_none" : False, "arg_name" : "journal_id"},
107 {"arg": lock_journal, "instance" : bool, "allow_none" : False, "arg_name" : "lock_journal"},
108 {"arg": lock_account, "instance" : models.Account, "allow_none" : True, "arg_name" : "lock_account"},
109 {"arg": lock_timeout, "instance" : int, "allow_none" : True, "arg_name" : "lock_timeout"}
110 ], exceptions.ArgumentException)
112 # retrieve the journal
113 journal = models.Journal.pull(journal_id)
115 # if we've retrieved the journal, and a lock is requested, request it
116 the_lock = None
117 if journal is not None and lock_journal:
118 if lock_account is not None:
119 # ~~->Lock:Feature~~
120 the_lock = lock.lock(constants.LOCK_JOURNAL, journal_id, lock_account.id, lock_timeout)
121 else:
122 raise exceptions.ArgumentException("If you specify lock_journal on journal retrieval, you must also provide lock_account")
124 return journal, the_lock
126 def find_best(self, identifier):
127 if len(identifier) == 9:
128 # search both in doaj and withdrawn to know whether to return 404 (not found) or 410 (gone)
129 js = models.Journal.find_by_issn(identifier)
130 if len(js) == 0:
131 return None
133 # if there is one or more, try to get the active one
134 active_journals = [j for j in js if j.is_in_doaj()]
135 if len(active_journals) > 1:
136 raise exceptions.TooManyJournals(Messages.TOO_MANY_JOURNALS.format(identifier=identifier))
138 if len(active_journals) == 0:
139 js.sort(key=lambda x: x.created_date, reverse=True)
140 return js[0] # return the most recently created withdrawn journal
142 return active_journals[0]
144 elif len(identifier) == 32:
145 # Pull by ES identifier
146 j = models.Journal.pull(identifier) # Returns None on fail
147 if j is None:
148 return None
149 return j
151 raise exceptions.ArgumentException("Identifier must be either an ISSN (9 chars) or an internal ID (32 chars)")
153 def csv(self, prune=True, logger=None, store=None):
154 """
155 Generate the Journal CSV
157 ~~-> JournalCSV:Feature~~
159 :param set_cache: whether to update the cache
160 :param out_dir: the directory to output the file to. If set_cache is True, this argument will be overridden by the cache container
161 :return: Tuple of (attachment_name, URL)
162 """
163 # first validate the incoming arguments to ensure that we've got the right thing
164 argvalidate("csv", [
165 {"arg": prune, "allow_none": False, "arg_name": "prune"},
166 {"arg": logger, "allow_none": True, "arg_name": "logger"}
167 ], exceptions.ArgumentException)
169 # None isn't executable, so convert logger to NO-OP
170 if logger is None:
171 logger = no_op
173 export_start_time = dates.now()
175 query = models.JournalQuery().all_in_doaj()
177 export_svc = DOAJ.exportService()
178 tmp_filepath, tmp_filename = export_svc.csv(models.Journal, query, logger=logger, admin_fieldset=False)
180 jc = models.JournalCSV()
181 jc.export_date = export_start_time
183 if store is None:
184 store = StoreFactory.get(constants.STORE__SCOPE__JOURNAL_CSV)
186 container = app.config.get("STORE_JOURNAL_CSV_CONTAINER")
187 filename = 'doaj_journalcsv_' + dates.format(export_start_time, FMT_DATETIME_SHORT) + '_utf8.csv'
188 try:
189 store.store(container, filename, source_path=tmp_filepath)
190 url = store.url(container, filename)
191 logger("Stored CSV in main cache store at {x}".format(x=url))
192 jc.set_csv(container, filename, os.path.getsize(tmp_filepath), url)
193 except:
194 logger("Could not store CSV in main cache store: {x}".format(x=tmp_filename))
195 raise StoreException("Could not store CSV in main cache store: {x}".format(x=tmp_filename))
197 export_svc.delete_tmp_csv(tmp_filename)
198 logger("Deleted file from tmp store")
200 jc.save()
202 if prune:
203 logger("Pruning old CSVs from store")
204 self.prune_csvs(store=store, logger=logger, ignore=[filename])
205 logger("Pruned old CSVs from store")
207 # update the ES record to point to the new file
208 return jc
210 def admin_csv(self, file_path, obscure_accounts=True, add_sensitive_account_info=False):
211 """
212 ~~AdminJournalCSV:Feature->JournalCSV:Feature~~
214 :param file_path: where to put the CSV
215 :param obscure_accounts: anonymise the account data with consistent random strings
216 :param add_sensitive_account_info: augment the CSV with account information - account ID, account name, account email addr
217 """
218 query = models.JournalQuery().all_in_doaj()
220 export_svc = DOAJ.exportService()
221 export_svc.csv(models.Journal, query, out_file=file_path,
222 admin_fieldset=True,
223 obscure_accounts=obscure_accounts,
224 add_sensitive_account_info=add_sensitive_account_info
225 )
227 def prune_csvs(self, store=None, logger=None, ignore=None):
228 if store is None:
229 store = StoreFactory.get(constants.STORE__SCOPE__JOURNAL_CSV)
231 if ignore is None:
232 ignore = []
234 # None isn't executable, so convert logger to NO-OP
235 if logger is None:
236 logger = no_op
238 # First we're going to remove all the files for csv records which are too old to keep
239 total = models.JournalCSV.count()
240 old_csvs = models.JournalCSV.all_csvs_before(dates.before_now(app.config.get("NON_PREMIUM_DELAY_SECONDS") + 86400))
242 # if removing the old_dds would leave us without any data dump records, then don't do anything
243 if total <= len(old_csvs):
244 logger("Not removing any old journal csv records, as this would leave us with none")
245 else:
246 for jc in old_csvs:
247 ac = jc.container
248 af = jc.filename
249 store.delete_file(ac, af)
250 jc.delete()
252 # Second, we're going to look at all records, and keep only the most recent one from each day
253 thin = models.JournalCSV.all_csvs_before(dates.before_now(86400))
255 def separate_by_newest_per_day(jcs):
256 # Group objects by their day
257 grouped_by_day = defaultdict(list)
258 for jc in jcs:
259 day = dates.parse(jc.export_day) # Extract the day (ignoring time)
260 grouped_by_day[day].append(jc)
262 newest_per_day = []
263 everything_else = []
265 # Find the newest object for each day
266 for day, items in grouped_by_day.items():
267 items.sort(key=lambda x: x.export_date, reverse=True) # Sort by date descending
268 newest_per_day.append(items[0]) # Add the newest object
269 everything_else.extend(items[1:]) # Add the rest to "everything else"
271 return newest_per_day, everything_else
273 # Separate the objects into newest_per_day and everything_else
274 newest_per_day, everything_else = separate_by_newest_per_day(thin)
275 for jc in everything_else:
276 ac = jc.container
277 af = jc.filename
278 try:
279 store.delete_file(ac, af)
280 except:
281 pass
282 jc.delete()
284 # Third we're going to check the container for files which don't have index records, and
285 # clean them up
287 # get the files in storage
288 container = app.config.get("STORE_JOURNAL_CSV_CONTAINER")
289 container_files = store.list(container)
291 # if the filename doesn't match anything, remove the file
292 for cf in container_files:
293 if cf in ignore:
294 continue
295 jc = models.JournalCSV.find_by_filename(cf)
296 if jc is None or len(jc) == 0:
297 logger("No related index record; Deleting file {x} from storage container {y}".format(x=cf, y=container))
298 store.delete_file(container, cf)
300 # Finally, we check all the records in the index and confirm their files exist, and if not
301 # remove the record
302 for jc in models.JournalCSV.iterate_unstable():
303 missing = False
304 if jc.container is not None and jc.filename is not None:
305 if jc.filename not in store.list(jc.container):
306 logger("File {x} in container {y} does not exist".format(x=jc.filename, y=jc.container))
307 missing = True
309 if missing:
310 logger("File missing for {x}".format(x=jc.id))
311 jc.delete()
313 def get_premium_csv(self):
314 # Get the latest data dump
315 return models.JournalCSV.find_latest()
317 def get_free_csv(self, cutoff=None):
318 if cutoff is None:
319 cutoff_seconds = app.config.get("NON_PREMIUM_DELAY_SECONDS", 2592000) + 86400
321 # if we are in the phase-in period, cap the delay to the phase in date
322 if app.config.get("PREMIUM_PHASE_IN", False):
323 phase_in_start = app.config.get("PREMIUM_PHASE_IN_START")
324 if phase_in_start is not None:
325 max_delay = dates.now() - phase_in_start
326 if max_delay.total_seconds() < cutoff_seconds:
327 cutoff_seconds = max_delay.total_seconds()
329 cutoff = dates.before_now(cutoff_seconds)
331 # get the first dump after the cutoff
332 option = models.JournalCSV.first_csv_after(cutoff=cutoff)
333 if option is not None:
334 return option
336 # if there was no such dump, just return the latest
337 return models.JournalCSV.find_latest()
339 def get_temporary_url(self, jc: models.JournalCSV):
340 container = jc.container
341 filename = jc.filename
343 if container is None or filename is None:
344 raise exceptions.NoSuchPropertyException("Cannot find container and filename for journal csv")
346 main_store = StoreFactory.get(constants.STORE__SCOPE__JOURNAL_CSV)
347 store_url = main_store.temporary_url(container, filename,
348 timeout=app.config.get("JOURNAL_CSV_URL_TIMEOUT", 3600))
349 return store_url
351 def delete_csv(self, id:str):
352 """
353 Delete a journal csv by id.
355 :param id: the id of the journal csv to delete
356 :return: True if deleted, False if not found
357 """
358 jc = models.JournalCSV.pull(id)
359 if jc is None:
360 return False
362 container = jc.container
363 filename = jc.filename
365 try:
366 store = StoreFactory.get(constants.STORE__SCOPE__JOURNAL_CSV)
367 store.delete_file(container, filename)
368 except:
369 pass
371 jc.delete()
372 return True