Coverage for portality / bll / services / journal.py: 87%

207 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-05 00:09 +0100

1import csv 

2import logging 

3import random 

4import re 

5import string 

6from datetime import datetime 

7from datetime import timedelta 

8import os 

9import shutil 

10from collections import defaultdict 

11 

12from portality import lock 

13from portality import models, constants 

14from portality.bll import exceptions 

15from portality.bll.doaj import DOAJ 

16from portality.core import app 

17from portality.crosswalks.journal_questions import Journal2QuestionXwalk 

18from portality.lib import dates 

19from portality.lib.argvalidate import argvalidate 

20from portality.lib.dates import FMT_DATETIME_SHORT 

21from portality.store import StoreException 

22from portality.store import StoreFactory, prune_container 

23from portality.ui.messages import Messages 

24from portality.util import no_op 

25 

26 

class JournalService(object):
    """
    ~~Journal:Service~~
    """
    def journal_2_application(self, journal, account=None, keep_editors=False):
        """
        Convert the supplied journal into an application (Suggestion) object.

        The conversion happens entirely in memory; the resulting application
        WILL NOT be saved by this method.

        If an account is provided, we first verify that the account holder is
        permitted to create an update request for this journal.

        :param journal: the journal to convert
        :param account: optional account performing the action; if supplied, the
            conversion only proceeds when the account is authorised
        :param keep_editors: if True, carry the journal's editor and editor group
            over to the application
        :return: Suggestion object
        """
        # validate the incoming arguments to ensure that we've got the right thing
        argvalidate("journal_2_application", [
            {"arg": journal, "instance" : models.Journal, "allow_none" : False, "arg_name" : "journal"},
            {"arg" : account, "instance" : models.Account, "arg_name" : "account"}
        ], exceptions.ArgumentException)

        if app.logger.isEnabledFor(logging.DEBUG):
            app.logger.debug("Entering journal_2_application")

        # ~~-> AuthNZ:Service~~
        authService = DOAJ.authorisationService()

        # when an account is supplied, confirm it may perform this action
        if account is not None:
            try:
                authService.can_create_update_request(account, journal)  # throws exception if not allowed
            except exceptions.AuthoriseException as e:
                msg = "Account {x} is not permitted to create an update request on journal {y}".format(x=account.id, y=journal.id)
                app.logger.info(msg)
                e.args += (msg,)
                raise

        # copy all the relevant information from the journal to the application
        journal_bibjson = journal.bibjson()
        journal_notes = journal.notes

        application = models.Suggestion()  # ~~-> Application:Model~~
        application.set_application_status(constants.APPLICATION_STATUS_UPDATE_REQUEST)
        application.set_current_journal(journal.id)

        if keep_editors is True:
            if journal.editor is not None:
                application.set_editor(journal.editor)
            if journal.editor_group is not None:
                application.set_editor_group(journal.editor_group)

        for note in journal_notes:
            # NOTE: notes keep the same id between journal and application; ids only
            # matter within the scope of a single record so there are no clashes, and
            # the shared ids make it possible to trace a note back to its origin.
            application.add_note_by_dict(note)

        application.set_owner(journal.owner)
        application.set_bibjson(journal_bibjson)
        application.date_applied = dates.now_str()

        if app.logger.isEnabledFor(logging.DEBUG):
            app.logger.debug("Completed journal_2_application; return application object")
        return application

92 def journal(self, journal_id, lock_journal=False, lock_account=None, lock_timeout=None): 

93 """ 

94 Function to retrieve a journal by its id, and to optionally lock the resource 

95 

96 May raise a Locked exception, if a lock is requested but can't be obtained. 

97 

98 :param journal_id: the id of the journal 

99 :param: lock_journal: should we lock the resource on retrieval 

100 :param: lock_account: which account is doing the locking? Must be present if lock_journal=True 

101 :param: lock_timeout: how long to lock the resource for. May be none, in which case it will default 

102 :return: Tuple of (Journal Object, Lock Object) 

103 """ 

104 # first validate the incoming arguments to ensure that we've got the right thing 

105 argvalidate("journal", [ 

106 {"arg": journal_id, "allow_none" : False, "arg_name" : "journal_id"}, 

107 {"arg": lock_journal, "instance" : bool, "allow_none" : False, "arg_name" : "lock_journal"}, 

108 {"arg": lock_account, "instance" : models.Account, "allow_none" : True, "arg_name" : "lock_account"}, 

109 {"arg": lock_timeout, "instance" : int, "allow_none" : True, "arg_name" : "lock_timeout"} 

110 ], exceptions.ArgumentException) 

111 

112 # retrieve the journal 

113 journal = models.Journal.pull(journal_id) 

114 

115 # if we've retrieved the journal, and a lock is requested, request it 

116 the_lock = None 

117 if journal is not None and lock_journal: 

118 if lock_account is not None: 

119 # ~~->Lock:Feature~~ 

120 the_lock = lock.lock(constants.LOCK_JOURNAL, journal_id, lock_account.id, lock_timeout) 

121 else: 

122 raise exceptions.ArgumentException("If you specify lock_journal on journal retrieval, you must also provide lock_account") 

123 

124 return journal, the_lock 

125 

126 def find_best(self, identifier): 

127 if len(identifier) == 9: 

128 # search both in doaj and withdrawn to know whether to return 404 (not found) or 410 (gone) 

129 js = models.Journal.find_by_issn(identifier) 

130 if len(js) == 0: 

131 return None 

132 

133 # if there is one or more, try to get the active one 

134 active_journals = [j for j in js if j.is_in_doaj()] 

135 if len(active_journals) > 1: 

136 raise exceptions.TooManyJournals(Messages.TOO_MANY_JOURNALS.format(identifier=identifier)) 

137 

138 if len(active_journals) == 0: 

139 js.sort(key=lambda x: x.created_date, reverse=True) 

140 return js[0] # return the most recently created withdrawn journal 

141 

142 return active_journals[0] 

143 

144 elif len(identifier) == 32: 

145 # Pull by ES identifier 

146 j = models.Journal.pull(identifier) # Returns None on fail 

147 if j is None: 

148 return None 

149 return j 

150 

151 raise exceptions.ArgumentException("Identifier must be either an ISSN (9 chars) or an internal ID (32 chars)") 

152 

153 def csv(self, prune=True, logger=None, store=None): 

154 """ 

155 Generate the Journal CSV 

156 

157 ~~-> JournalCSV:Feature~~ 

158 

159 :param set_cache: whether to update the cache 

160 :param out_dir: the directory to output the file to. If set_cache is True, this argument will be overridden by the cache container 

161 :return: Tuple of (attachment_name, URL) 

162 """ 

163 # first validate the incoming arguments to ensure that we've got the right thing 

164 argvalidate("csv", [ 

165 {"arg": prune, "allow_none": False, "arg_name": "prune"}, 

166 {"arg": logger, "allow_none": True, "arg_name": "logger"} 

167 ], exceptions.ArgumentException) 

168 

169 # None isn't executable, so convert logger to NO-OP 

170 if logger is None: 

171 logger = no_op 

172 

173 export_start_time = dates.now() 

174 

175 query = models.JournalQuery().all_in_doaj() 

176 

177 export_svc = DOAJ.exportService() 

178 tmp_filepath, tmp_filename = export_svc.csv(models.Journal, query, logger=logger, admin_fieldset=False) 

179 

180 jc = models.JournalCSV() 

181 jc.export_date = export_start_time 

182 

183 if store is None: 

184 store = StoreFactory.get(constants.STORE__SCOPE__JOURNAL_CSV) 

185 

186 container = app.config.get("STORE_JOURNAL_CSV_CONTAINER") 

187 filename = 'doaj_journalcsv_' + dates.format(export_start_time, FMT_DATETIME_SHORT) + '_utf8.csv' 

188 try: 

189 store.store(container, filename, source_path=tmp_filepath) 

190 url = store.url(container, filename) 

191 logger("Stored CSV in main cache store at {x}".format(x=url)) 

192 jc.set_csv(container, filename, os.path.getsize(tmp_filepath), url) 

193 except: 

194 logger("Could not store CSV in main cache store: {x}".format(x=tmp_filename)) 

195 raise StoreException("Could not store CSV in main cache store: {x}".format(x=tmp_filename)) 

196 

197 export_svc.delete_tmp_csv(tmp_filename) 

198 logger("Deleted file from tmp store") 

199 

200 jc.save() 

201 

202 if prune: 

203 logger("Pruning old CSVs from store") 

204 self.prune_csvs(store=store, logger=logger, ignore=[filename]) 

205 logger("Pruned old CSVs from store") 

206 

207 # update the ES record to point to the new file 

208 return jc 

209 

210 def admin_csv(self, file_path, obscure_accounts=True, add_sensitive_account_info=False): 

211 """ 

212 ~~AdminJournalCSV:Feature->JournalCSV:Feature~~ 

213 

214 :param file_path: where to put the CSV 

215 :param obscure_accounts: anonymise the account data with consistent random strings 

216 :param add_sensitive_account_info: augment the CSV with account information - account ID, account name, account email addr 

217 """ 

218 query = models.JournalQuery().all_in_doaj() 

219 

220 export_svc = DOAJ.exportService() 

221 export_svc.csv(models.Journal, query, out_file=file_path, 

222 admin_fieldset=True, 

223 obscure_accounts=obscure_accounts, 

224 add_sensitive_account_info=add_sensitive_account_info 

225 ) 

226 

227 def prune_csvs(self, store=None, logger=None, ignore=None): 

228 if store is None: 

229 store = StoreFactory.get(constants.STORE__SCOPE__JOURNAL_CSV) 

230 

231 if ignore is None: 

232 ignore = [] 

233 

234 # None isn't executable, so convert logger to NO-OP 

235 if logger is None: 

236 logger = no_op 

237 

238 # First we're going to remove all the files for csv records which are too old to keep 

239 total = models.JournalCSV.count() 

240 old_csvs = models.JournalCSV.all_csvs_before(dates.before_now(app.config.get("NON_PREMIUM_DELAY_SECONDS") + 86400)) 

241 

242 # if removing the old_dds would leave us without any data dump records, then don't do anything 

243 if total <= len(old_csvs): 

244 logger("Not removing any old journal csv records, as this would leave us with none") 

245 else: 

246 for jc in old_csvs: 

247 ac = jc.container 

248 af = jc.filename 

249 store.delete_file(ac, af) 

250 jc.delete() 

251 

252 # Second, we're going to look at all records, and keep only the most recent one from each day 

253 thin = models.JournalCSV.all_csvs_before(dates.before_now(86400)) 

254 

255 def separate_by_newest_per_day(jcs): 

256 # Group objects by their day 

257 grouped_by_day = defaultdict(list) 

258 for jc in jcs: 

259 day = dates.parse(jc.export_day) # Extract the day (ignoring time) 

260 grouped_by_day[day].append(jc) 

261 

262 newest_per_day = [] 

263 everything_else = [] 

264 

265 # Find the newest object for each day 

266 for day, items in grouped_by_day.items(): 

267 items.sort(key=lambda x: x.export_date, reverse=True) # Sort by date descending 

268 newest_per_day.append(items[0]) # Add the newest object 

269 everything_else.extend(items[1:]) # Add the rest to "everything else" 

270 

271 return newest_per_day, everything_else 

272 

273 # Separate the objects into newest_per_day and everything_else 

274 newest_per_day, everything_else = separate_by_newest_per_day(thin) 

275 for jc in everything_else: 

276 ac = jc.container 

277 af = jc.filename 

278 try: 

279 store.delete_file(ac, af) 

280 except: 

281 pass 

282 jc.delete() 

283 

284 # Third we're going to check the container for files which don't have index records, and 

285 # clean them up 

286 

287 # get the files in storage 

288 container = app.config.get("STORE_JOURNAL_CSV_CONTAINER") 

289 container_files = store.list(container) 

290 

291 # if the filename doesn't match anything, remove the file 

292 for cf in container_files: 

293 if cf in ignore: 

294 continue 

295 jc = models.JournalCSV.find_by_filename(cf) 

296 if jc is None or len(jc) == 0: 

297 logger("No related index record; Deleting file {x} from storage container {y}".format(x=cf, y=container)) 

298 store.delete_file(container, cf) 

299 

300 # Finally, we check all the records in the index and confirm their files exist, and if not 

301 # remove the record 

302 for jc in models.JournalCSV.iterate_unstable(): 

303 missing = False 

304 if jc.container is not None and jc.filename is not None: 

305 if jc.filename not in store.list(jc.container): 

306 logger("File {x} in container {y} does not exist".format(x=jc.filename, y=jc.container)) 

307 missing = True 

308 

309 if missing: 

310 logger("File missing for {x}".format(x=jc.id)) 

311 jc.delete() 

312 

313 def get_premium_csv(self): 

314 # Get the latest data dump 

315 return models.JournalCSV.find_latest() 

316 

317 def get_free_csv(self, cutoff=None): 

318 if cutoff is None: 

319 cutoff_seconds = app.config.get("NON_PREMIUM_DELAY_SECONDS", 2592000) + 86400 

320 

321 # if we are in the phase-in period, cap the delay to the phase in date 

322 if app.config.get("PREMIUM_PHASE_IN", False): 

323 phase_in_start = app.config.get("PREMIUM_PHASE_IN_START") 

324 if phase_in_start is not None: 

325 max_delay = dates.now() - phase_in_start 

326 if max_delay.total_seconds() < cutoff_seconds: 

327 cutoff_seconds = max_delay.total_seconds() 

328 

329 cutoff = dates.before_now(cutoff_seconds) 

330 

331 # get the first dump after the cutoff 

332 option = models.JournalCSV.first_csv_after(cutoff=cutoff) 

333 if option is not None: 

334 return option 

335 

336 # if there was no such dump, just return the latest 

337 return models.JournalCSV.find_latest() 

338 

339 def get_temporary_url(self, jc: models.JournalCSV): 

340 container = jc.container 

341 filename = jc.filename 

342 

343 if container is None or filename is None: 

344 raise exceptions.NoSuchPropertyException("Cannot find container and filename for journal csv") 

345 

346 main_store = StoreFactory.get(constants.STORE__SCOPE__JOURNAL_CSV) 

347 store_url = main_store.temporary_url(container, filename, 

348 timeout=app.config.get("JOURNAL_CSV_URL_TIMEOUT", 3600)) 

349 return store_url 

350 

351 def delete_csv(self, id:str): 

352 """ 

353 Delete a journal csv by id. 

354 

355 :param id: the id of the journal csv to delete 

356 :return: True if deleted, False if not found 

357 """ 

358 jc = models.JournalCSV.pull(id) 

359 if jc is None: 

360 return False 

361 

362 container = jc.container 

363 filename = jc.filename 

364 

365 try: 

366 store = StoreFactory.get(constants.STORE__SCOPE__JOURNAL_CSV) 

367 store.delete_file(container, filename) 

368 except: 

369 pass 

370 

371 jc.delete() 

372 return True