Coverage for portality / bll / services / journal.py: 87%
207 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-05 00:09 +0100
1import csv
2import logging
3import random
4import re
5import string
6from datetime import datetime
7from datetime import timedelta
8import os
9import shutil
10from collections import defaultdict
12from portality import lock
13from portality import models, constants
14from portality.bll import exceptions
15from portality.bll.doaj import DOAJ
16from portality.core import app
17from portality.crosswalks.journal_questions import Journal2QuestionXwalk
18from portality.lib import dates
19from portality.lib.argvalidate import argvalidate
20from portality.lib.dates import FMT_DATETIME_SHORT
21from portality.store import StoreException
22from portality.store import StoreFactory, prune_container
23from portality.ui.messages import Messages
24from portality.util import no_op
class JournalService(object):
    """
    Business-logic service for journal operations.
    ~~Journal:Service~~
    """
    def journal_2_application(self, journal, account=None, keep_editors=False):
        """
        Function to convert a given journal into an application object.

        Provide the journal, and it will be converted
        in-memory to the application object (currently a Suggestion). The new application
        WILL NOT be saved by this method.

        If an account is provided, this will validate that the account holder is
        allowed to make this conversion.

        :param journal: a journal to convert
        :param account: an account doing the action - optional, if specified the application will only be created if the account is allowed to
        :param keep_editors: if True, carry the journal's editor and editor group over to the application
        :return: Suggestion object
        """
        # first validate the incoming arguments to ensure that we've got the right thing
        argvalidate("journal_2_application", [
            {"arg": journal, "instance" : models.Journal, "allow_none" : False, "arg_name" : "journal"},
            {"arg" : account, "instance" : models.Account, "arg_name" : "account"}
        ], exceptions.ArgumentException)

        if app.logger.isEnabledFor(logging.DEBUG): app.logger.debug("Entering journal_2_application")

        # ~~-> AuthNZ:Service~~
        authService = DOAJ.authorisationService()

        # if an account is specified, check that it is allowed to perform this action
        if account is not None:
            try:
                authService.can_create_update_request(account, journal)  # throws exception if not allowed
            except exceptions.AuthoriseException as e:
                msg = "Account {x} is not permitted to create an update request on journal {y}".format(x=account.id, y=journal.id)
                app.logger.info(msg)
                e.args += (msg,)
                raise

        # copy all the relevant information from the journal to the application
        bj = journal.bibjson()
        notes = journal.notes

        application = models.Suggestion()  # ~~-> Application:Model~~
        application.set_application_status(constants.APPLICATION_STATUS_UPDATE_REQUEST)
        application.set_current_journal(journal.id)
        # idiomatic truthiness test (was `keep_editors is True`) - bool callers behave identically
        if keep_editors:
            if journal.editor is not None:
                application.set_editor(journal.editor)
            if journal.editor_group is not None:
                application.set_editor_group(journal.editor_group)
        for n in notes:
            # NOTE: we keep the same id for notes between journal and application, since ids only matter within
            # the scope of a record there are no id clashes, and at the same time it may be useful in future to
            # check the origin of some journal notes by comparing ids to application notes.
            application.add_note_by_dict(n)
        application.set_owner(journal.owner)
        application.set_bibjson(bj)
        application.date_applied = dates.now_str()

        if app.logger.isEnabledFor(logging.DEBUG): app.logger.debug("Completed journal_2_application; return application object")
        return application
92 def journal(self, journal_id, lock_journal=False, lock_account=None, lock_timeout=None):
93 """
94 Function to retrieve a journal by its id, and to optionally lock the resource
96 May raise a Locked exception, if a lock is requested but can't be obtained.
98 :param journal_id: the id of the journal
99 :param: lock_journal: should we lock the resource on retrieval
100 :param: lock_account: which account is doing the locking? Must be present if lock_journal=True
101 :param: lock_timeout: how long to lock the resource for. May be none, in which case it will default
102 :return: Tuple of (Journal Object, Lock Object)
103 """
104 # first validate the incoming arguments to ensure that we've got the right thing
105 argvalidate("journal", [
106 {"arg": journal_id, "allow_none" : False, "arg_name" : "journal_id"},
107 {"arg": lock_journal, "instance" : bool, "allow_none" : False, "arg_name" : "lock_journal"},
108 {"arg": lock_account, "instance" : models.Account, "allow_none" : True, "arg_name" : "lock_account"},
109 {"arg": lock_timeout, "instance" : int, "allow_none" : True, "arg_name" : "lock_timeout"}
110 ], exceptions.ArgumentException)
112 # retrieve the journal
113 journal = models.Journal.pull(journal_id)
115 # if we've retrieved the journal, and a lock is requested, request it
116 the_lock = None
117 if journal is not None and lock_journal:
118 if lock_account is not None:
119 # ~~->Lock:Feature~~
120 the_lock = lock.lock(constants.LOCK_JOURNAL, journal_id, lock_account.id, lock_timeout)
121 else:
122 raise exceptions.ArgumentException("If you specify lock_journal on journal retrieval, you must also provide lock_account")
124 return journal, the_lock
126 def find_best(self, identifier):
127 if len(identifier) == 9:
128 # search both in doaj and withdrawn to know whether to return 404 (not found) or 410 (gone)
129 js = models.Journal.find_by_issn(identifier)
130 if len(js) == 0:
131 return None
133 # if there is one or more, try to get the active one
134 active_journals = [j for j in js if j.is_in_doaj()]
135 if len(active_journals) > 1:
136 raise exceptions.TooManyJournals(Messages.TOO_MANY_JOURNALS.format(identifier=identifier))
138 if len(active_journals) == 0:
139 js.sort(key=lambda x: x.created_date, reverse=True)
140 return js[0] # return the most recently created withdrawn journal
142 return active_journals[0]
144 elif len(identifier) == 32:
145 # Pull by ES identifier
146 j = models.Journal.pull(identifier) # Returns None on fail
147 if j is None:
148 return None
149 return j
151 raise exceptions.ArgumentException("Identifier must be either an ISSN (9 chars) or an internal ID (32 chars)")
153 def csv(self, prune=True, logger=None, store=None):
154 """
155 Generate the Journal CSV
157 ~~-> JournalCSV:Feature~~
159 :param set_cache: whether to update the cache
160 :param out_dir: the directory to output the file to. If set_cache is True, this argument will be overridden by the cache container
161 :return: Tuple of (attachment_name, URL)
162 """
163 # first validate the incoming arguments to ensure that we've got the right thing
164 argvalidate("csv", [
165 {"arg": prune, "allow_none": False, "arg_name": "prune"},
166 {"arg": logger, "allow_none": True, "arg_name": "logger"}
167 ], exceptions.ArgumentException)
169 # None isn't executable, so convert logger to NO-OP
170 if logger is None:
171 logger = no_op
173 export_start_time = dates.now()
175 query = models.JournalQuery().all_in_doaj()
177 export_svc = DOAJ.exportService()
178 tmp_filepath, tmp_filename = export_svc.csv(models.Journal, query, logger=logger, admin_fieldset=False)
180 jc = models.JournalCSV()
181 jc.export_date = export_start_time
183 if store is None:
184 store = StoreFactory.get(constants.STORE__SCOPE__JOURNAL_CSV)
186 container = app.config.get("STORE_JOURNAL_CSV_CONTAINER")
187 filename = 'doaj_journalcsv_' + dates.format(export_start_time, FMT_DATETIME_SHORT) + '_utf8.csv'
188 try:
189 store.store(container, filename, source_path=tmp_filepath)
190 url = store.url(container, filename)
191 logger("Stored CSV in main cache store at {x}".format(x=url))
192 jc.set_csv(container, filename, os.path.getsize(tmp_filepath), url)
193 except:
194 logger("Could not store CSV in main cache store: {x}".format(x=tmp_filename))
195 raise StoreException("Could not store CSV in main cache store: {x}".format(x=tmp_filename))
197 export_svc.delete_tmp_csv(tmp_filename)
198 logger("Deleted file from tmp store")
200 jc.save()
202 if prune:
203 logger("Pruning old CSVs from store")
204 self.prune_csvs(store=store, logger=logger, ignore=[filename])
205 logger("Pruned old CSVs from store")
207 # update the ES record to point to the new file
208 return jc
210 def admin_csv(self, file_path, obscure_accounts=True, add_sensitive_account_info=False):
211 """
212 ~~AdminJournalCSV:Feature->JournalCSV:Feature~~
214 :param file_path: where to put the CSV
215 :param obscure_accounts: anonymise the account data with consistent random strings
216 :param add_sensitive_account_info: augment the CSV with account information - account ID, account name, account email addr
217 """
218 query = models.JournalQuery().all_in_doaj()
220 export_svc = DOAJ.exportService()
221 export_svc.csv(models.Journal, query, out_file=file_path,
222 admin_fieldset=True,
223 obscure_accounts=obscure_accounts,
224 add_sensitive_account_info=add_sensitive_account_info
225 )
227 def prune_csvs(self, store=None, logger=None, ignore=None):
228 if store is None:
229 store = StoreFactory.get(constants.STORE__SCOPE__JOURNAL_CSV)
231 if ignore is None:
232 ignore = []
234 # None isn't executable, so convert logger to NO-OP
235 if logger is None:
236 logger = no_op
238 # First we're going to remove all the files for csv records which are too old to keep
239 total = models.JournalCSV.count()
240 old_csvs = models.JournalCSV.all_csvs_before(dates.before_now(app.config.get("NON_PREMIUM_DELAY_SECONDS") + 86400))
242 # if removing the old_dds would leave us without any data dump records, then don't do anything
243 if total <= len(old_csvs):
244 logger("Not removing any old journal csv records, as this would leave us with none")
245 else:
246 for jc in old_csvs:
247 ac = jc.container
248 af = jc.filename
249 store.delete_file(ac, af)
250 jc.delete()
252 # Second, we're going to look at all records, and keep only the most recent one from each day
253 thin = models.JournalCSV.all_csvs_before(dates.before_now(86400))
255 def separate_by_newest_per_day(jcs):
256 # Group objects by their day
257 grouped_by_day = defaultdict(list)
258 for jc in jcs:
259 day = dates.parse(jc.export_day) # Extract the day (ignoring time)
260 grouped_by_day[day].append(jc)
262 newest_per_day = []
263 everything_else = []
265 # Find the newest object for each day
266 for day, items in grouped_by_day.items():
267 items.sort(key=lambda x: x.export_date, reverse=True) # Sort by date descending
268 newest_per_day.append(items[0]) # Add the newest object
269 everything_else.extend(items[1:]) # Add the rest to "everything else"
271 return newest_per_day, everything_else
273 # Separate the objects into newest_per_day and everything_else
274 newest_per_day, everything_else = separate_by_newest_per_day(thin)
275 for jc in everything_else:
276 ac = jc.container
277 af = jc.filename
278 try:
279 store.delete_file(ac, af)
280 except:
281 pass
282 jc.delete()
284 # Third we're going to check the container for files which don't have index records, and
285 # clean them up
287 # get the files in storage
288 container = app.config.get("STORE_JOURNAL_CSV_CONTAINER")
289 container_files = store.list(container)
291 # if the filename doesn't match anything, remove the file
292 for cf in container_files:
293 if cf in ignore:
294 continue
295 jc = models.JournalCSV.find_by_filename(cf)
296 if jc is None or len(jc) == 0:
297 logger("No related index record; Deleting file {x} from storage container {y}".format(x=cf, y=container))
298 store.delete_file(container, cf)
300 # Finally, we check all the records in the index and confirm their files exist, and if not
301 # remove the record
302 for jc in models.JournalCSV.iterate_unstable():
303 missing = False
304 if jc.container is not None and jc.filename is not None:
305 if jc.filename not in store.list(jc.container):
306 logger("File {x} in container {y} does not exist".format(x=jc.filename, y=jc.container))
307 missing = True
309 if missing:
310 logger("File missing for {x}".format(x=jc.id))
311 jc.delete()
313 def get_premium_csv(self):
314 # Get the latest data dump
315 return models.JournalCSV.find_latest()
317 def get_free_csv(self, cutoff=None):
318 if cutoff is None:
319 cutoff_seconds = app.config.get("NON_PREMIUM_DELAY_SECONDS", 2592000) + 86400
321 # if we are in the phase-in period, cap the delay to the phase in date
322 if app.config.get("PREMIUM_PHASE_IN", False):
323 phase_in_start = app.config.get("PREMIUM_PHASE_IN_START")
324 if phase_in_start is not None:
325 max_delay = dates.now() - phase_in_start
326 if max_delay.total_seconds() < cutoff_seconds:
327 cutoff_seconds = max_delay.total_seconds()
329 cutoff = dates.before_now(cutoff_seconds)
331 # get the first dump after the cutoff
332 option = models.JournalCSV.first_csv_after(cutoff=cutoff)
333 if option is not None:
334 return option
336 # if there was no such dump, just return the latest
337 return models.JournalCSV.find_latest()
339 def get_temporary_url(self, jc: models.JournalCSV):
340 container = jc.container
341 filename = jc.filename
343 if container is None or filename is None:
344 raise exceptions.NoSuchPropertyException("Cannot find container and filename for journal csv")
346 main_store = StoreFactory.get(constants.STORE__SCOPE__JOURNAL_CSV)
347 store_url = main_store.temporary_url(container, filename,
348 timeout=app.config.get("JOURNAL_CSV_URL_TIMEOUT", 3600))
349 return store_url
351 def delete_csv(self, id:str):
352 """
353 Delete a journal csv by id.
355 :param id: the id of the journal csv to delete
356 :return: True if deleted, False if not found
357 """
358 jc = models.JournalCSV.pull(id)
359 if jc is None:
360 return False
362 container = jc.container
363 filename = jc.filename
365 try:
366 store = StoreFactory.get(constants.STORE__SCOPE__JOURNAL_CSV)
367 store.delete_file(container, filename)
368 except:
369 pass
371 jc.delete()
372 return True