Coverage for portality / bll / services / article.py: 98%
294 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-04 09:41 +0100
1from portality.lib import dates
2from portality.lib.argvalidate import argvalidate
3from portality import models, constants
4from portality.bll import exceptions, DOAJ
5from portality.ui.messages import Messages
6from portality.lib.dataobj import DataStructureException
10class ArticleService(object):
11 """
12 ~~Article:Service~~
13 """
    def batch_create_articles(self, articles, account, duplicate_check=True, merge_duplicate=True,
                              limit_to_account=True, add_journal_info=False):
        """
        Create a batch of articles in a single operation. Articles are either all created/updated or none of them are.

        This method checks for duplicates within the provided set and within the current database
        (if you set duplicate_check=True)

        ~~->ArticleBatchCreate:Feature~~

        :param articles: The list of article objects
        :param account: The account creating the articles
        :param duplicate_check: Whether to check for duplicates in the batch and in the index
        :param duplicate_check: Whether to check for duplicates in the batch and in the index
        :param merge_duplicate: Should duplicates be merged. If set to False, this may raise a DuplicateArticleException
        :param limit_to_account: Should the ingest be limited only to articles for journals owned by the account.
            If set to True, may result in an IngestException
        :param add_journal_info: Should we fetch the journal info and attach it to the article before save?
        :return: a report on the state of the import:
            {success: x, fail: x, update: x, new: x, shared: [], unowned: [], unmatched: []}
        """
        # first validate the incoming arguments to ensure that we've got the right thing
        argvalidate("batch_create_article", [
            {"arg": articles, "instance": list, "allow_none": False, "arg_name": "articles"},
            {"arg": account, "instance": models.Account, "allow_none": False, "arg_name": "account"},
            {"arg": duplicate_check, "instance": bool, "allow_none": False, "arg_name": "duplicate_check"},
            {"arg": merge_duplicate, "instance": bool, "allow_none": False, "arg_name": "merge_duplicate"},
            {"arg": limit_to_account, "instance": bool, "allow_none": False, "arg_name": "limit_to_account"},
            {"arg": add_journal_info, "instance": bool, "allow_none": False, "arg_name": "add_journal_info"}
        ], exceptions.ArgumentException)

        # 1. dedupe the batch
        if duplicate_check:
            batch_duplicates = self._batch_contains_duplicates(articles)
            if batch_duplicates:
                # NOTE(review): this failure report uses lists for shared/unowned/unmatched while
                # the success-path report below uses sets — confirm consumers accept both shapes
                report = {"success": 0, "fail": len(articles), "update": 0, "new": 0, "shared": [], "unowned": [],
                          "unmatched": []}
                raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_DUPLICATE, result=report)

        # 2. check legitimate ownership
        success = 0
        fail = 0
        update = 0
        new = 0
        all_shared = set()
        all_unowned = set()
        all_unmatched = set()

        # Hold on to the exception so we can raise it later
        e_not_acceptable = None

        for article in articles:
            try:
                # ~~!ArticleBatchCreate:Feature->ArticleCreate:Feature~~
                # dry_run=True: validate and merge without saving, so the whole batch can be
                # committed (or abandoned) atomically below
                result = self.create_article(article, account,
                                             duplicate_check=duplicate_check,
                                             merge_duplicate=merge_duplicate,
                                             limit_to_account=limit_to_account,
                                             add_journal_info=add_journal_info,
                                             dry_run=True)
            except (exceptions.ArticleMergeConflict, exceptions.ConfigurationException):
                raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_CONFLICT)
            except exceptions.ArticleNotAcceptable as e:
                # The ArticleNotAcceptable exception is a superset of reasons we can't match a journal to this article
                e_not_acceptable = e
                result = {'fail': 1, 'unmatched': set(article.bibjson().issns())}

            # accumulate the per-article result into the batch totals
            success += result.get("success", 0)
            fail += result.get("fail", 0)
            update += result.get("update", 0)
            new += result.get("new", 0)
            all_shared.update(result.get("shared", set()))
            all_unowned.update(result.get("unowned", set()))
            all_unmatched.update(result.get("unmatched", set()))

        report = {"success": success, "fail": fail, "update": update, "new": new, "shared": all_shared,
                  "unowned": all_unowned, "unmatched": all_unmatched}

        # if there were no failures in the batch, then we can do the save
        if fail == 0:
            for i in range(len(articles)):
                block = i == len(articles) - 1
                # block on the final save, so that when this method returns, all articles are
                # available in the index
                articles[i].save(blocking=block)

            # return some stats on the import
            return report
        else:
            # surface the most specific recorded exception; otherwise a generic batch failure
            if e_not_acceptable is not None:
                raise exceptions.ArticleNotAcceptable(message=e_not_acceptable.message, result=report)
            raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_FAIL, result=report)
104 @staticmethod
105 def _batch_contains_duplicates(articles):
106 dois = []
107 fulltexts = []
109 for article in articles:
110 doi = article.get_normalised_doi()
111 if doi is not None:
112 if doi in dois:
113 return True
114 dois.append(doi)
116 ft = article.get_normalised_fulltext()
117 if ft is not None:
118 if ft in fulltexts:
119 return True
120 fulltexts.append(ft)
122 return False
124 @staticmethod
125 def _prepare_update_admin(article, duplicate, update_article_id, merge_duplicate):
127 is_update = 0
128 if duplicate is not None:
129 if update_article_id is not None and duplicate.id != update_article_id:
130 # it means that doi or ft url has been changed so that it duplicates existing article
131 raise exceptions.DuplicateArticleException(Messages.EXCEPTION_IDENTIFIER_CHANGE_CLASH)
132 elif merge_duplicate:
133 is_update += 1
134 article.merge(duplicate)
135 elif update_article_id is not None and merge_duplicate: # requested to update article has both url and doi changed to new values - no duplicate detected
136 is_update += 1
137 art = models.Article.pull(update_article_id)
138 article.merge(art)
140 return is_update
142 def _prepare_update_publisher(self, article, duplicate, merge_duplicate, account, limit_to_account):
143 # before saving, we need to determine whether this is a new article
144 # or an update
145 is_update = 0
147 if duplicate is not None: # else -> it is new article
148 # check if can update the duplicate - if is the owner
149 has_permissions_result = self.has_permissions(account, article, limit_to_account)
150 if isinstance(has_permissions_result, bool) and has_permissions_result == True:
151 doi_or_ft_updated = self._doi_or_fulltext_updated(article, duplicate.id)
152 if doi_or_ft_updated or not merge_duplicate:
153 raise exceptions.DuplicateArticleException(Messages.EXCEPTION_IDENTIFIER_CHANGE)
154 else:
155 is_update += 1
156 article.merge(duplicate)
157 else:
158 raise exceptions.DuplicateArticleException(Messages.EXCEPTION_DUPLICATE_NO_PERMISSION)
159 return is_update
161 # here we should have the final point of validation for all incoming articles
162 @staticmethod
163 def _validate_issns(article_bibjson: models.ArticleBibJSON):
164 # only 2 issns: one print, one electronic
165 pissn = article_bibjson.get_identifiers("pissn")
166 eissn = article_bibjson.get_identifiers("eissn")
168 if len(pissn) > 1 or len(eissn) > 1:
169 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_TOO_MANY_ISSNS)
171 # no pissn or eissn
172 if not pissn and not eissn:
173 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_NO_ISSNS)
175 # pissn and eissn identical
176 if pissn == eissn:
177 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_IDENTICAL_PISSN_AND_EISSN)
179 def create_article(self, article, account, duplicate_check=True, merge_duplicate=True,
180 limit_to_account=True, add_journal_info=False, dry_run=False, update_article_id=None):
182 """
183 Create an individual article in the database
185 This method will check and merge any duplicates, and report back on successes and failures in a manner consistent with
186 batch_create_articles.
188 ~~->ArticleCreate:Feature~~
190 :param article: The article to be created
191 :param account: The account creating the article
192 :param duplicate_check: Whether to check for duplicates in the database
193 :param merge_duplicate: Whether to merge duplicate if found. If set to False, may result in a DuplicateArticleException
194 :param limit_to_account: Whether to limit create to when the account owns the journal to which the article belongs
195 :param add_journal_info: Should we fetch the journal info and attach it to the article before save?
196 :param dry_run: Whether to actuall save, or if this is just to either see if it would work, or to prep for a batch ingest
197 :param update_article_id: The article id that it is supposed to be an update to; taken into consideration ONLY
198 if duplicate_check == True and merge_duplicate == True
199 :return:
200 """
201 # first validate the incoming arguments to ensure that we've got the right thing
202 argvalidate("create_article", [
203 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
204 {"arg": account, "instance": models.Account, "allow_none": False, "arg_name": "account"},
205 {"arg": duplicate_check, "instance": bool, "allow_none": False, "arg_name": "duplicate_check"},
206 {"arg": merge_duplicate, "instance": bool, "allow_none": False, "arg_name": "merge_duplicate"},
207 {"arg": limit_to_account, "instance": bool, "allow_none": False, "arg_name": "limit_to_account"},
208 {"arg": add_journal_info, "instance": bool, "allow_none": False, "arg_name": "add_journal_info"},
209 {"arg": dry_run, "instance": bool, "allow_none": False, "arg_name": "dry_run"},
210 {"arg": update_article_id, "instance": str, "allow_none": True, "arg_name": "update_article_id"}
211 ], exceptions.ArgumentException)
213 has_permissions_result = self.has_permissions(account, article, limit_to_account)
214 if isinstance(has_permissions_result, dict):
215 return has_permissions_result
217 # Validate that the article is acceptable: it must have a DOI and/or a fulltext & match only one in_doaj journal
218 # this raises an exception if the article is not acceptable, containing all the relevant validation details
219 # We do this after the permissions check because that gives a detailed result whereas this throws an exception
220 try:
221 self.is_acceptable(article)
222 except Exception as e:
223 raise e
225 is_update = 0
226 if duplicate_check:
227 # ~~!ArticleCreate:Feature->ArticleDeduplication:Feature~~
228 duplicate = self.get_duplicate(article)
229 try:
230 if account.has_role("admin"): # is update_article_id is None then treat as normal publisher upload
231 # for testing by admin
232 is_update = self._prepare_update_admin(article, duplicate, update_article_id, merge_duplicate)
233 else:
234 is_update = self._prepare_update_publisher(article, duplicate, merge_duplicate, account, limit_to_account)
235 except (exceptions.DuplicateArticleException, exceptions.ArticleMergeConflict, exceptions.ConfigurationException) as e:
236 raise e
238 if add_journal_info:
239 article.add_journal_metadata()
241 # finally, save the new article
242 if not dry_run:
243 article.save()
244 eventsSvc = DOAJ.eventsService()
245 eventsSvc.trigger(models.Event(constants.EVENT_ARTICLE_SAVE, account.id, {
246 "article": article.data
247 }))
249 return {"success": 1, "fail": 0, "update": is_update, "new": 1 - is_update, "shared": set(), "unowned": set(),
250 "unmatched": set()}
252 def has_permissions(self, account, article, limit_to_account):
254 if limit_to_account:
255 legit = account.has_role("admin") or self.is_legitimate_owner(article, account.id)
256 if not legit:
257 owned, shared, unowned, unmatched = self.issn_ownership_status(article, account.id)
258 return {"success": 0, "fail": 1, "update": 0, "new": 0, "shared": shared, "unowned": unowned,
259 "unmatched": unmatched}
260 return True
262 def is_acceptable(self, article: models.Article):
263 """
264 Conduct some deep validation on the article to make sure we will accept it
265 this just means making sure it has a DOI or fulltext, and that its ISSNs
266 match a single journal that is in DOAJ.
267 """
268 try:
269 bj = article.bibjson()
270 except DataStructureException as e:
271 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_INVALID_BIBJSON + e.message)
273 # do we have a DOI. If so, no need to go further
274 doi = bj.get_one_identifier(bj.DOI)
275 ft = bj.get_single_url(bj.FULLTEXT)
276 if doi is None and ft is None:
277 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_NO_DOI_NO_FULLTEXT)
279 self._validate_issns(bj)
281 try:
282 self.match_journal_with_validation(bj)
283 except exceptions.ArticleNotAcceptable:
284 raise
287 @staticmethod
288 def match_journal_with_validation(article_bibjson: models.ArticleBibJSON):
289 pissn = article_bibjson.get_one_identifier("pissn")
290 eissn = article_bibjson.get_one_identifier("eissn")
292 issns = []
294 if pissn is not None:
295 issns.append(pissn)
296 if eissn is not None:
297 issns.append(eissn)
299 # Find an exact match that is in DOAJ
300 journal = models.Journal.find_by_issn_exact(issns, in_doaj=True)
302 match len(journal):
303 case 0:
304 # Nothing back from in_doaj search, determine if withdrawn or nonexistent
305 if len(models.Journal.find_by_issn_exact(issns, in_doaj=False)) > 0:
306 raise exceptions.ArticleNotAcceptable(
307 message=Messages.EXCEPTION_ADDING_ARTICLE_TO_WITHDRAWN_JOURNAL)
308 else:
309 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_MISMATCHED_ISSNS)
310 case 1:
311 # check if only one journal matches pissn and eissn and if they are in the correct fields
312 # no need to check eissn, if pissn matches, pissn and eissn are different and only 1 journal has been found - then eissn matches too
313 if pissn is not None:
314 if journal[0].bibjson().pissn != pissn:
315 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_MISMATCHED_ISSNS)
316 if eissn is not None:
317 if journal[0].bibjson().eissn != eissn:
318 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_MISMATCHED_ISSNS)
319 case _:
320 # >1
321 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_MISMATCHED_ISSNS)
323 return journal[0]
325 @staticmethod
326 def is_legitimate_owner(article, owner):
327 """
328 Determine if the owner id is the owner of the article
330 :param article: an article model
331 :param owner: string account ID
332 :return: True or False
333 """
334 # first validate the incoming arguments to ensure that we've got the right thing
335 argvalidate("is_legitimate_owner", [
336 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
337 {"arg": owner, "instance": str, "allow_none": False, "arg_name": "owner"}
338 ], exceptions.ArgumentException)
340 # get all the issns for the article
341 b = article.bibjson()
342 article_issns = b.get_identifiers(b.P_ISSN)
343 article_issns += b.get_identifiers(b.E_ISSN)
345 # check each issn against the index, and if a related journal is found
346 # record the owner of that journal
347 owners = []
348 seen_journal_issns = {}
349 for issn in article_issns:
350 journals = models.Journal.find_by_issn(issn)
351 if journals is not None and len(journals) > 0:
352 for j in journals:
353 owners.append(j.owner)
354 if j.owner not in seen_journal_issns:
355 seen_journal_issns[j.owner] = []
356 seen_journal_issns[j.owner] += j.bibjson().issns()
358 # deduplicate the list of owners
359 owners = list(set(owners))
361 # no owner means we can't confirm
362 if len(owners) == 0:
363 return False
365 # multiple owners means ownership of this article is confused
366 if len(owners) > 1:
367 return False
369 # if the found owner is not the same as the desired owner, return false
370 if owners[0] != owner:
371 return False
373 # single owner must still know of all supplied issns
374 journal_issns = set(seen_journal_issns[owner])
375 for issn in article_issns:
376 if issn not in journal_issns:
377 return False
379 return True
381 @staticmethod
382 def _doi_or_fulltext_updated(new_article, update_id):
383 if new_article.id is None:
384 return False
386 old_art = models.Article.pull(update_id) # ~~->Article:Model~~
387 old_doi = old_art.get_normalised_doi()
388 old_ft_url = old_art.get_normalised_fulltext()
390 new_doi = new_article.get_normalised_doi()
391 new_ft_url = new_article.get_normalised_fulltext()
393 return old_doi != new_doi or old_ft_url != new_ft_url
395 @staticmethod
396 def issn_ownership_status(article, owner):
397 """
398 Determine the ownership status of the supplied owner over the issns in the given article
400 This will give you a tuple back which lists the following (in order):
402 * which issns are owned by that owner
403 * which issns are shared with another owner
404 * which issns are not owned by this owner
405 * which issns are not found in the DOAJ database
407 :param article:
408 :param owner:
409 :return:
410 """
411 # first validate the incoming arguments to ensure that we've got the right thing
412 argvalidate("issn_ownership_status", [
413 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
414 {"arg": owner, "instance": str, "allow_none": False, "arg_name": "owner"}
415 ], exceptions.ArgumentException)
417 # get all the issns for the article
418 b = article.bibjson()
419 issns = b.get_identifiers(b.P_ISSN)
420 issns += b.get_identifiers(b.E_ISSN)
422 # FIXME: Duplicate check due to inconsistent control flow (result vs exception)
423 if len(issns) == 0:
424 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_NO_ISSNS)
426 owned = []
427 shared = []
428 unowned = []
429 unmatched = []
431 # check each issn against the index, and if a related journal is found
432 # record the owner of that journal
433 seen_issns = {}
434 for issn in issns:
435 journals = models.Journal.find_by_issn(issn)
436 if journals is not None and len(journals) > 0:
437 for j in journals:
438 if issn not in seen_issns:
439 seen_issns[issn] = set()
440 if j.owner is not None:
441 seen_issns[issn].add(j.owner)
443 for issn in issns:
444 if issn not in list(seen_issns.keys()):
445 unmatched.append(issn)
447 for issn, owners in seen_issns.items():
448 owners = list(owners)
449 if len(owners) == 0:
450 unowned.append(issn)
451 elif len(owners) == 1 and owners[0] == owner:
452 owned.append(issn)
453 elif len(owners) == 1 and owners[0] != owner:
454 unowned.append(issn)
455 elif len(owners) > 1:
456 if owner in owners:
457 shared.append(issn)
458 else:
459 unowned.append(issn)
461 return owned, shared, unowned, unmatched
463 def get_duplicate(self, article):
464 """
465 Get at most one, most recent, duplicate article for the supplied article.
467 ~~->ArticleDeduplication:Feature~~
469 :param article:
470 :return:
471 """
472 # first validate the incoming arguments to ensure that we've got the right thing
473 argvalidate("get_duplicate", [
474 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
475 ], exceptions.ArgumentException)
477 article.prep()
478 dup = self.get_duplicates(article, max_results=2)
479 if len(dup) > 1:
480 raise exceptions.ArticleMergeConflict(Messages.EXCEPTION_ARTICLE_MERGE_CONFLICT)
481 elif dup:
482 return dup.pop()
483 else:
484 return None
486 def get_duplicates(self, article, max_results=10):
487 """
488 Get all known duplicates of an article
490 ~~->ArticleDeduplication:Feature~~
492 :param article: Article of interest
493 :param max_results: Maximum number of duplicate candidates to return
494 :return: A list of possible duplicates
495 """
496 # first validate the incoming arguments to ensure that we've got the right thing
497 argvalidate("get_duplicates", [
498 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
499 ], exceptions.ArgumentException)
501 possible_articles_dict = self.discover_duplicates(article, max_results)
502 if not possible_articles_dict:
503 return []
505 # We don't need the details of duplicate types, so flatten the lists.
506 all_possible_articles = [article for dup_type in list(possible_articles_dict.values()) for article in dup_type]
508 # An article may fulfil more than one duplication criteria, so needs to be de-duplicated
509 ids = []
510 possible_articles = []
511 for a in all_possible_articles:
512 if a.id not in ids:
513 ids.append(a.id)
514 possible_articles.append(a)
516 # Sort the articles newest -> oldest by last_updated so we can get the most recent at [0]
517 possible_articles.sort(key=lambda x: dates.parse(x.last_updated), reverse=True)
519 return possible_articles[:max_results]
521 @staticmethod
522 def discover_duplicates(article, results_per_match_type=10, include_article=True):
523 """
524 Identify duplicates, separated by duplication criteria
526 If the owner id is provided, this will limit the search to duplicates owned by that owner
528 ~~->ArticleDeduplication:Feature~~
530 :param article:
531 :param results_per_match_type
532 :param include_article
533 :return:
534 """
535 # first validate the incoming arguments to ensure that we've got the right thing
536 argvalidate("discover_duplicates", [
537 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
538 ], exceptions.ArgumentException)
540 # if we get more than one result, we'll record them here, and then at the end
541 # if we haven't got a definitive match we'll pick the most likely candidate
542 # (this isn't as bad as it sounds - the identifiers are pretty reliable, this catches
543 # issues like where there are already duplicates in the data, and not matching one
544 # of them propagates the issue)
545 possible_articles = {}
546 found = False
548 # Checking by DOI is our first step
549 # dois = b.get_identifiers(b.DOI)
550 doi = article.get_normalised_doi()
551 if doi is not None:
552 if isinstance(doi, str) and doi != '':
553 articles = models.Article.duplicates(doi=doi, size=results_per_match_type)
554 if len(articles) > 0:
555 if include_article:
556 possible_articles['doi'] = [a for a in articles]
557 else:
558 possible_articles['doi'] = [a for a in articles if a.id != article.id]
559 if len(possible_articles['doi']) > 0:
560 found = True
562 # Second test is to look by fulltext url
563 fulltext = article.get_normalised_fulltext()
564 if fulltext is not None:
565 articles = models.Article.duplicates(fulltexts=fulltext, size=results_per_match_type)
566 if len(articles) > 0:
567 if include_article:
568 possible_articles['fulltext'] = [a for a in articles]
569 else:
570 possible_articles['fulltext'] = [a for a in articles if a.id != article.id]
571 if possible_articles['fulltext']:
572 found = True
574 if doi is None and fulltext is None:
575 raise exceptions.DuplicateArticleException(Messages.EXCEPTION_DETECT_DUPLICATE_NO_ID)
577 return possible_articles if found else None