Coverage for portality/bll/services/article.py: 99%
245 statements
« prev ^ index » next coverage.py v6.4.2, created at 2022-07-22 15:59 +0100
1from portality.lib.argvalidate import argvalidate
2from portality import models
3from portality.bll import exceptions
4from portality.ui.messages import Messages
6from datetime import datetime
class ArticleService(object):
    """
    Business logic for creating, validating and de-duplicating articles.

    ~~Article:Service~~
    """
14 def batch_create_articles(self, articles, account, duplicate_check=True, merge_duplicate=True,
15 limit_to_account=True, add_journal_info=False):
16 """
17 Create a batch of articles in a single operation. Articles are either all created/updated or none of them are
19 This method checks for duplicates within the provided set and within the current database (if you set duplicate_check=True)
21 ~~->ArticleBatchCreate:Feature~~
23 :param articles: The list of article objects
24 :param account: The account creating the articles
25 :param duplicate_check: Whether to check for duplicates in the batch and in the index
26 :param merge_duplicate: Should duplicates be merged. If set to False, this may raise a DuplicateArticleException
27 :param limit_to_account: Should the ingest be limited only to articles for journals owned by the account. If set to True, may result in an IngestException
28 :param add_journal_info: Should we fetch the journal info and attach it to the article before save?
29 :return: a report on the state of the import: {success: x, fail: x, update: x, new: x, shared: [], unowned: [], unmatched: []}
30 """
31 # first validate the incoming arguments to ensure that we've got the right thing
32 argvalidate("batch_create_article", [
33 {"arg": articles, "instance": list, "allow_none": False, "arg_name": "articles"},
34 {"arg": account, "instance": models.Account, "allow_none": False, "arg_name": "account"},
35 {"arg": duplicate_check, "instance": bool, "allow_none": False, "arg_name": "duplicate_check"},
36 {"arg": merge_duplicate, "instance": bool, "allow_none": False, "arg_name": "merge_duplicate"},
37 {"arg": limit_to_account, "instance": bool, "allow_none": False, "arg_name": "limit_to_account"},
38 {"arg": add_journal_info, "instance": bool, "allow_none": False, "arg_name": "add_journal_info"}
39 ], exceptions.ArgumentException)
41 # 1. dedupe the batch
42 if duplicate_check:
43 batch_duplicates = self._batch_contains_duplicates(articles)
44 if batch_duplicates:
45 report = {"success": 0, "fail": len(articles), "update": 0, "new": 0, "shared": [], "unowned": [],
46 "unmatched": []}
47 raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_DUPLICATE, result=report)
49 # 2. check legitimate ownership
50 success = 0
51 fail = 0
52 update = 0
53 new = 0
54 all_shared = set()
55 all_unowned = set()
56 all_unmatched = set()
58 for article in articles:
59 try:
60 # ~~!ArticleBatchCreate:Feature->ArticleCreate:Feature~~
61 result = self.create_article(article, account,
62 duplicate_check=duplicate_check,
63 merge_duplicate=merge_duplicate,
64 limit_to_account=limit_to_account,
65 add_journal_info=add_journal_info,
66 dry_run=True)
67 except (exceptions.ArticleMergeConflict, exceptions.ConfigurationException):
68 raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_CONFLICT)
70 success += result.get("success", 0)
71 fail += result.get("fail", 0)
72 update += result.get("update", 0)
73 new += result.get("new", 0)
74 all_shared.update(result.get("shared", set()))
75 all_unowned.update(result.get("unowned", set()))
76 all_unmatched.update(result.get("unmatched", set()))
78 report = {"success": success, "fail": fail, "update": update, "new": new, "shared": all_shared,
79 "unowned": all_unowned, "unmatched": all_unmatched}
81 # if there were no failures in the batch, then we can do the save
82 if fail == 0:
83 for i in range(len(articles)):
84 block = i == len(articles) - 1
85 # block on the final save, so that when this method returns, all articles are
86 # available in the index
87 articles[i].save(blocking=block)
89 # return some stats on the import
90 return report
91 else:
92 raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_FAIL, result=report)
94 def _batch_contains_duplicates(self, articles):
95 dois = []
96 fulltexts = []
98 for article in articles:
99 doi = article.get_normalised_doi()
100 if doi is not None:
101 if doi in dois:
102 return True
103 dois.append(doi)
105 ft = article.get_normalised_fulltext()
106 if ft is not None:
107 if ft in fulltexts:
108 return True
109 fulltexts.append(ft)
111 return False
113 def _prepare_update_admin(self, article, duplicate, update_article_id, merge_duplicate):
115 is_update = 0
116 if duplicate is not None:
117 if duplicate.id != update_article_id:
118 # it means that doi or ft url has been changed so that it duplicates existing article
119 raise exceptions.DuplicateArticleException(Messages.EXCEPTION_IDENTIFIER_CHANGE_CLASH)
120 elif merge_duplicate:
121 is_update += 1
122 article.merge(duplicate)
123 elif merge_duplicate: # requested to update article has both url and doi changed to new values - no duplicate detected
124 is_update += 1
125 art = models.Article.pull(update_article_id)
126 article.merge(art)
128 return is_update
130 def _prepare_update_publisher(self, article, duplicate, merge_duplicate, account, limit_to_account):
131 # before saving, we need to determine whether this is a new article
132 # or an update
133 is_update = 0
135 if duplicate is not None: # else -> it is new article
136 # check if can update the duplicate - if is the owner
137 has_permissions_result = self.has_permissions(account, article, limit_to_account)
138 if isinstance(has_permissions_result, bool) and has_permissions_result == True:
139 doi_or_ft_updated = self._doi_or_fulltext_updated(article, duplicate.id)
140 if doi_or_ft_updated or not merge_duplicate:
141 raise exceptions.DuplicateArticleException(Messages.EXCEPTION_IDENTIFIER_CHANGE)
142 else:
143 is_update += 1
144 article.merge(duplicate)
145 else:
146 raise exceptions.DuplicateArticleException(Messages.EXCEPTION_DUPLICATE_NO_PERMISSION)
147 return is_update
149 # here we should have the final point of validation for all incoming articles
150 def _validate_issns(self, article):
151 # only 2 issns: one print, one electronic
152 b = article.bibjson()
153 pissn = b.get_identifiers("pissn")
154 eissn = b.get_identifiers("eissn")
156 if len(pissn) > 1 or len(eissn) > 1:
157 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_TOO_MANY_ISSNS)
159 pissn = b.get_one_identifier("pissn")
160 eissn = b.get_one_identifier("eissn")
162 #pissn and eissn identical
163 if pissn == eissn:
164 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_IDENTICAL_PISSN_AND_EISSN)
167 def create_article(self, article, account, duplicate_check=True, merge_duplicate=True,
168 limit_to_account=True, add_journal_info=False, dry_run=False, update_article_id=None):
170 """
171 Create an individual article in the database
173 This method will check and merge any duplicates, and report back on successes and failures in a manner consistent with
174 batch_create_articles.
176 ~~->ArticleCreate:Feature~~
178 :param article: The article to be created
179 :param account: The account creating the article
180 :param duplicate_check: Whether to check for duplicates in the database
181 :param merge_duplicate: Whether to merge duplicate if found. If set to False, may result in a DuplicateArticleException
182 :param limit_to_account: Whether to limit create to when the account owns the journal to which the article belongs
183 :param add_journal_info: Should we fetch the journal info and attach it to the article before save?
184 :param dry_run: Whether to actuall save, or if this is just to either see if it would work, or to prep for a batch ingest
185 :param update_article_id: The article id that it is supposed to be an update to; taken into consideration ONLY
186 if duplicate_check == True and merge_duplicate == True
187 :return:
188 """
189 # first validate the incoming arguments to ensure that we've got the right thing
190 argvalidate("create_article", [
191 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
192 {"arg": account, "instance": models.Account, "allow_none": False, "arg_name": "account"},
193 {"arg": duplicate_check, "instance": bool, "allow_none": False, "arg_name": "duplicate_check"},
194 {"arg": merge_duplicate, "instance": bool, "allow_none": False, "arg_name": "merge_duplicate"},
195 {"arg": limit_to_account, "instance": bool, "allow_none": False, "arg_name": "limit_to_account"},
196 {"arg": add_journal_info, "instance": bool, "allow_none": False, "arg_name": "add_journal_info"},
197 {"arg": dry_run, "instance": bool, "allow_none": False, "arg_name": "dry_run"},
198 {"arg": update_article_id, "instance": str, "allow_none": True, "arg_name": "update_article_id"}
199 ], exceptions.ArgumentException)
201 # quickly validate that the article is acceptable - it must have a DOI and/or a fulltext
202 # this raises an exception if the article is not acceptable, containing all the relevant validation details
204 try:
205 self.is_acceptable(article)
206 except Exception as e:
207 raise e
209 has_permissions_result = self.has_permissions(account, article, limit_to_account)
210 if isinstance(has_permissions_result,dict):
211 return has_permissions_result
213 is_update = 0
214 if duplicate_check:
215 # ~~!ArticleCreate:Feature->ArticleDeduplication:Feature~~
216 duplicate = self.get_duplicate(article)
217 try:
218 if account.has_role("admin") and update_article_id is not None: # is update_article_id is None then treat as normal publisher upload
219 # for testing by admin
220 is_update = self._prepare_update_admin(article, duplicate, update_article_id, merge_duplicate)
221 else:
222 is_update = self._prepare_update_publisher(article, duplicate, merge_duplicate, account, limit_to_account)
223 except (exceptions.DuplicateArticleException, exceptions.ArticleMergeConflict, exceptions.ConfigurationException) as e:
224 raise e
226 if add_journal_info:
227 article.add_journal_metadata()
229 # finally, save the new article
230 if not dry_run:
231 article.save()
233 return {"success": 1, "fail": 0, "update": is_update, "new": 1 - is_update, "shared": set(), "unowned": set(),
234 "unmatched": set()}
237 def has_permissions(self, account, article, limit_to_account):
239 if limit_to_account:
240 legit = account.has_role("admin") or self.is_legitimate_owner(article, account.id)
241 if not legit:
242 owned, shared, unowned, unmatched = self.issn_ownership_status(article, account.id)
243 return {"success": 0, "fail": 1, "update": 0, "new": 0, "shared": shared, "unowned": unowned,
244 "unmatched": unmatched}
245 return True
248 def is_acceptable(self, article):
249 """
250 conduct some deep validation on the article to make sure we will accept it
251 or the moment, this just means making sure it has a DOI and a fulltext
252 """
253 bj = article.bibjson()
255 # do we have a DOI. If so, no need to go further
256 doi = bj.get_one_identifier(bj.DOI)
257 ft = bj.get_single_url(bj.FULLTEXT)
258 if doi is None and ft is None:
259 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_NO_DOI_NO_FULLTEXT)
261 self._validate_issns(article)
263 def is_legitimate_owner(self, article, owner):
264 """
265 Determine if the owner id is the owner of the article
267 :param article: an article model
268 :param owner: string account ID
269 :return: True or False
270 """
271 # first validate the incoming arguments to ensure that we've got the right thing
272 argvalidate("is_legitimate_owner", [
273 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
274 {"arg": owner, "instance": str, "allow_none": False, "arg_name": "owner"}
275 ], exceptions.ArgumentException)
277 # get all the issns for the article
278 b = article.bibjson()
279 article_issns = b.get_identifiers(b.P_ISSN)
280 article_issns += b.get_identifiers(b.E_ISSN)
282 # check each issn against the index, and if a related journal is found
283 # record the owner of that journal
284 owners = []
285 seen_journal_issns = {}
286 for issn in article_issns:
287 journals = models.Journal.find_by_issn(issn)
288 if journals is not None and len(journals) > 0:
289 for j in journals:
290 owners.append(j.owner)
291 if j.owner not in seen_journal_issns:
292 seen_journal_issns[j.owner] = []
293 seen_journal_issns[j.owner] += j.bibjson().issns()
295 # deduplicate the list of owners
296 owners = list(set(owners))
298 # no owner means we can't confirm
299 if len(owners) == 0:
300 return False
302 # multiple owners means ownership of this article is confused
303 if len(owners) > 1:
304 return False
306 # if the found owner is not the same as the desired owner, return false
307 if owners[0] != owner:
308 return False
310 # single owner must still know of all supplied issns
311 journal_issns = set(seen_journal_issns[owner])
312 for issn in article_issns:
313 if issn not in journal_issns:
314 return False
316 return True
318 def _doi_or_fulltext_updated(self, new_article, update_id):
319 if new_article.id is None:
320 return False
322 old_art = models.Article.pull(update_id) # ~~->Article:Model~~
323 old_doi = old_art.get_normalised_doi()
324 old_ft_url = old_art.get_normalised_fulltext()
326 new_doi = new_article.get_normalised_doi()
327 new_ft_url = new_article.get_normalised_fulltext()
329 return old_doi != new_doi or old_ft_url != new_ft_url
331 def issn_ownership_status(self, article, owner):
332 """
333 Determine the ownership status of the supplied owner over the issns in the given article
335 This will give you a tuple back which lists the following (in order):
337 * which issns are owned by that owner
338 * which issns are shared with another owner
339 * which issns are not owned by this owner
340 * which issns are not found in the DOAJ database
342 :param article:
343 :param owner:
344 :return:
345 """
346 # first validate the incoming arguments to ensure that we've got the right thing
347 argvalidate("issn_ownership_status", [
348 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
349 {"arg": owner, "instance": str, "allow_none": False, "arg_name": "owner"}
350 ], exceptions.ArgumentException)
352 # get all the issns for the article
353 b = article.bibjson()
354 issns = b.get_identifiers(b.P_ISSN)
355 issns += b.get_identifiers(b.E_ISSN)
357 owned = []
358 shared = []
359 unowned = []
360 unmatched = []
362 # check each issn against the index, and if a related journal is found
363 # record the owner of that journal
364 seen_issns = {}
365 for issn in issns:
366 journals = models.Journal.find_by_issn(issn)
367 if journals is not None and len(journals) > 0:
368 for j in journals:
369 if issn not in seen_issns:
370 seen_issns[issn] = set()
371 if j.owner is not None:
372 seen_issns[issn].add(j.owner)
374 for issn in issns:
375 if issn not in list(seen_issns.keys()):
376 unmatched.append(issn)
378 for issn, owners in seen_issns.items():
379 owners = list(owners)
380 if len(owners) == 0:
381 unowned.append(issn)
382 elif len(owners) == 1 and owners[0] == owner:
383 owned.append(issn)
384 elif len(owners) == 1 and owners[0] != owner:
385 unowned.append(issn)
386 elif len(owners) > 1:
387 if owner in owners:
388 shared.append(issn)
389 else:
390 unowned.append(issn)
392 return owned, shared, unowned, unmatched
394 def get_duplicate(self, article):
395 """
396 Get at most one one, most recent, duplicate article for the supplied article.
398 If the owner id is provided, this will limit the search to duplicates owned by that owner
400 ~~->ArticleDeduplication:Feature~~
402 :param article:
403 :param owner:
404 :return:
405 """
406 # first validate the incoming arguments to ensure that we've got the right thing
407 argvalidate("get_duplicate", [
408 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
409 ], exceptions.ArgumentException)
411 article.prep()
412 dup = self.get_duplicates(article, max_results=2)
413 if len(dup) > 1:
414 raise exceptions.ArticleMergeConflict(Messages.EXCEPTION_ARTICLE_MERGE_CONFLICT)
415 elif dup:
416 return dup.pop()
417 else:
418 return None
420 def get_duplicates(self, article, max_results=10):
421 """
422 Get all known duplicates of an article
424 If the owner id is provided, this will limit the search to duplicates owned by that owner
426 ~~->ArticleDeduplication:Feature~~
428 :param article:
429 :return:
430 """
431 # first validate the incoming arguments to ensure that we've got the right thing
432 argvalidate("get_duplicates", [
433 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
434 ], exceptions.ArgumentException)
436 possible_articles_dict = self.discover_duplicates(article, max_results)
437 if not possible_articles_dict:
438 return []
440 # We don't need the details of duplicate types, so flatten the lists.
441 all_possible_articles = [article for dup_type in list(possible_articles_dict.values()) for article in dup_type]
443 # An article may fulfil more than one duplication criteria, so needs to be de-duplicated
444 ids = []
445 possible_articles = []
446 for a in all_possible_articles:
447 if a.id not in ids:
448 ids.append(a.id)
449 possible_articles.append(a)
451 # Sort the articles newest -> oldest by last_updated so we can get the most recent at [0]
452 possible_articles.sort(key=lambda x: datetime.strptime(x.last_updated, "%Y-%m-%dT%H:%M:%SZ"), reverse=True)
454 return possible_articles[:max_results]
456 def discover_duplicates(self, article, results_per_match_type=10, include_article=True):
457 """
458 Identify duplicates, separated by duplication criteria
460 If the owner id is provided, this will limit the search to duplicates owned by that owner
462 ~~->ArticleDeduplication:Feature~~
464 :param article:
465 :return:
466 """
467 # first validate the incoming arguments to ensure that we've got the right thing
468 argvalidate("discover_duplicates", [
469 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
470 ], exceptions.ArgumentException)
472 # if we get more than one result, we'll record them here, and then at the end
473 # if we haven't got a definitive match we'll pick the most likely candidate
474 # (this isn't as bad as it sounds - the identifiers are pretty reliable, this catches
475 # issues like where there are already duplicates in the data, and not matching one
476 # of them propagates the issue)
477 possible_articles = {}
478 found = False
480 # Checking by DOI is our first step
481 # dois = b.get_identifiers(b.DOI)
482 doi = article.get_normalised_doi()
483 if doi is not None:
484 if isinstance(doi, str) and doi != '':
485 articles = models.Article.duplicates(doi=doi, size=results_per_match_type)
486 if len(articles) > 0:
487 if include_article:
488 possible_articles['doi'] = [a for a in articles]
489 else:
490 possible_articles['doi'] = [a for a in articles if a.id != article.id]
491 if len(possible_articles['doi']) > 0:
492 found = True
494 # Second test is to look by fulltext url
495 fulltext = article.get_normalised_fulltext()
496 if fulltext is not None:
497 articles = models.Article.duplicates(fulltexts=fulltext, size=results_per_match_type)
498 if len(articles) > 0:
499 if include_article:
500 possible_articles['fulltext'] = [a for a in articles]
501 else:
502 possible_articles['fulltext'] = [a for a in articles if a.id != article.id]
503 if possible_articles['fulltext']:
504 found = True
506 if doi is None and fulltext is None:
507 raise exceptions.DuplicateArticleException(Messages.EXCEPTION_DETECT_DUPLICATE_NO_ID)
509 return possible_articles if found else None