Coverage for portality / bll / services / article.py: 97%

294 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-05 00:09 +0100

1from portality.lib import dates 

2from portality.lib.argvalidate import argvalidate 

3from portality import models, constants 

4from portality.bll import exceptions, DOAJ 

5from portality.ui.messages import Messages 

6from portality.lib.dataobj import DataStructureException 

7 

8 

9 

10class ArticleService(object): 

11 """ 

12 ~~Article:Service~~ 

13 """ 

14 

    def batch_create_articles(self, articles, account, duplicate_check=True, merge_duplicate=True,
                              limit_to_account=True, add_journal_info=False):
        """
        Create a batch of articles in a single operation. Articles are either all created/updated or none of them are

        This method checks for duplicates within the provided set and within the current database (if you set duplicate_check=True)

        ~~->ArticleBatchCreate:Feature~~

        :param articles: The list of article objects
        :param account: The account creating the articles
        :param duplicate_check: Whether to check for duplicates in the batch and in the index
        :param merge_duplicate: Should duplicates be merged. If set to False, this may raise a DuplicateArticleException
        :param limit_to_account: Should the ingest be limited only to articles for journals owned by the account. If set to True, may result in an IngestException
        :param add_journal_info: Should we fetch the journal info and attach it to the article before save?
        :return: a report on the state of the import: {success: x, fail: x, update: x, new: x, shared: [], unowned: [], unmatched: []}
        """
        # first validate the incoming arguments to ensure that we've got the right thing
        argvalidate("batch_create_article", [
            {"arg": articles, "instance": list, "allow_none": False, "arg_name": "articles"},
            {"arg": account, "instance": models.Account, "allow_none": False, "arg_name": "account"},
            {"arg": duplicate_check, "instance": bool, "allow_none": False, "arg_name": "duplicate_check"},
            {"arg": merge_duplicate, "instance": bool, "allow_none": False, "arg_name": "merge_duplicate"},
            {"arg": limit_to_account, "instance": bool, "allow_none": False, "arg_name": "limit_to_account"},
            {"arg": add_journal_info, "instance": bool, "allow_none": False, "arg_name": "add_journal_info"}
        ], exceptions.ArgumentException)

        # 1. dedupe the batch: any intra-batch duplicate identifier fails the whole upload
        if duplicate_check:
            batch_duplicates = self._batch_contains_duplicates(articles)
            if batch_duplicates:
                report = {"success": 0, "fail": len(articles), "update": 0, "new": 0, "shared": [], "unowned": [],
                          "unmatched": []}
                raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_DUPLICATE, result=report)

        # 2. check legitimate ownership
        # accumulators for the per-article results, folded into the final report
        success = 0
        fail = 0
        update = 0
        new = 0
        all_shared = set()
        all_unowned = set()
        all_unmatched = set()

        # Hold on to the exception so we can raise it later
        # NOTE(review): if several articles raise ArticleNotAcceptable, only the
        # last one's message survives into the final exception
        e_not_acceptable = None

        for article in articles:
            try:
                # ~~!ArticleBatchCreate:Feature->ArticleCreate:Feature~~
                # dry_run=True: validate and merge, but defer the actual save until
                # the whole batch is known to be good (all-or-nothing semantics)
                result = self.create_article(article, account,
                                             duplicate_check=duplicate_check,
                                             merge_duplicate=merge_duplicate,
                                             limit_to_account=limit_to_account,
                                             add_journal_info=add_journal_info,
                                             dry_run=True)
            except (exceptions.ArticleMergeConflict, exceptions.ConfigurationException):
                # a merge conflict anywhere aborts the entire batch immediately
                raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_CONFLICT)
            except exceptions.ArticleNotAcceptable as e:
                # The ArticleNotAcceptable exception is a superset of reasons we can't match a journal to this article
                e_not_acceptable = e
                result = {'fail': 1, 'unmatched': set(article.bibjson().issns())}

            # fold this article's outcome into the batch totals
            success += result.get("success", 0)
            fail += result.get("fail", 0)
            update += result.get("update", 0)
            new += result.get("new", 0)
            all_shared.update(result.get("shared", set()))
            all_unowned.update(result.get("unowned", set()))
            all_unmatched.update(result.get("unmatched", set()))

        report = {"success": success, "fail": fail, "update": update, "new": new, "shared": all_shared,
                  "unowned": all_unowned, "unmatched": all_unmatched}

        # if there were no failures in the batch, then we can do the save
        if fail == 0:
            for i in range(len(articles)):
                block = i == len(articles) - 1
                # block on the final save, so that when this method returns, all articles are
                # available in the index
                articles[i].save(blocking=block)

            # return some stats on the import
            return report
        else:
            if e_not_acceptable is not None:
                raise exceptions.ArticleNotAcceptable(message=e_not_acceptable.message, result=report)
            raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_FAIL, result=report)

103 

104 @staticmethod 

105 def _batch_contains_duplicates(articles): 

106 dois = [] 

107 fulltexts = [] 

108 

109 for article in articles: 

110 doi = article.get_normalised_doi() 

111 if doi is not None: 

112 if doi in dois: 

113 return True 

114 dois.append(doi) 

115 

116 ft = article.get_normalised_fulltext() 

117 if ft is not None: 

118 if ft in fulltexts: 

119 return True 

120 fulltexts.append(ft) 

121 

122 return False 

123 

124 @staticmethod 

125 def _prepare_update_admin(article, duplicate, update_article_id, merge_duplicate): 

126 

127 is_update = 0 

128 if duplicate is not None: 

129 if update_article_id is not None and duplicate.id != update_article_id: 

130 # it means that doi or ft url has been changed so that it duplicates existing article 

131 raise exceptions.DuplicateArticleException(Messages.EXCEPTION_IDENTIFIER_CHANGE_CLASH) 

132 elif merge_duplicate: 

133 is_update += 1 

134 article.merge(duplicate) 

135 elif update_article_id is not None and merge_duplicate: # requested to update article has both url and doi changed to new values - no duplicate detected 

136 is_update += 1 

137 art = models.Article.pull(update_article_id) 

138 article.merge(art) 

139 

140 return is_update 

141 

142 def _prepare_update_publisher(self, article, duplicate, merge_duplicate, account, limit_to_account): 

143 # before saving, we need to determine whether this is a new article 

144 # or an update 

145 is_update = 0 

146 

147 if duplicate is not None: # else -> it is new article 

148 # check if can update the duplicate - if is the owner 

149 has_permissions_result = self.has_permissions(account, article, limit_to_account) 

150 if isinstance(has_permissions_result, bool) and has_permissions_result == True: 

151 doi_or_ft_updated = self._doi_or_fulltext_updated(article, duplicate.id) 

152 if doi_or_ft_updated or not merge_duplicate: 

153 raise exceptions.DuplicateArticleException(Messages.EXCEPTION_IDENTIFIER_CHANGE) 

154 else: 

155 is_update += 1 

156 article.merge(duplicate) 

157 else: 

158 raise exceptions.DuplicateArticleException(Messages.EXCEPTION_DUPLICATE_NO_PERMISSION) 

159 return is_update 

160 

161 # here we should have the final point of validation for all incoming articles 

162 @staticmethod 

163 def _validate_issns(article_bibjson: models.ArticleBibJSON): 

164 # only 2 issns: one print, one electronic 

165 pissn = article_bibjson.get_identifiers("pissn") 

166 eissn = article_bibjson.get_identifiers("eissn") 

167 

168 if len(pissn) > 1 or len(eissn) > 1: 

169 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_TOO_MANY_ISSNS) 

170 

171 # no pissn or eissn 

172 if not pissn and not eissn: 

173 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_NO_ISSNS) 

174 

175 # pissn and eissn identical 

176 if pissn == eissn: 

177 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_IDENTICAL_PISSN_AND_EISSN) 

178 

179 def create_article(self, article, account, duplicate_check=True, merge_duplicate=True, 

180 limit_to_account=True, add_journal_info=False, dry_run=False, update_article_id=None): 

181 

182 """ 

183 Create an individual article in the database 

184 

185 This method will check and merge any duplicates, and report back on successes and failures in a manner consistent with 

186 batch_create_articles. 

187 

188 ~~->ArticleCreate:Feature~~ 

189 

190 :param article: The article to be created 

191 :param account: The account creating the article 

192 :param duplicate_check: Whether to check for duplicates in the database 

193 :param merge_duplicate: Whether to merge duplicate if found. If set to False, may result in a DuplicateArticleException 

194 :param limit_to_account: Whether to limit create to when the account owns the journal to which the article belongs 

195 :param add_journal_info: Should we fetch the journal info and attach it to the article before save? 

196 :param dry_run: Whether to actuall save, or if this is just to either see if it would work, or to prep for a batch ingest 

197 :param update_article_id: The article id that it is supposed to be an update to; taken into consideration ONLY 

198 if duplicate_check == True and merge_duplicate == True 

199 :return: 

200 """ 

201 # first validate the incoming arguments to ensure that we've got the right thing 

202 argvalidate("create_article", [ 

203 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"}, 

204 {"arg": account, "instance": models.Account, "allow_none": False, "arg_name": "account"}, 

205 {"arg": duplicate_check, "instance": bool, "allow_none": False, "arg_name": "duplicate_check"}, 

206 {"arg": merge_duplicate, "instance": bool, "allow_none": False, "arg_name": "merge_duplicate"}, 

207 {"arg": limit_to_account, "instance": bool, "allow_none": False, "arg_name": "limit_to_account"}, 

208 {"arg": add_journal_info, "instance": bool, "allow_none": False, "arg_name": "add_journal_info"}, 

209 {"arg": dry_run, "instance": bool, "allow_none": False, "arg_name": "dry_run"}, 

210 {"arg": update_article_id, "instance": str, "allow_none": True, "arg_name": "update_article_id"} 

211 ], exceptions.ArgumentException) 

212 

213 has_permissions_result = self.has_permissions(account, article, limit_to_account) 

214 if isinstance(has_permissions_result, dict): 

215 return has_permissions_result 

216 

217 # Validate that the article is acceptable: it must have a DOI and/or a fulltext & match only one in_doaj journal 

218 # this raises an exception if the article is not acceptable, containing all the relevant validation details 

219 # We do this after the permissions check because that gives a detailed result whereas this throws an exception 

220 try: 

221 self.is_acceptable(article) 

222 except Exception as e: 

223 raise e 

224 

225 is_update = 0 

226 if duplicate_check: 

227 # ~~!ArticleCreate:Feature->ArticleDeduplication:Feature~~ 

228 duplicate = self.get_duplicate(article) 

229 try: 

230 if account.has_role("admin"): # is update_article_id is None then treat as normal publisher upload 

231 # for testing by admin 

232 is_update = self._prepare_update_admin(article, duplicate, update_article_id, merge_duplicate) 

233 else: 

234 is_update = self._prepare_update_publisher(article, duplicate, merge_duplicate, account, limit_to_account) 

235 except (exceptions.DuplicateArticleException, exceptions.ArticleMergeConflict, exceptions.ConfigurationException) as e: 

236 raise e 

237 

238 if add_journal_info: 

239 article.add_journal_metadata() 

240 

241 # finally, save the new article 

242 if not dry_run: 

243 article.save() 

244 eventsSvc = DOAJ.eventsService() 

245 eventsSvc.trigger(models.Event(constants.EVENT_ARTICLE_SAVE, account.id, { 

246 "article": article.data 

247 })) 

248 

249 return {"success": 1, "fail": 0, "update": is_update, "new": 1 - is_update, "shared": set(), "unowned": set(), 

250 "unmatched": set()} 

251 

252 def has_permissions(self, account, article, limit_to_account): 

253 

254 if limit_to_account: 

255 legit = account.has_role("admin") or self.is_legitimate_owner(article, account.id) 

256 if not legit: 

257 owned, shared, unowned, unmatched = self.issn_ownership_status(article, account.id) 

258 return {"success": 0, "fail": 1, "update": 0, "new": 0, "shared": shared, "unowned": unowned, 

259 "unmatched": unmatched} 

260 return True 

261 

262 def is_acceptable(self, article: models.Article): 

263 """ 

264 Conduct some deep validation on the article to make sure we will accept it 

265 this just means making sure it has a DOI or fulltext, and that its ISSNs 

266 match a single journal that is in DOAJ. 

267 """ 

268 try: 

269 bj = article.bibjson() 

270 except DataStructureException as e: 

271 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_INVALID_BIBJSON + e.message) 

272 

273 # do we have a DOI. If so, no need to go further 

274 doi = bj.get_one_identifier(bj.DOI) 

275 ft = bj.get_single_url(bj.FULLTEXT) 

276 if doi is None and ft is None: 

277 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_NO_DOI_NO_FULLTEXT) 

278 

279 self._validate_issns(bj) 

280 

281 try: 

282 self.match_journal_with_validation(bj) 

283 except exceptions.ArticleNotAcceptable: 

284 raise 

285 

286 

287 @staticmethod 

288 def match_journal_with_validation(article_bibjson: models.ArticleBibJSON): 

289 pissn = article_bibjson.get_one_identifier("pissn") 

290 eissn = article_bibjson.get_one_identifier("eissn") 

291 

292 issns = [] 

293 

294 if pissn is not None: 

295 issns.append(pissn) 

296 if eissn is not None: 

297 issns.append(eissn) 

298 

299 # Find an exact match that is in DOAJ 

300 journal = models.Journal.find_by_issn_exact(issns, in_doaj=True) 

301 

302 match len(journal): 

303 case 0: 

304 # Nothing back from in_doaj search, determine if withdrawn or nonexistent 

305 if len(models.Journal.find_by_issn_exact(issns, in_doaj=False)) > 0: 

306 raise exceptions.ArticleNotAcceptable( 

307 message=Messages.EXCEPTION_ADDING_ARTICLE_TO_WITHDRAWN_JOURNAL) 

308 else: 

309 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_MISMATCHED_ISSNS) 

310 case 1: 

311 # check if only one journal matches pissn and eissn and if they are in the correct fields 

312 # no need to check eissn, if pissn matches, pissn and eissn are different and only 1 journal has been found - then eissn matches too 

313 if pissn is not None: 

314 if journal[0].bibjson().pissn != pissn: 

315 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_MISMATCHED_ISSNS) 

316 if eissn is not None: 

317 if journal[0].bibjson().eissn != eissn: 

318 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_MISMATCHED_ISSNS) 

319 case _: 

320 # >1 

321 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_MISMATCHED_ISSNS) 

322 

323 return journal[0] 

324 

325 @staticmethod 

326 def is_legitimate_owner(article, owner): 

327 """ 

328 Determine if the owner id is the owner of the article 

329 

330 :param article: an article model 

331 :param owner: string account ID 

332 :return: True or False 

333 """ 

334 # first validate the incoming arguments to ensure that we've got the right thing 

335 argvalidate("is_legitimate_owner", [ 

336 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"}, 

337 {"arg": owner, "instance": str, "allow_none": False, "arg_name": "owner"} 

338 ], exceptions.ArgumentException) 

339 

340 # get all the issns for the article 

341 b = article.bibjson() 

342 article_issns = b.get_identifiers(b.P_ISSN) 

343 article_issns += b.get_identifiers(b.E_ISSN) 

344 

345 # check each issn against the index, and if a related journal is found 

346 # record the owner of that journal 

347 owners = [] 

348 seen_journal_issns = {} 

349 for issn in article_issns: 

350 journals = models.Journal.find_by_issn(issn) 

351 if journals is not None and len(journals) > 0: 

352 for j in journals: 

353 owners.append(j.owner) 

354 if j.owner not in seen_journal_issns: 

355 seen_journal_issns[j.owner] = [] 

356 seen_journal_issns[j.owner] += j.bibjson().issns() 

357 

358 # deduplicate the list of owners 

359 owners = list(set(owners)) 

360 

361 # no owner means we can't confirm 

362 if len(owners) == 0: 

363 return False 

364 

365 # multiple owners means ownership of this article is confused 

366 if len(owners) > 1: 

367 return False 

368 

369 # if the found owner is not the same as the desired owner, return false 

370 if owners[0] != owner: 

371 return False 

372 

373 # single owner must still know of all supplied issns 

374 journal_issns = set(seen_journal_issns[owner]) 

375 for issn in article_issns: 

376 if issn not in journal_issns: 

377 return False 

378 

379 return True 

380 

381 @staticmethod 

382 def _doi_or_fulltext_updated(new_article, update_id): 

383 if new_article.id is None: 

384 return False 

385 

386 old_art = models.Article.pull(update_id) # ~~->Article:Model~~ 

387 old_doi = old_art.get_normalised_doi() 

388 old_ft_url = old_art.get_normalised_fulltext() 

389 

390 new_doi = new_article.get_normalised_doi() 

391 new_ft_url = new_article.get_normalised_fulltext() 

392 

393 return old_doi != new_doi or old_ft_url != new_ft_url 

394 

395 @staticmethod 

396 def issn_ownership_status(article, owner): 

397 """ 

398 Determine the ownership status of the supplied owner over the issns in the given article 

399 

400 This will give you a tuple back which lists the following (in order): 

401 

402 * which issns are owned by that owner 

403 * which issns are shared with another owner 

404 * which issns are not owned by this owner 

405 * which issns are not found in the DOAJ database 

406 

407 :param article: 

408 :param owner: 

409 :return: 

410 """ 

411 # first validate the incoming arguments to ensure that we've got the right thing 

412 argvalidate("issn_ownership_status", [ 

413 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"}, 

414 {"arg": owner, "instance": str, "allow_none": False, "arg_name": "owner"} 

415 ], exceptions.ArgumentException) 

416 

417 # get all the issns for the article 

418 b = article.bibjson() 

419 issns = b.get_identifiers(b.P_ISSN) 

420 issns += b.get_identifiers(b.E_ISSN) 

421 

422 # FIXME: Duplicate check due to inconsistent control flow (result vs exception) 

423 if len(issns) == 0: 

424 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_NO_ISSNS) 

425 

426 owned = [] 

427 shared = [] 

428 unowned = [] 

429 unmatched = [] 

430 

431 # check each issn against the index, and if a related journal is found 

432 # record the owner of that journal 

433 seen_issns = {} 

434 for issn in issns: 

435 journals = models.Journal.find_by_issn(issn) 

436 if journals is not None and len(journals) > 0: 

437 for j in journals: 

438 if issn not in seen_issns: 

439 seen_issns[issn] = set() 

440 if j.owner is not None: 

441 seen_issns[issn].add(j.owner) 

442 

443 for issn in issns: 

444 if issn not in list(seen_issns.keys()): 

445 unmatched.append(issn) 

446 

447 for issn, owners in seen_issns.items(): 

448 owners = list(owners) 

449 if len(owners) == 0: 

450 unowned.append(issn) 

451 elif len(owners) == 1 and owners[0] == owner: 

452 owned.append(issn) 

453 elif len(owners) == 1 and owners[0] != owner: 

454 unowned.append(issn) 

455 elif len(owners) > 1: 

456 if owner in owners: 

457 shared.append(issn) 

458 else: 

459 unowned.append(issn) 

460 

461 return owned, shared, unowned, unmatched 

462 

463 def get_duplicate(self, article): 

464 """ 

465 Get at most one, most recent, duplicate article for the supplied article. 

466 

467 ~~->ArticleDeduplication:Feature~~ 

468 

469 :param article: 

470 :return: 

471 """ 

472 # first validate the incoming arguments to ensure that we've got the right thing 

473 argvalidate("get_duplicate", [ 

474 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"}, 

475 ], exceptions.ArgumentException) 

476 

477 article.prep() 

478 dup = self.get_duplicates(article, max_results=2) 

479 if len(dup) > 1: 

480 raise exceptions.ArticleMergeConflict(Messages.EXCEPTION_ARTICLE_MERGE_CONFLICT) 

481 elif dup: 

482 return dup.pop() 

483 else: 

484 return None 

485 

486 def get_duplicates(self, article, max_results=10): 

487 """ 

488 Get all known duplicates of an article 

489 

490 ~~->ArticleDeduplication:Feature~~ 

491 

492 :param article: Article of interest 

493 :param max_results: Maximum number of duplicate candidates to return 

494 :return: A list of possible duplicates 

495 """ 

496 # first validate the incoming arguments to ensure that we've got the right thing 

497 argvalidate("get_duplicates", [ 

498 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"}, 

499 ], exceptions.ArgumentException) 

500 

501 possible_articles_dict = self.discover_duplicates(article, max_results) 

502 if not possible_articles_dict: 

503 return [] 

504 

505 # We don't need the details of duplicate types, so flatten the lists. 

506 all_possible_articles = [article for dup_type in list(possible_articles_dict.values()) for article in dup_type] 

507 

508 # An article may fulfil more than one duplication criteria, so needs to be de-duplicated 

509 ids = [] 

510 possible_articles = [] 

511 for a in all_possible_articles: 

512 if a.id not in ids: 

513 ids.append(a.id) 

514 possible_articles.append(a) 

515 

516 # Sort the articles newest -> oldest by last_updated so we can get the most recent at [0] 

517 possible_articles.sort(key=lambda x: dates.parse(x.last_updated), reverse=True) 

518 

519 return possible_articles[:max_results] 

520 

521 @staticmethod 

522 def discover_duplicates(article, results_per_match_type=10, include_article=True): 

523 """ 

524 Identify duplicates, separated by duplication criteria 

525 

526 If the owner id is provided, this will limit the search to duplicates owned by that owner 

527 

528 ~~->ArticleDeduplication:Feature~~ 

529 

530 :param article: 

531 :param results_per_match_type 

532 :param include_article 

533 :return: 

534 """ 

535 # first validate the incoming arguments to ensure that we've got the right thing 

536 argvalidate("discover_duplicates", [ 

537 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"}, 

538 ], exceptions.ArgumentException) 

539 

540 # if we get more than one result, we'll record them here, and then at the end 

541 # if we haven't got a definitive match we'll pick the most likely candidate 

542 # (this isn't as bad as it sounds - the identifiers are pretty reliable, this catches 

543 # issues like where there are already duplicates in the data, and not matching one 

544 # of them propagates the issue) 

545 possible_articles = {} 

546 found = False 

547 

548 # Checking by DOI is our first step 

549 # dois = b.get_identifiers(b.DOI) 

550 doi = article.get_normalised_doi() 

551 if doi is not None: 

552 if isinstance(doi, str) and doi != '': 

553 articles = models.Article.duplicates(doi=doi, size=results_per_match_type) 

554 if len(articles) > 0: 

555 if include_article: 

556 possible_articles['doi'] = [a for a in articles] 

557 else: 

558 possible_articles['doi'] = [a for a in articles if a.id != article.id] 

559 if len(possible_articles['doi']) > 0: 

560 found = True 

561 

562 # Second test is to look by fulltext url 

563 fulltext = article.get_normalised_fulltext() 

564 if fulltext is not None: 

565 articles = models.Article.duplicates(fulltexts=fulltext, size=results_per_match_type) 

566 if len(articles) > 0: 

567 if include_article: 

568 possible_articles['fulltext'] = [a for a in articles] 

569 else: 

570 possible_articles['fulltext'] = [a for a in articles if a.id != article.id] 

571 if possible_articles['fulltext']: 

572 found = True 

573 

574 if doi is None and fulltext is None: 

575 raise exceptions.DuplicateArticleException(Messages.EXCEPTION_DETECT_DUPLICATE_NO_ID) 

576 

577 return possible_articles if found else None