Coverage for portality/bll/services/article.py: 99%

245 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-07-22 15:59 +0100

1from portality.lib.argvalidate import argvalidate 

2from portality import models 

3from portality.bll import exceptions 

4from portality.ui.messages import Messages 

5 

6from datetime import datetime 

7 

8 

9class ArticleService(object): 

10 """ 

11 ~~Article:Service~~ 

12 """ 

13 

def batch_create_articles(self, articles, account, duplicate_check=True, merge_duplicate=True,
                          limit_to_account=True, add_journal_info=False):
    """
    Create a batch of articles in a single operation. Articles are either all created/updated or none of them are.

    Every article is first run through create_article with dry_run=True, and the per-article reports are
    added up.  The articles are only saved if that pass recorded no failures.

    This method checks for duplicates within the provided set and within the current database
    (if you set duplicate_check=True)

    ~~->ArticleBatchCreate:Feature~~

    :param articles: The list of article objects
    :param account: The account creating the articles
    :param duplicate_check: Whether to check for duplicates in the batch and in the index
    :param merge_duplicate: Should duplicates be merged. If set to False, this may raise a DuplicateArticleException
    :param limit_to_account: Should the ingest be limited only to articles for journals owned by the account.
            If set to True, may result in an IngestException
    :param add_journal_info: Should we fetch the journal info and attach it to the article before save?
    :return: a report on the state of the import:
            {success: x, fail: x, update: x, new: x, shared: set, unowned: set, unmatched: set}
    :raises exceptions.IngestException: if the batch contains duplicates of itself, if create_article reports an
            ArticleMergeConflict or ConfigurationException, or if any article in the dry run failed.  Any other
            exception raised by create_article propagates unchanged.
    """
    # first validate the incoming arguments to ensure that we've got the right thing
    argvalidate("batch_create_article", [
        {"arg": articles, "instance": list, "allow_none": False, "arg_name": "articles"},
        {"arg": account, "instance": models.Account, "allow_none": False, "arg_name": "account"},
        {"arg": duplicate_check, "instance": bool, "allow_none": False, "arg_name": "duplicate_check"},
        {"arg": merge_duplicate, "instance": bool, "allow_none": False, "arg_name": "merge_duplicate"},
        {"arg": limit_to_account, "instance": bool, "allow_none": False, "arg_name": "limit_to_account"},
        {"arg": add_journal_info, "instance": bool, "allow_none": False, "arg_name": "add_journal_info"}
    ], exceptions.ArgumentException)

    # 1. dedupe the batch
    if duplicate_check and self._batch_contains_duplicates(articles):
        # note that this failure report carries lists, whereas the success report below carries sets
        report = {"success": 0, "fail": len(articles), "update": 0, "new": 0, "shared": [], "unowned": [],
                  "unmatched": []}
        raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_DUPLICATE, result=report)

    # 2. check legitimate ownership
    success = 0
    fail = 0
    update = 0
    new = 0
    all_shared = set()
    all_unowned = set()
    all_unmatched = set()

    for article in articles:
        try:
            # ~~!ArticleBatchCreate:Feature->ArticleCreate:Feature~~
            result = self.create_article(article, account,
                                         duplicate_check=duplicate_check,
                                         merge_duplicate=merge_duplicate,
                                         limit_to_account=limit_to_account,
                                         add_journal_info=add_journal_info,
                                         dry_run=True)
        except (exceptions.ArticleMergeConflict, exceptions.ConfigurationException) as e:
            # chain the original error so the root cause is still visible in the traceback
            raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_CONFLICT) from e

        success += result.get("success", 0)
        fail += result.get("fail", 0)
        update += result.get("update", 0)
        new += result.get("new", 0)
        all_shared.update(result.get("shared", set()))
        all_unowned.update(result.get("unowned", set()))
        all_unmatched.update(result.get("unmatched", set()))

    report = {"success": success, "fail": fail, "update": update, "new": new, "shared": all_shared,
              "unowned": all_unowned, "unmatched": all_unmatched}

    # any failure in the dry run aborts the whole batch before anything is saved
    if fail != 0:
        raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_FAIL, result=report)

    # block on the final save, so that when this method returns, all articles are
    # available in the index
    last_index = len(articles) - 1
    for index, article in enumerate(articles):
        article.save(blocking=(index == last_index))

    # return some stats on the import
    return report

93 

def _batch_contains_duplicates(self, articles):
    """
    Determine whether any two articles in the batch share a normalised DOI or a normalised fulltext URL.

    Identifiers which are None are ignored, so articles missing a DOI (or a fulltext) never clash
    with each other on that identifier.

    :param articles: iterable of objects offering get_normalised_doi() and get_normalised_fulltext()
    :return: True as soon as a repeated identifier is seen, otherwise False
    """
    # sets give constant-time membership tests; assumes the normalised identifiers are hashable
    # (presumably strings - confirm against the Article model)
    seen_dois = set()
    seen_fulltexts = set()

    for article in articles:
        doi = article.get_normalised_doi()
        if doi is not None:
            if doi in seen_dois:
                return True
            seen_dois.add(doi)

        fulltext = article.get_normalised_fulltext()
        if fulltext is not None:
            if fulltext in seen_fulltexts:
                return True
            seen_fulltexts.add(fulltext)

    return False

112 

def _prepare_update_admin(self, article, duplicate, update_article_id, merge_duplicate):
    """
    Prepare an article for saving when an admin has named the article it is meant to update.

    :param article: the incoming article, which receives the merge
    :param duplicate: the existing duplicate found for the incoming article, or None
    :param update_article_id: id of the article that the admin intends to update
    :param merge_duplicate: whether the existing record should be merged into the incoming article
    :return: 1 if the incoming article was merged with an existing record (an update), otherwise 0
    :raises exceptions.DuplicateArticleException: if a duplicate exists and it is not the article being updated
    """
    # the doi or ft url has been changed so that it now duplicates a different existing article
    if duplicate is not None and duplicate.id != update_article_id:
        raise exceptions.DuplicateArticleException(Messages.EXCEPTION_IDENTIFIER_CHANGE_CLASH)

    if not merge_duplicate:
        return 0

    if duplicate is not None:
        merge_source = duplicate
    else:
        # the article being updated has had both url and doi changed to new values, so no duplicate
        # was detected - fetch the original by id instead
        merge_source = models.Article.pull(update_article_id)

    article.merge(merge_source)
    return 1

129 

def _prepare_update_publisher(self, article, duplicate, merge_duplicate, account, limit_to_account):
    """
    Work out whether a publisher's incoming article is new or an update, merging it with its duplicate if so.

    :param article: the incoming article, which receives the merge
    :param duplicate: the existing duplicate found for the incoming article, or None
    :param merge_duplicate: whether a duplicate may be merged into the incoming article
    :param account: the account submitting the article
    :param limit_to_account: passed through to has_permissions
    :return: 0 if there is no duplicate (a new article), 1 if the duplicate was merged (an update)
    :raises exceptions.DuplicateArticleException: if the account may not update the duplicate, if the DOI or
            fulltext differ from the duplicate's, or if merge_duplicate is False
    """
    # no duplicate -> it is a new article
    if duplicate is None:
        return 0

    # the duplicate can only be updated when has_permissions answers exactly True
    # (it returns a report dict when the account is not permitted)
    permitted = self.has_permissions(account, article, limit_to_account)
    if permitted is not True:
        raise exceptions.DuplicateArticleException(Messages.EXCEPTION_DUPLICATE_NO_PERMISSION)

    identifiers_changed = self._doi_or_fulltext_updated(article, duplicate.id)
    if identifiers_changed or not merge_duplicate:
        raise exceptions.DuplicateArticleException(Messages.EXCEPTION_IDENTIFIER_CHANGE)

    article.merge(duplicate)
    return 1

148 

149 # here we should have the final point of validation for all incoming articles 

def _validate_issns(self, article):
    """
    Check the ISSNs recorded on an article: at most one print and one electronic ISSN, and the two must differ.

    :param article: the article whose bibjson identifiers are checked
    :return: None
    :raises exceptions.ArticleNotAcceptable: if there is more than one pissn or more than one eissn, or if
            the single pissn and eissn compare equal
    """
    bibjson = article.bibjson()

    # only 2 issns: one print, one electronic
    print_issns = bibjson.get_identifiers("pissn")
    electronic_issns = bibjson.get_identifiers("eissn")
    if len(print_issns) > 1 or len(electronic_issns) > 1:
        raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_TOO_MANY_ISSNS)

    # pissn and eissn identical
    # NOTE(review): if get_one_identifier returns None for a missing identifier, an article with
    # neither ISSN would also be rejected here - confirm that is intended
    print_issn = bibjson.get_one_identifier("pissn")
    electronic_issn = bibjson.get_one_identifier("eissn")
    if print_issn == electronic_issn:
        raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_IDENTICAL_PISSN_AND_EISSN)

165 

166 

def create_article(self, article, account, duplicate_check=True, merge_duplicate=True,
                   limit_to_account=True, add_journal_info=False, dry_run=False, update_article_id=None):
    """
    Create an individual article in the database

    This method will check and merge any duplicates, and report back on successes and failures in a manner
    consistent with batch_create_articles.

    ~~->ArticleCreate:Feature~~

    :param article: The article to be created
    :param account: The account creating the article
    :param duplicate_check: Whether to check for duplicates in the database
    :param merge_duplicate: Whether to merge duplicate if found. If set to False, may result in a
            DuplicateArticleException
    :param limit_to_account: Whether to limit create to when the account owns the journal to which the
            article belongs
    :param add_journal_info: Should we fetch the journal info and attach it to the article before save?
    :param dry_run: Whether to actually save, or if this is just to either see if it would work, or to prep
            for a batch ingest
    :param update_article_id: The article id that it is supposed to be an update to; only consulted when
            duplicate_check is True and the account has the "admin" role
    :return: a report dict {success, fail, update, new, shared, unowned, unmatched}.  When the account lacks
            permission this is the failure report produced by has_permissions (fail: 1) and nothing is saved;
            otherwise success is 1 and exactly one of update/new is 1.
    :raises exceptions.ArticleNotAcceptable: via is_acceptable, if the article fails validation
    :raises exceptions.DuplicateArticleException: via the _prepare_update_* helpers
    :raises exceptions.ArticleMergeConflict: via get_duplicate, if more than one duplicate is found
    """
    # first validate the incoming arguments to ensure that we've got the right thing
    argvalidate("create_article", [
        {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
        {"arg": account, "instance": models.Account, "allow_none": False, "arg_name": "account"},
        {"arg": duplicate_check, "instance": bool, "allow_none": False, "arg_name": "duplicate_check"},
        {"arg": merge_duplicate, "instance": bool, "allow_none": False, "arg_name": "merge_duplicate"},
        {"arg": limit_to_account, "instance": bool, "allow_none": False, "arg_name": "limit_to_account"},
        {"arg": add_journal_info, "instance": bool, "allow_none": False, "arg_name": "add_journal_info"},
        {"arg": dry_run, "instance": bool, "allow_none": False, "arg_name": "dry_run"},
        {"arg": update_article_id, "instance": str, "allow_none": True, "arg_name": "update_article_id"}
    ], exceptions.ArgumentException)

    # quickly validate that the article is acceptable - it must have a DOI and/or a fulltext
    # this raises an exception if the article is not acceptable, containing all the relevant validation details
    self.is_acceptable(article)

    # a dict result is the failure report for an account without permission; hand it straight back
    has_permissions_result = self.has_permissions(account, article, limit_to_account)
    if isinstance(has_permissions_result, dict):
        return has_permissions_result

    is_update = 0
    if duplicate_check:
        # ~~!ArticleCreate:Feature->ArticleDeduplication:Feature~~
        duplicate = self.get_duplicate(article)
        # if update_article_id is None then treat as a normal publisher upload, even for admins
        if account.has_role("admin") and update_article_id is not None:
            is_update = self._prepare_update_admin(article, duplicate, update_article_id, merge_duplicate)
        else:
            is_update = self._prepare_update_publisher(article, duplicate, merge_duplicate, account,
                                                       limit_to_account)

    if add_journal_info:
        article.add_journal_metadata()

    # finally, save the new article
    if not dry_run:
        article.save()

    return {"success": 1, "fail": 0, "update": is_update, "new": 1 - is_update, "shared": set(), "unowned": set(),
            "unmatched": set()}

235 

236 

def has_permissions(self, account, article, limit_to_account):
    """
    Decide whether the account is allowed to create/update the given article.

    :param account: the account attempting the operation
    :param article: the article being created or updated
    :param limit_to_account: if False, no ownership check is made and the answer is always True
    :return: True if permitted; otherwise a failure report dict
            {success: 0, fail: 1, update: 0, new: 0, shared: [...], unowned: [...], unmatched: [...]}
            built from issn_ownership_status
    """
    if not limit_to_account:
        return True

    # admins are always permitted; everyone else must be the legitimate owner
    if account.has_role("admin") or self.is_legitimate_owner(article, account.id):
        return True

    # the owned issns are not part of the failure report
    _owned, shared, unowned, unmatched = self.issn_ownership_status(article, account.id)
    return {"success": 0, "fail": 1, "update": 0, "new": 0, "shared": shared, "unowned": unowned,
            "unmatched": unmatched}

246 

247 

def is_acceptable(self, article):
    """
    Conduct some deep validation on the article to make sure we will accept it.

    For the moment, this means making sure it has a DOI or a fulltext URL (at least one of the two),
    and that its ISSNs pass _validate_issns.

    :param article: the article to validate
    :return: None
    :raises exceptions.ArticleNotAcceptable: if the article has neither a DOI nor a fulltext URL, or if
            _validate_issns rejects its ISSNs
    """
    bibjson = article.bibjson()

    # both identifiers are always looked up; the article is only rejected if both are missing
    doi = bibjson.get_one_identifier(bibjson.DOI)
    fulltext = bibjson.get_single_url(bibjson.FULLTEXT)
    has_identifier = doi is not None or fulltext is not None
    if not has_identifier:
        raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_NO_DOI_NO_FULLTEXT)

    self._validate_issns(article)

262 

def is_legitimate_owner(self, article, owner):
    """
    Determine if the owner id is the owner of the article

    The owner is legitimate only if the journals found for the article's ISSNs all belong to exactly one
    owner, that owner is the supplied one, and every ISSN on the article is among the ISSNs of those journals.

    :param article: an article model
    :param owner: string account ID
    :return: True or False
    """
    # first validate the incoming arguments to ensure that we've got the right thing
    argvalidate("is_legitimate_owner", [
        {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
        {"arg": owner, "instance": str, "allow_none": False, "arg_name": "owner"}
    ], exceptions.ArgumentException)

    # get all the issns for the article
    # NOTE(review): this extends the list returned by get_identifiers in place - presumably that is a
    # fresh list on every call; confirm against the bibjson model
    bibjson = article.bibjson()
    article_issns = bibjson.get_identifiers(bibjson.P_ISSN)
    article_issns.extend(bibjson.get_identifiers(bibjson.E_ISSN))

    # check each issn against the index, and for every related journal found, record the
    # journal's issns against the owner of that journal
    issns_by_owner = {}
    for issn in article_issns:
        journals = models.Journal.find_by_issn(issn)
        if journals is None or len(journals) == 0:
            continue
        for journal in journals:
            issns_by_owner.setdefault(journal.owner, []).extend(journal.bibjson().issns())

    # no owner means we can't confirm; multiple owners means ownership of this article is confused
    if len(issns_by_owner) != 1:
        return False

    # if the found owner is not the same as the desired owner, return false
    if owner not in issns_by_owner:
        return False

    # single owner must still know of all supplied issns
    known_issns = set(issns_by_owner[owner])
    return all(issn in known_issns for issn in article_issns)

317 

def _doi_or_fulltext_updated(self, new_article, update_id):
    """
    Report whether the incoming article's DOI or fulltext URL differs from those of a stored article.

    :param new_article: the incoming article
    :param update_id: id of the stored article to compare against
    :return: False if the incoming article has no id; otherwise True if either the normalised DOI or the
            normalised fulltext URL differs from the stored article's
    """
    # an article with no id is never treated as changed
    if new_article.id is None:
        return False

    stored = models.Article.pull(update_id)  # ~~->Article:Model~~
    stored_identifiers = (stored.get_normalised_doi(), stored.get_normalised_fulltext())
    incoming_identifiers = (new_article.get_normalised_doi(), new_article.get_normalised_fulltext())

    return stored_identifiers != incoming_identifiers

330 

def issn_ownership_status(self, article, owner):
    """
    Determine the ownership status of the supplied owner over the issns in the given article

    This will give you a tuple back which lists the following (in order):

    * which issns are owned by that owner (the owner is the only owner found for the issn)
    * which issns are shared with another owner (the owner is one of several owners found)
    * which issns are not owned by this owner (journals were found, but none with this owner)
    * which issns are not found in the DOAJ database (no journal was found for the issn)

    :param article: an article model
    :param owner: string account ID
    :return: tuple of four lists: (owned, shared, unowned, unmatched)
    """
    # first validate the incoming arguments to ensure that we've got the right thing
    argvalidate("issn_ownership_status", [
        {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
        {"arg": owner, "instance": str, "allow_none": False, "arg_name": "owner"}
    ], exceptions.ArgumentException)

    # get all the issns for the article
    b = article.bibjson()
    issns = b.get_identifiers(b.P_ISSN)
    issns += b.get_identifiers(b.E_ISSN)

    # check each issn against the index, and if a related journal is found
    # record the owner of that journal (journals with no owner still mark the issn as matched)
    owners_by_issn = {}
    for issn in issns:
        journals = models.Journal.find_by_issn(issn)
        if journals is None or len(journals) == 0:
            continue
        owners_for_issn = owners_by_issn.setdefault(issn, set())
        for journal in journals:
            if journal.owner is not None:
                owners_for_issn.add(journal.owner)

    # any issn for which no journal was found at all
    unmatched = [issn for issn in issns if issn not in owners_by_issn]

    owned = []
    shared = []
    unowned = []
    for issn, owners in owners_by_issn.items():
        if owner not in owners:
            # covers journals with no owner, and journals owned only by other accounts
            unowned.append(issn)
        elif len(owners) == 1:
            owned.append(issn)
        else:
            shared.append(issn)

    return owned, shared, unowned, unmatched

393 

def get_duplicate(self, article):
    """
    Get the single duplicate of the supplied article, if there is one.

    The article is prepared with prep() and then up to two duplicates are requested from get_duplicates.

    ~~->ArticleDeduplication:Feature~~

    :param article: the article to find a duplicate for
    :return: the duplicate article, or None if there is no duplicate
    :raises exceptions.ArticleMergeConflict: if more than one duplicate is found
    """
    # first validate the incoming arguments to ensure that we've got the right thing
    argvalidate("get_duplicate", [
        {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
    ], exceptions.ArgumentException)

    article.prep()
    candidates = self.get_duplicates(article, max_results=2)

    # two candidates is enough to know that the merge target is ambiguous
    if len(candidates) > 1:
        raise exceptions.ArticleMergeConflict(Messages.EXCEPTION_ARTICLE_MERGE_CONFLICT)

    if candidates:
        return candidates[0]
    return None

419 

def get_duplicates(self, article, max_results=10):
    """
    Get all known duplicates of an article

    The matches reported by discover_duplicates for each match type are flattened into one list,
    de-duplicated by article id, and sorted newest first by last_updated.

    ~~->ArticleDeduplication:Feature~~

    :param article: the article to find duplicates for
    :param max_results: passed to discover_duplicates as the number of results per match type, and also
            used as the maximum length of the returned list
    :return: list of duplicate articles, most recently updated first; empty if none were found
    """
    # first validate the incoming arguments to ensure that we've got the right thing
    argvalidate("get_duplicates", [
        {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
    ], exceptions.ArgumentException)

    matches_by_type = self.discover_duplicates(article, max_results)
    if not matches_by_type:
        return []

    # We don't need the details of duplicate types, so walk all the lists as one sequence.
    # An article may fulfil more than one duplication criteria, so needs to be de-duplicated;
    # the first occurrence of each id is the one that is kept.
    seen_ids = set()
    possible_articles = []
    for matches in matches_by_type.values():
        for match in matches:
            if match.id not in seen_ids:
                seen_ids.add(match.id)
                possible_articles.append(match)

    # Sort the articles newest -> oldest by last_updated so we can get the most recent at [0]
    possible_articles.sort(key=lambda x: datetime.strptime(x.last_updated, "%Y-%m-%dT%H:%M:%SZ"), reverse=True)

    return possible_articles[:max_results]

455 

def discover_duplicates(self, article, results_per_match_type=10, include_article=True):
    """
    Identify duplicates, separated by duplication criteria

    Two lookups are made: by normalised DOI (only when it is a non-empty string) and by normalised
    fulltext URL (only when it is not None).

    ~~->ArticleDeduplication:Feature~~

    :param article: the article to find duplicates for
    :param results_per_match_type: passed as the size argument to each duplicate lookup
    :param include_article: if False, results with the same id as the supplied article are filtered out
    :return: dict of match type ('doi' and/or 'fulltext') to list of articles if any match survived,
            otherwise None
    :raises exceptions.DuplicateArticleException: if the article has neither a DOI nor a fulltext URL
    """
    # first validate the incoming arguments to ensure that we've got the right thing
    argvalidate("discover_duplicates", [
        {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
    ], exceptions.ArgumentException)

    # if we get more than one result, we'll record them here, keyed by the kind of match
    # (the identifiers are pretty reliable; this catches issues like where there are already
    # duplicates in the data, and not matching one of them propagates the issue)
    matches_by_type = {}

    def record(match_type, candidates):
        # Store the candidates for one match type; report whether any survived filtering.
        # An empty lookup result leaves no entry at all, whereas a lookup whose results were all
        # filtered out leaves an empty list behind.
        if len(candidates) == 0:
            return False
        if include_article:
            kept = list(candidates)
        else:
            kept = [candidate for candidate in candidates if candidate.id != article.id]
        matches_by_type[match_type] = kept
        return len(kept) > 0

    found = False

    # Checking by DOI is our first step
    doi = article.get_normalised_doi()
    if isinstance(doi, str) and doi != '':
        by_doi = models.Article.duplicates(doi=doi, size=results_per_match_type)
        if record('doi', by_doi):
            found = True

    # Second test is to look by fulltext url
    fulltext = article.get_normalised_fulltext()
    if fulltext is not None:
        by_fulltext = models.Article.duplicates(fulltexts=fulltext, size=results_per_match_type)
        if record('fulltext', by_fulltext):
            found = True

    # with no identifier at all there was nothing to match on
    if doi is None and fulltext is None:
        raise exceptions.DuplicateArticleException(Messages.EXCEPTION_DETECT_DUPLICATE_NO_ID)

    return matches_by_type if found else None