Coverage for portality/bll/services/article.py: 99%
245 statements
« prev ^ index » next coverage.py v6.4.2, created at 2022-07-22 15:59 +0100
1from portality.lib.argvalidate import argvalidate
2from portality import models
3from portality.bll import exceptions
4from portality.ui.messages import Messages
6from datetime import datetime
class ArticleService(object):
    """
    Business logic for creating, validating and de-duplicating articles.

    ~~Article:Service~~
    """
14 def batch_create_articles(self, articles, account, duplicate_check=True, merge_duplicate=True,
15 limit_to_account=True, add_journal_info=False):
16 """
17 Create a batch of articles in a single operation. Articles are either all created/updated or none of them are
19 This method checks for duplicates within the provided set and within the current database (if you set duplicate_check=True)
21 ~~->ArticleBatchCreate:Feature~~
23 :param articles: The list of article objects
24 :param account: The account creating the articles
25 :param duplicate_check: Whether to check for duplicates in the batch and in the index
26 :param merge_duplicate: Should duplicates be merged. If set to False, this may raise a DuplicateArticleException
27 :param limit_to_account: Should the ingest be limited only to articles for journals owned by the account. If set to True, may result in an IngestException
28 :param add_journal_info: Should we fetch the journal info and attach it to the article before save?
29 :return: a report on the state of the import: {success: x, fail: x, update: x, new: x, shared: [], unowned: [], unmatched: []}
30 """
31 # first validate the incoming arguments to ensure that we've got the right thing
32 argvalidate("batch_create_article", [
33 {"arg": articles, "instance": list, "allow_none": False, "arg_name": "articles"},
34 {"arg": account, "instance": models.Account, "allow_none": False, "arg_name": "account"},
35 {"arg": duplicate_check, "instance": bool, "allow_none": False, "arg_name": "duplicate_check"},
36 {"arg": merge_duplicate, "instance": bool, "allow_none": False, "arg_name": "merge_duplicate"},
37 {"arg": limit_to_account, "instance": bool, "allow_none": False, "arg_name": "limit_to_account"},
38 {"arg": add_journal_info, "instance": bool, "allow_none": False, "arg_name": "add_journal_info"}
39 ], exceptions.ArgumentException)
41 # 1. dedupe the batch
42 if duplicate_check:
43 batch_duplicates = self._batch_contains_duplicates(articles)
44 if batch_duplicates:
45 report = {"success": 0, "fail": len(articles), "update": 0, "new": 0, "shared": [], "unowned": [],
46 "unmatched": []}
47 raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_DUPLICATE, result=report)
49 # 2. check legitimate ownership
50 success = 0
51 fail = 0
52 update = 0
53 new = 0
54 all_shared = set()
55 all_unowned = set()
56 all_unmatched = set()
58 for article in articles:
59 try:
60 # ~~!ArticleBatchCreate:Feature->ArticleCreate:Feature~~
61 result = self.create_article(article, account,
62 duplicate_check=duplicate_check,
63 merge_duplicate=merge_duplicate,
64 limit_to_account=limit_to_account,
65 add_journal_info=add_journal_info,
66 dry_run=True)
67 except (exceptions.ArticleMergeConflict, exceptions.ConfigurationException):
68 raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_CONFLICT)
70 success += result.get("success", 0)
71 fail += result.get("fail", 0)
72 update += result.get("update", 0)
73 new += result.get("new", 0)
74 all_shared.update(result.get("shared", set()))
75 all_unowned.update(result.get("unowned", set()))
76 all_unmatched.update(result.get("unmatched", set()))
78 report = {"success": success, "fail": fail, "update": update, "new": new, "shared": all_shared,
79 "unowned": all_unowned, "unmatched": all_unmatched}
81 # if there were no failures in the batch, then we can do the save
82 if fail == 0:
83 for i in range(len(articles)):
84 block = i == len(articles) - 1
85 # block on the final save, so that when this method returns, all articles are
86 # available in the index
87 articles[i].save(blocking=block)
89 # return some stats on the import
90 return report
91 else:
92 raise exceptions.IngestException(message=Messages.EXCEPTION_ARTICLE_BATCH_FAIL, result=report)
94 def _batch_contains_duplicates(self, articles):
95 dois = []
96 fulltexts = []
98 for article in articles:
99 doi = article.get_normalised_doi()
100 if doi is not None:
101 if doi in dois:
102 return True
103 dois.append(doi)
105 ft = article.get_normalised_fulltext()
106 if ft is not None:
107 if ft in fulltexts:
108 return True
109 fulltexts.append(ft)
111 return False
113 def _prepare_update_admin(self, article, duplicate, update_article_id, merge_duplicate):
115 is_update = 0
116 if duplicate is not None:
117 if duplicate.id != update_article_id:
118 # it means that doi or ft url has been changed so that it duplicates existing article
119 raise exceptions.DuplicateArticleException(Messages.EXCEPTION_IDENTIFIER_CHANGE_CLASH)
120 elif merge_duplicate:
121 is_update += 1
122 article.merge(duplicate)
123 elif merge_duplicate: # requested to update article has both url and doi changed to new values - no duplicate detected
124 is_update += 1
125 art = models.Article.pull(update_article_id)
126 article.merge(art)
128 return is_update
130 def _prepare_update_publisher(self, article, duplicate, merge_duplicate, account, limit_to_account):
131 # before saving, we need to determine whether this is a new article
132 # or an update
133 is_update = 0
135 if duplicate is not None: # else -> it is new article
136 # check if can update the duplicate - if is the owner
137 has_permissions_result = self.has_permissions(account, article, limit_to_account)
138 if isinstance(has_permissions_result, bool) and has_permissions_result == True:
139 doi_or_ft_updated = self._doi_or_fulltext_updated(article, duplicate.id)
140 if doi_or_ft_updated or not merge_duplicate:
141 raise exceptions.DuplicateArticleException(Messages.EXCEPTION_IDENTIFIER_CHANGE)
142 else:
143 is_update += 1
144 article.merge(duplicate)
145 else:
146 raise exceptions.DuplicateArticleException(Messages.EXCEPTION_DUPLICATE_NO_PERMISSION)
147 return is_update
149 # here we should have the final point of validation for all incoming articles
150 def _validate_issns(self, article):
151 # only 2 issns: one print, one electronic
152 b = article.bibjson()
153 pissn = b.get_identifiers("pissn")
154 eissn = b.get_identifiers("eissn")
156 if len(pissn) > 1 or len(eissn) > 1:
157 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_TOO_MANY_ISSNS)
159 pissn = b.get_one_identifier("pissn")
160 eissn = b.get_one_identifier("eissn")
162 #pissn and eissn identical
163 if pissn == eissn:
164 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_IDENTICAL_PISSN_AND_EISSN)
167 def create_article(self, article, account, duplicate_check=True, merge_duplicate=True,
168 limit_to_account=True, add_journal_info=False, dry_run=False, update_article_id=None):
170 """
171 Create an individual article in the database
173 This method will check and merge any duplicates, and report back on successes and failures in a manner consistent with
174 batch_create_articles.
176 ~~->ArticleCreate:Feature~~
178 :param article: The article to be created
179 :param account: The account creating the article
180 :param duplicate_check: Whether to check for duplicates in the database
181 :param merge_duplicate: Whether to merge duplicate if found. If set to False, may result in a DuplicateArticleException
182 :param limit_to_account: Whether to limit create to when the account owns the journal to which the article belongs
183 :param add_journal_info: Should we fetch the journal info and attach it to the article before save?
184 :param dry_run: Whether to actuall save, or if this is just to either see if it would work, or to prep for a batch ingest
185 :param update_article_id: The article id that it is supposed to be an update to; taken into consideration ONLY
186 if duplicate_check == True and merge_duplicate == True
187 :return:
188 """
189 # first validate the incoming arguments to ensure that we've got the right thing
190 argvalidate("create_article", [
191 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
192 {"arg": account, "instance": models.Account, "allow_none": False, "arg_name": "account"},
193 {"arg": duplicate_check, "instance": bool, "allow_none": False, "arg_name": "duplicate_check"},
194 {"arg": merge_duplicate, "instance": bool, "allow_none": False, "arg_name": "merge_duplicate"},
195 {"arg": limit_to_account, "instance": bool, "allow_none": False, "arg_name": "limit_to_account"},
196 {"arg": add_journal_info, "instance": bool, "allow_none": False, "arg_name": "add_journal_info"},
197 {"arg": dry_run, "instance": bool, "allow_none": False, "arg_name": "dry_run"},
198 {"arg": update_article_id, "instance": str, "allow_none": True, "arg_name": "update_article_id"}
199 ], exceptions.ArgumentException)
201 # quickly validate that the article is acceptable - it must have a DOI and/or a fulltext
202 # this raises an exception if the article is not acceptable, containing all the relevant validation details
204 try:
205 self.is_acceptable(article)
206 except Exception as e:
207 raise e
209 has_permissions_result = self.has_permissions(account, article, limit_to_account)
210 if isinstance(has_permissions_result,dict):
211 return has_permissions_result
213 is_update = 0
214 if duplicate_check:
215 # ~~!ArticleCreate:Feature->ArticleDeduplication:Feature~~
216 duplicate = self.get_duplicate(article)
217 try:
218 if account.has_role("admin") and update_article_id is not None: # is update_article_id is None then treat as normal publisher upload
219 # for testing by admin
220 is_update = self._prepare_update_admin(article, duplicate, update_article_id, merge_duplicate)
221 else:
222 is_update = self._prepare_update_publisher(article, duplicate, merge_duplicate, account, limit_to_account)
223 except (exceptions.DuplicateArticleException, exceptions.ArticleMergeConflict, exceptions.ConfigurationException) as e:
224 raise e
226 if add_journal_info:
227 article.add_journal_metadata()
229 # finally, save the new article
230 if not dry_run:
231 article.save()
233 return {"success": 1, "fail": 0, "update": is_update, "new": 1 - is_update, "shared": set(), "unowned": set(),
234 "unmatched": set()}
237 def has_permissions(self, account, article, limit_to_account):
239 if limit_to_account:
240 legit = account.has_role("admin") or self.is_legitimate_owner(article, account.id)
241 if not legit:
242 owned, shared, unowned, unmatched = self.issn_ownership_status(article, account.id)
243 return {"success": 0, "fail": 1, "update": 0, "new": 0, "shared": shared, "unowned": unowned,
244 "unmatched": unmatched}
245 return True
248 def is_acceptable(self, article):
249 """
250 conduct some deep validation on the article to make sure we will accept it
251 or the moment, this just means making sure it has a DOI and a fulltext
252 """
253 bj = article.bibjson()
255 # do we have a DOI. If so, no need to go further
256 doi = bj.get_one_identifier(bj.DOI)
257 ft = bj.get_single_url(bj.FULLTEXT)
258 if doi is None and ft is None:
259 raise exceptions.ArticleNotAcceptable(message=Messages.EXCEPTION_NO_DOI_NO_FULLTEXT)
261 self._validate_issns(article)
263 def is_legitimate_owner(self, article, owner):
264 """
265 Determine if the owner id is the owner of the article
267 :param article: an article model
268 :param owner: string account ID
269 :return: True or False
270 """
271 # first validate the incoming arguments to ensure that we've got the right thing
272 argvalidate("is_legitimate_owner", [
273 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
274 {"arg": owner, "instance": str, "allow_none": False, "arg_name": "owner"}
275 ], exceptions.ArgumentException)
277 # get all the issns for the article
278 b = article.bibjson()
279 article_issns = b.get_identifiers(b.P_ISSN)
280 article_issns += b.get_identifiers(b.E_ISSN)
282 # check each issn against the index, and if a related journal is found
283 # record the owner of that journal
284 owners = []
285 seen_journal_issns = {}
286 for issn in article_issns:
287 journals = models.Journal.find_by_issn(issn)
288 if journals is not None and len(journals) > 0:
289 for j in journals:
290 owners.append(j.owner)
291 if j.owner not in seen_journal_issns:
292 seen_journal_issns[j.owner] = []
293 seen_journal_issns[j.owner] += j.bibjson().issns()
295 # deduplicate the list of owners
296 owners = list(set(owners))
298 # no owner means we can't confirm
299 if len(owners) == 0:
300 return False
302 # multiple owners means ownership of this article is confused
303 if len(owners) > 1:
304 return False
306 # if the found owner is not the same as the desired owner, return false
307 if owners[0] != owner:
308 return False
310 # single owner must still know of all supplied issns
311 journal_issns = set(seen_journal_issns[owner])
312 for issn in article_issns:
313 if issn not in journal_issns:
314 return False
316 return True
318 def _doi_or_fulltext_updated(self, new_article, update_id):
319 if new_article.id is None:
320 return False
322 old_art = models.Article.pull(update_id) # ~~->Article:Model~~
323 old_doi = old_art.get_normalised_doi()
324 old_ft_url = old_art.get_normalised_fulltext()
326 new_doi = new_article.get_normalised_doi()
327 new_ft_url = new_article.get_normalised_fulltext()
329 return old_doi != new_doi or old_ft_url != new_ft_url
331 def issn_ownership_status(self, article, owner):
332 """
333 Determine the ownership status of the supplied owner over the issns in the given article
335 This will give you a tuple back which lists the following (in order):
337 * which issns are owned by that owner
338 * which issns are shared with another owner
339 * which issns are not owned by this owner
340 * which issns are not found in the DOAJ database
342 :param article:
343 :param owner:
344 :return:
345 """
346 # first validate the incoming arguments to ensure that we've got the right thing
347 argvalidate("issn_ownership_status", [
348 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
349 {"arg": owner, "instance": str, "allow_none": False, "arg_name": "owner"}
350 ], exceptions.ArgumentException)
352 # get all the issns for the article
353 b = article.bibjson()
354 issns = b.get_identifiers(b.P_ISSN)
355 issns += b.get_identifiers(b.E_ISSN)
357 owned = []
358 shared = []
359 unowned = []
360 unmatched = []
362 # check each issn against the index, and if a related journal is found
363 # record the owner of that journal
364 seen_issns = {}
365 for issn in issns:
366 journals = models.Journal.find_by_issn(issn)
367 if journals is not None and len(journals) > 0:
368 for j in journals:
369 if issn not in seen_issns:
370 seen_issns[issn] = set()
371 if j.owner is not None:
372 seen_issns[issn].add(j.owner)
374 for issn in issns:
375 if issn not in list(seen_issns.keys()):
376 unmatched.append(issn)
378 for issn, owners in seen_issns.items():
379 owners = list(owners)
380 if len(owners) == 0:
381 unowned.append(issn)
382 elif len(owners) == 1 and owners[0] == owner:
383 owned.append(issn)
384 elif len(owners) == 1 and owners[0] != owner:
385 unowned.append(issn)
386 elif len(owners) > 1:
387 if owner in owners:
388 shared.append(issn)
389 else:
390 unowned.append(issn)
392 return owned, shared, unowned, unmatched
394 def get_duplicate(self, article):
395 """
396 Get at most one one, most recent, duplicate article for the supplied article.
398 If the owner id is provided, this will limit the search to duplicates owned by that owner
400 ~~->ArticleDeduplication:Feature~~
402 :param article:
403 :param owner:
404 :return:
405 """
406 # first validate the incoming arguments to ensure that we've got the right thing
407 argvalidate("get_duplicate", [
408 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
409 ], exceptions.ArgumentException)
411 article.prep()
412 dup = self.get_duplicates(article, max_results=2)
413 if len(dup) > 1:
414 raise exceptions.ArticleMergeConflict(Messages.EXCEPTION_ARTICLE_MERGE_CONFLICT)
415 elif dup:
416 return dup.pop()
417 else:
418 return None
420 def get_duplicates(self, article, max_results=10):
421 """
422 Get all known duplicates of an article
424 If the owner id is provided, this will limit the search to duplicates owned by that owner
426 ~~->ArticleDeduplication:Feature~~
428 :param article:
429 :return:
430 """
431 # first validate the incoming arguments to ensure that we've got the right thing
432 argvalidate("get_duplicates", [
433 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
434 ], exceptions.ArgumentException)
436 possible_articles_dict = self.discover_duplicates(article, max_results)
437 if not possible_articles_dict:
438 return []
440 # We don't need the details of duplicate types, so flatten the lists.
441 all_possible_articles = [article for dup_type in list(possible_articles_dict.values()) for article in dup_type]
443 # An article may fulfil more than one duplication criteria, so needs to be de-duplicated
444 ids = []
445 possible_articles = []
446 for a in all_possible_articles:
447 if a.id not in ids:
448 ids.append(a.id)
449 possible_articles.append(a)
451 # Sort the articles newest -> oldest by last_updated so we can get the most recent at [0]
452 possible_articles.sort(key=lambda x: datetime.strptime(x.last_updated, "%Y-%m-%dT%H:%M:%SZ"), reverse=True)
454 return possible_articles[:max_results]
456 def discover_duplicates(self, article, results_per_match_type=10, include_article=True):
457 """
458 Identify duplicates, separated by duplication criteria
460 If the owner id is provided, this will limit the search to duplicates owned by that owner
462 ~~->ArticleDeduplication:Feature~~
464 :param article:
465 :return:
466 """
467 # first validate the incoming arguments to ensure that we've got the right thing
468 argvalidate("discover_duplicates", [
469 {"arg": article, "instance": models.Article, "allow_none": False, "arg_name": "article"},
470 ], exceptions.ArgumentException)
472 # if we get more than one result, we'll record them here, and then at the end
473 # if we haven't got a definitive match we'll pick the most likely candidate
474 # (this isn't as bad as it sounds - the identifiers are pretty reliable, this catches
475 # issues like where there are already duplicates in the data, and not matching one
476 # of them propagates the issue)
477 possible_articles = {}
478 found = False
480 # Checking by DOI is our first step
481 # dois = b.get_identifiers(b.DOI)
482 doi = article.get_normalised_doi()
483 if doi is not None:
484 if isinstance(doi, str) and doi != '':
485 articles = models.Article.duplicates(doi=doi, size=results_per_match_type)
486 if len(articles) > 0:
487 if include_article:
488 possible_articles['doi'] = [a for a in articles]
489 else:
490 possible_articles['doi'] = [a for a in articles if a.id != article.id]
491 if len(possible_articles['doi']) > 0:
492 found = True
494 # Second test is to look by fulltext url
495 fulltext = article.get_normalised_fulltext()
496 if fulltext is not None:
497 articles = models.Article.duplicates(fulltexts=fulltext, size=results_per_match_type)
498 if len(articles) > 0:
499 if include_article:
500 possible_articles['fulltext'] = [a for a in articles]
501 else:
502 possible_articles['fulltext'] = [a for a in articles if a.id != article.id]
503 if possible_articles['fulltext']:
504 found = True
506 if doi is None and fulltext is None:
507 raise exceptions.DuplicateArticleException(Messages.EXCEPTION_DETECT_DUPLICATE_NO_ID)
509 return possible_articles if found else None