Coverage for portality/tasks/preservation.py: 62%

472 statements  

coverage.py v7.13.5, created at 2026-05-05 00:09 +0100

1 import csv

2 import hashlib

3 import json

4 import os

5 import shutil

6 import tarfile

7 from copy import deepcopy

8 from zipfile import ZipFile

9

10 import requests

11 from bagit import make_bag, BagError

12

13 from portality.background import BackgroundTask, BackgroundApi

14 from portality.bll import DOAJ

15 from portality.core import app

16 from portality.lib import dates

17 from portality.models import Account, Article, BackgroundJob, PreservationState

18 from portality.regex import DOI_COMPILED, HTTP_URL_COMPILED

19 from portality.tasks.helpers import background_helper

20 from portality.tasks.redis_huey import events_queue as queue

21 

22 

23 class PreservationException(Exception): 

24 """~~PreservationException:Exception~~""" 

25 pass 

26 

27 

28 class PreservationStorageException(Exception): 

29 pass 

30 

31 

32 class ValidationError(Exception): 

33 pass 

34 

35 

36 class ArticlePackage: 

37 """ ~~ArticlePackage:Feature~~""" 

38 

39 def __init__(self, article_dir, files): 

40 self.issn = None 

41 self.article_id = None 

42 self.metadata = None 

43 self.article_dir = article_dir 

44 self.article_files = files 

45 self.package_dir = None 

46 self.has_error = False 

47 self.error_details = None 

48 

49 def create_article_bagit_structure(self): 

50 """ ~~-> BagIt:Library~~ 

51 Create directory structure for packaging 

52 Create required additional files 

53 Create bagit files 

54 """ 

55 # Validate if required data is available 

56 self.validate() 

57 

58 journal_dir = os.path.join(self.package_dir, self.issn) 

59 if not os.path.exists(journal_dir): 

60 os.mkdir(journal_dir) 

61 

62 dest_article_dir = os.path.join(journal_dir, self.article_id) 

63 if not os.path.exists(dest_article_dir): 

64 # Create article directory 

65 os.mkdir(dest_article_dir) 

66 

67 # Create metadata directory 

68 metada_dir = os.path.join(dest_article_dir, "metadata") 

69 if not os.path.exists(metada_dir): 

70 os.mkdir(metada_dir) 

71 

72 # Copy the files from user uploaded directory to the package 

73 for file in self.article_files: 

74 if not file == Preservation.IDENTIFIER_FILE: 

75 src = os.path.join(self.article_dir, file) 

76 dest = os.path.join(dest_article_dir, file) 

77 shutil.copy(src, dest) 

78 

79 # Create metadata file with article information 

80 with open(os.path.join(metada_dir, "metadata.json"), 'w+') as metadata_file: 

81 metadata_file.write(json.dumps(self.metadata, indent=4)) 

82 

83 # Create an identifier file with the UUID of the article 

84 with open(os.path.join(metada_dir, "identifier.txt"), 'w+') as metadata_file: 

85 metadata_file.write(self.article_id) 

86 

87 try: 

88 # Bag the article 

89 make_bag(dest_article_dir, checksums=["sha256"]) 

90 except BagError: 

91 app.logger.exception(f"Error while creating Bag for article {self.article_id}") 

92 raise PreservationException("Error while creating Bag") 

93 

94 def validate(self): 

95 variables_list = [] 

96 

97 if not self.package_dir: 

98 variables_list.append("package_dir") 

99 if not self.metadata: 

100 variables_list.append("metadata") 

101 if not self.article_dir: 

102 variables_list.append("article_dir") 

103 if not self.article_files or len(self.article_files) == 0: 

104 variables_list.append("article_files") 

105 if not self.article_id: 

106 variables_list.append("article_id") 

107 if not self.issn: 

108 variables_list.append("issn") 

109 

110 if len(variables_list) > 0: 

111 app.logger.debug(f"Validation Values : package_dir {self.package_dir} " 

112 f"metadata {self.metadata} article_dir {self.article_dir} " 

113 f"article_files {self.article_files} article_id {self.article_id} issn {self.issn}") 

114 raise ValidationError(f"Required fields cannot be empty {variables_list}") 

115 

116 

117 class ArticlesList: 

118 """This class contains different types of lists depending on the article state""" 

119 

120 def __init__(self): 

121 self.__successful_articles = [] 

122 self.__unowned_articles = [] 

123 self.__no_identifier_articles = [] 

124 self.__unbagged_articles = [] 

125 self.__not_found_articles = [] 

126 self.__no_files_articles = [] 

127 self.__uploaded_journals = [] 

128 self.has_errors = False 

129 

130 def add_successful_article(self, article: ArticlePackage): 

131 self.__successful_articles.append(os.path.basename(article.article_dir)) 

132 

133 def add_uploaded_journal(self, journal_package): 

134 self.__uploaded_journals.append(journal_package) 

135 

136 def add_unowned_articles(self, article: ArticlePackage): 

137 self.has_errors = True 

138 self.__unowned_articles.append(os.path.basename(article.article_dir)) 

139 

140 def add_no_identifier_articles(self, article: ArticlePackage): 

141 self.has_errors = True 

142 self.__no_identifier_articles.append(os.path.basename(article.article_dir)) 

143 

144 def add_unbagged_articles(self, article: ArticlePackage): 

145 self.has_errors = True 

146 self.__unbagged_articles.append(os.path.basename(article.article_dir)) 

147 

148 def add_not_found_articles(self, article: ArticlePackage): 

149 self.has_errors = True 

150 self.__not_found_articles.append(os.path.basename(article.article_dir)) 

151 

152 def add_no_files_articles(self, article: ArticlePackage): 

153 self.__no_files_articles.append(os.path.basename(article.article_dir)) 

154 

155 def successful_articles(self): 

156 return self.__successful_articles 

157 

158 def unowned_articles(self): 

159 return self.__unowned_articles 

160 

161 def no_identifier_articles(self): 

162 return self.__no_identifier_articles 

163 

164 def unbagged_articles(self): 

165 return self.__unbagged_articles 

166 

167 def not_found_articles(self): 

168 return self.__not_found_articles 

169 

170 def no_files_articles(self): 

171 return self.__no_files_articles 

172 

173 def uploaded_journals(self): 

174 return self.__uploaded_journals 

175 

176 def get_count(self): 
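# Total number of processed articles across all outcome lists (uploaded journals are tracked separately and not counted here).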

177 return len(self.__successful_articles) + \ 

178 len(self.__unowned_articles) + \ 

179 len(self.__no_identifier_articles) + \ 

180 len(self.__unbagged_articles) + \ 

181 len(self.__not_found_articles) + \ 

182 len(self.__no_files_articles) 

183 

184 def is_partial_success(self): 

185 if len(self.__successful_articles) > 0 and \ 

186 (len(self.__unbagged_articles) > 0 or 

187 len(self.__unowned_articles) > 0 or 

188 len(self.__not_found_articles) > 0 or 

189 len(self.__no_identifier_articles) > 0 or 

190 len(self.__no_files_articles) > 0): 

191 return True 

192 

193 return False 

194 

195 

196 class PreservationBackgroundTask(BackgroundTask): 

197 """~~PreservationBackground:Feature~~""" 

198 

199 __action__ = "preserve" 

200 

201 @classmethod 

202 def prepare(cls, username, **kwargs): 

203 """ 

204 Create necessary directories and save the file. 

205 Creates the background job 

206 :param username: 

207 :param kwargs: 

208 :return: background job 

209 """ 

210 

211 created_time = dates.now_str("%Y-%m-%d-%H-%M-%S") 

212 dir_name = username + "-" + created_time 

213 local_dir = os.path.join(Preservation.UPLOAD_DIR, dir_name) 

214 file = kwargs.get("upload_file") 

215 

216 preservation = Preservation(local_dir, username) 

217 preservation.save_file(file) 

218 

219 # prepare a job record 

220 params = {} 

221 cls.set_param(params, "local_dir", local_dir) 
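# Note: run() also reads a "model_id" param, which is not set in prepare().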

222 job = background_helper.create_job(username, cls.__action__, 

223 queue_id=huey_helper.queue_id, params=params) 

224 return job 

225 

226 def run(self): 
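# Overall flow: mark the PreservationState model as pending, extract the uploaded zip, build per-journal BagIt packages, tar and upload each journal plus its identifier file, then record success, partial success or failure on the model.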

227 

228 job = self.background_job 

229 

230 params = job.params 

231 local_dir = self.get_param(params, "local_dir") 

232 model_id = self.get_param(params, "model_id") 

233 app.logger.debug(f"Local dir {local_dir}") 

234 app.logger.debug(f"model_id {model_id}") 

235 

236 preserve_model = PreservationState.pull(model_id) 

237 preserve_model.background_task_id = job.id 

238 preserve_model.pending() 

239 preserve_model.save() 

240 

241 # ~~-> Preservation:Feature~~ 

242 preserv = Preservation(local_dir, job.user) 

243 preserv.upload_filename = preserve_model.filename 

244 try: 

245 job.add_audit_message("Extract zip file") 

246 preserv.extract_zip_file() 

247 app.logger.debug("Extracted zip file") 

248 

249 job.add_audit_message("Create Package structure") 

250 articles_list = preserv.create_package_structure() 

251 

252 app.logger.debug("Created package structure") 

253 

254 if len(articles_list.successful_articles()) > 0: 

255 # Each subdirectory is a journal and the directory name is the ISSN of the journal 

256 # iterate through the directories and upload each journal as an individual package 

257 dirs = [f.name for f in os.scandir(preserv.preservation_dir) if f.is_dir()] 

258 upload_failed = False 

259 for sub_dir in dirs: 

260 

261 package = PreservationPackage(preserv.preservation_dir, sub_dir, job.user) 

262 job.add_audit_message("Create preservation package for " + sub_dir) 

263 tar_file = package.create_package() 

264 

265 app.logger.debug(f"Created tar file {tar_file}") 

266 

267 job.add_audit_message("Create shasum for " + sub_dir) 

268 sha256 = package.sha256(package.tar_file) 

269 

270 job.add_audit_message("Upload package " + sub_dir) 

271 response = package.upload_package(sha256, package.tar_file) 

272 app.logger.debug(f"Uploaded. Response{response.text}") 

273 

274 job.add_audit_message("Validate response") 

275 self.validate_response(response, tar_file, sha256, preserve_model) 

276 

277 if preserve_model.status == 'failed': 

278 upload_failed = True 

279 break 

280 else: 

281 articles_list.add_uploaded_journal(package.tar_file_name) 

282 

283 # Upload the identifier file 

284 job.add_audit_message("Create shasum for identifier") 

285 sha256 = package.sha256(package.identifier_file) 

286 

287 identifier_file_name = os.path.basename(package.identifier_file) 

288 job.add_audit_message("Upload identifier file " + identifier_file_name) 

289 package.upload_package(sha256, package.identifier_file) 

290 articles_list.add_uploaded_journal(identifier_file_name) 

291 app.logger.debug(f"Uploaded identifier file " + identifier_file_name) 

292 

293 if not upload_failed: 

294 preserve_model.uploaded_to_ia() 

295 

296 # Check if only some of the articles were successful 

297 if articles_list.is_partial_success(): 

298 preserve_model.partial() 

299 preserve_model.save() 

300 else: 

301 # If no previous errors found, check other failure reasons 

302 if not preserve_model.error: 

303 # Check if any articles available 

304 if articles_list.get_count() == 0: 

305 preserve_model.failed(FailedReasons.no_article_found) 

306 preserve_model.save() 

307 # All the articles available are invalid 

308 else: 

309 preserve_model.failed(FailedReasons.no_valid_article_available) 

310 preserve_model.save() 

311 

312 self.save_articles_list(articles_list, preserve_model) 

313 

314 except (PreservationException, Exception) as exp: 

315 # ~~-> PreservationException:Exception~~ 

316 preserve_model.failed(str(exp)) 

317 preserve_model.save() 

318 app.logger.exception("Error at background task") 

319 raise 

320 

321 def save_articles_list(self, articles_list: ArticlesList, model: PreservationState): 

322 """ 

323 Saves articles info to the model 

324 :param articles_list: articles list 

325 :param model: model object 

326 """ 

327 if len(articles_list.successful_articles()) > 0: 

328 model.successful_articles(articles_list.successful_articles()) 

329 if len(articles_list.not_found_articles()) > 0: 

330 model.not_found_articles(articles_list.not_found_articles()) 

331 if len(articles_list.no_identifier_articles()) > 0: 

332 model.no_identifier_articles(articles_list.no_identifier_articles()) 

333 if len(articles_list.no_identifier_articles()) == articles_list.get_count(): 

334 model.failed(FailedReasons.no_identifier) 

335 if len(articles_list.unowned_articles()) > 0: 

336 model.unowned_articles(articles_list.unowned_articles()) 

337 if len(articles_list.unbagged_articles()) > 0: 

338 model.unbagged_articles(articles_list.unbagged_articles()) 

339 if len(articles_list.no_files_articles()) > 0: 

340 model.no_files_articles(articles_list.no_files_articles()) 

341 if len(articles_list.uploaded_journals()) > 0: 

342 model.uploaded_journals(articles_list.uploaded_journals()) 

343 model.save() 

344 

345 def cleanup(self): 

346 """ 

347 Cleanup any resources 

348 :return: 

349 """ 

350 job = self.background_job 

351 params = job.params 

352 local_dir = self.get_param(params, "local_dir") 

353 Preservation.delete_local_directory(local_dir) 

354 

355 def validate_response(self, response, tar_file, sha256, model): 

356 """ 

357 Validate the response from server 

358 :param response: response object 

359 :param tar_file: tar file name 

360 :param sha256: sha256sum value 

361 :param model: model object to update status 

362 """ 

363 if response.status_code == 200: 

364 res_json = json.loads(response.text) 

365 files = res_json["files"] 

366 # Success response 

367 # {"files": [{"name": "name_of_tarball.tar.gz", 

368 # "sha256": "decafbad"}]} 

369 if files: 

370 # Check if the response is type dict or list 

371 res_filename = None 

372 res_shasum = None 

373 if isinstance(files, dict): 

374 res_filename = files["name"] 

375 res_shasum = files["sha256"] 

376 elif isinstance(files, list): 

377 if len(files) > 0: 

378 res_filename = files[0]["name"] 

379 res_shasum = files[0]["sha256"] 

380 

381 if res_filename and res_filename == tar_file: 

382 if res_shasum and res_shasum == sha256: 

383 app.logger.info("successfully uploaded " + tar_file) 

384 else: 

385 model.failed(FailedReasons.checksum_doesnot_match) 

386 else: 

387 model.failed(FailedReasons.tar_filename_doesnot_match) 

388 

389 else: 

390 # Error response 

391 # {"result": "ERROR","manifest_type": "BagIt", 

392 # "manifests": [ 

393 # {"id": "033168cd016a49eb8c3097d800f1b85f", 

394 # "result": "SUCCESS"}, 

395 # {"id": "00003741594643f4996e2555a01e03c7", 

396 # "result": "ERROR", 

397 # "errors": [ 

398 # "missing_files": [], 

399 # "mismatch_hashes": [{ 

400 # "file": "path/to/file", 

401 # "expected": "decafbad", 

402 # "actual": "deadbeaf"}], 

403 # "manifest_parsing_errors": [ 

404 # "some weird error"]]}]} 

405 result = res_json["result"] 

406 if result and result == "ERROR": 

407 error_str = FailedReasons.error_response 

408 else: 

409 error_str = FailedReasons.unknown_error_response 

410 

411 app.logger.error(error_str) 

412 model.failed(error_str) 

413 

414 model.save() 

415 else: 

416 app.logger.error(f"Upload failed for {tar_file}. Reason - {response.text}") 

417 model.failed(response.text) 

418 model.save() 

419 

420 @classmethod 

421 def submit(cls, background_job): 

422 """ 

423 Submit Background job""" 

424 background_job.save(blocking=True) 

425 preserve.schedule(args=(background_job.id,), delay=app.config.get('HUEY_ASYNC_DELAY', 10)) 

426 

427 
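# Helper that binds this background task to the huey events queue; prepare() uses its queue_id and the decorator below registers the task for execution.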

428 huey_helper = PreservationBackgroundTask.create_huey_helper(queue) 

429 

430 

431 @huey_helper.register_execute(is_load_config=True) 

432 def preserve(job_id): 

433 """~~-> PreservationBackgroundTask:Queue""" 

434 job = BackgroundJob.pull(job_id) 

435 task = PreservationBackgroundTask(job) 

436 BackgroundApi.execute(task) 

437 

438 

439 class CSVReader: 

440 """~~CSVReader:Feature~~""" 

441 

442 # Column names for the csv file. 

443 # Extra identifier columns are included to tolerate user mistakes such as empty identifiers 

444 # At most 2 identifiers are expected (Full Text URL, DOI), in any order 

445 FIELD_DIR = "dir_name" 

446 FIELDS = (FIELD_DIR, "id_1", "id_2", "id_3", "id_4") 
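# Hypothetical example row: article_1,10.1234/example-doi,https://example.com/full-text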

447 

448 def __init__(self, csv_file): 

449 self.__csv_file = csv_file 

450 

451 def articles_info(self): 

452 """ 

453 Reads the csv file and returns dictionary with first column(directory name) as keys 

454 and remaining columns as array elements. 

455 

456 Ex: {'article_1': ['http://link.springer.com/article/10.1186/s40478-018-0619-9', 

457 '10.1136/bmjophth-2021-000774'], 'article_2': ['10.1136/bmjophth-2021-000775']} 

458 

459 :return: Dictionary with articles info 

460 """ 

461 data = {} 

462 

463 with open(self.__csv_file, mode='r', encoding='utf-8-sig') as file: 

464 reader = csv.DictReader(file, CSVReader.FIELDS) 

465 for row in reader: 

466 dir_name = row[CSVReader.FIELD_DIR] 

467 # Remove first column so it will not be part of iteration later 

468 row.pop(CSVReader.FIELD_DIR) 

469 data[dir_name] = [] 

470 for key in row.keys(): 

471 if row[key]: 

472 data[dir_name].append(row[key]) 

473 return data 

474 

475 

476 class Preservation: 

477 """~~Preservation:Feature~~""" 

478 

479 # Zip file name used when saving the uploaded zip to the temp directory 

480 ARTICLES_ZIP_NAME = "articles.zip" 

481 # Identifier file name 

482 IDENTIFIER_FILE = "identifier.txt" 

483 # CSV file for identifiers 

484 IDENTIFIERS_CSV = "identifiers.csv" 

485 # Temp directory 

486 UPLOAD_DIR = app.config.get("UPLOAD_DIR", ".") 

487 

488 def __init__(self, local_dir, owner): 

489 self.__dir_name = os.path.basename(local_dir) 

490 self.__local_dir = os.path.join(local_dir, "tmp") 

491 self.__preservation_dir = os.path.join(local_dir, self.__dir_name) 
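# Directory layout: <local_dir>/tmp holds the uploaded zip and its extracted contents, while <local_dir>/<dir_name> is where the preservation package structure is assembled.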

492 self.__csv_articles_dict = None 

493 self.__owner = owner 

494 self.upload_filename = None 

495 

496 @property 

497 def dir_name(self): 

498 return self.__dir_name 

499 

500 @property 

501 def preservation_dir(self): 

502 return self.__preservation_dir 

503 

504 def create_local_directories(self): 

505 """ 

506 Create local directories to download the files and 

507 to create preservation package 

508 """ 

509 try: 

510 os.makedirs(self.__local_dir, exist_ok=True) 

511 os.makedirs(self.__preservation_dir, exist_ok=True) 

512 except OSError: 

513 raise PreservationStorageException("Could not create temp directory") 

514 

515 @classmethod 

516 def delete_local_directory(cls, local_dir): 

517 """ 

518 Deletes the directory 

519 """ 

520 if os.path.exists(local_dir): 

521 try: 

522 shutil.rmtree(local_dir) 

523 except Exception: 

524 raise PreservationStorageException("Could not delete Temp directory") 

525 

526 def save_file(self, file): 

527 """ 

528 Save the file on to local directory 

529 :param file: File object 

530 """ 

531 self.create_local_directories() 

532 file_path = os.path.join(self.__local_dir, Preservation.ARTICLES_ZIP_NAME) 

533 try: 

534 file.save(file_path) 

535 except Exception: 

536 raise PreservationStorageException("Could not save file in Temp directory") 

537 

538 def extract_zip_file(self): 

539 """ 

540 Extracts zip file in the Temp directory 

541 """ 

542 file_path = os.path.join(self.__local_dir, Preservation.ARTICLES_ZIP_NAME) 

543 

544 if os.path.exists(file_path): 

545 with ZipFile(file_path, 'r') as zFile: 

546 zFile.extractall(self.__local_dir) 

547 else: 

548 raise PreservationException(f"Could not find zip file at Temp directory {file_path}") 

549 

550 def create_package_structure(self) -> ArticlesList: 

551 """ 

552 Create preservation package 

553 

554 Iterates through the sub directories. 

555 Retrieve article info for each article. 

556 Creates preservation directories 

557 

558 """ 

559 articles_list = ArticlesList() 

560 

561 for dir, subdirs, files in os.walk(self.__local_dir): 
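# Walk the extracted upload: the root itself is skipped, first-level directories are scanned for identifiers.csv, and deeper directories are processed as individual articles.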

562 

563 if dir == self.__local_dir: 

564 continue 

565 

566 app.logger.debug("Directory : " + dir) 

567 app.logger.debug("Sub Directories : " + str(subdirs)) 

568 app.logger.debug("Files : " + str(files)) 

569 

570 # Fetch identifiers at the root directory 

571 if os.path.dirname(dir) == self.__local_dir: 

572 for file in files: 

573 if Preservation.IDENTIFIERS_CSV.lower() == file.lower(): 

574 # Get articles info from csv file 

575 # ~~-> CSVReader:Feature~~ 

576 csv_reader = CSVReader(os.path.join(dir, file)) 

577 self.__csv_articles_dict = csv_reader.articles_info() 

578 break 

579 # process only the directories that have articles 

580 else: 

581 self.__process_article(dir, files, articles_list) 

582 

583 return articles_list 

584 

585 def __process_article(self, dir_path, files, articles_list): 

586 

587 identifiers = None 

588 dir_name = os.path.basename(dir_path) 

589 package = ArticlePackage(dir_path, files) 

590 

591 if not os.path.dirname(dir_path) == self.__local_dir: 

592 if not self.__has_article_files(files): 

593 articles_list.add_no_files_articles(package) 

594 return 

595 

596 # check if the identifier file exists 

597 for file in files: 

598 if Preservation.IDENTIFIER_FILE.lower() == file.lower(): 
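# identifier.txt may list several identifiers (DOI and/or full-text URL), one per line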

599 with open(os.path.join(dir_path, file)) as identifier_file: 

600 identifiers = identifier_file.read().splitlines() 

601 

602 if not identifiers and self.__csv_articles_dict: 

603 if dir_name in self.__csv_articles_dict: 

604 identifiers = self.__csv_articles_dict[dir_name] 

605 

606 if identifiers: 

607 article = self.get_article(identifiers) 

608 

609 if article: 

610 article_data = article.data 

611 

612 is_owner = self.owner_of_article(article) 

613 

614 if is_owner is True: 

615 issn, article_id, metadata_json = self.get_article_info(article_data) 

616 try: 

617 package = ArticlePackage(dir_path, files) 

618 package.issn = issn 

619 package.article_id = article_id 

620 package.metadata = metadata_json 

621 package.package_dir = self.__preservation_dir 

622 

623 package.create_article_bagit_structure() 

624 

625 # Create and update the identifier file for all articles in the journal 

626 with open(os.path.join(self.__preservation_dir, issn + ".txt"), 'a') as identifier_file: 

627 identifier_file.write(os.path.basename(dir_path) + "," + article_id + "," + 

628 ','.join(identifiers) + "\n") 

629 

630 articles_list.add_successful_article(package) 

631 except Exception: 

632 articles_list.add_unbagged_articles(package) 

633 app.logger.exception(f"Error while create article ( {article_id} ) package") 

634 else: 

635 articles_list.add_unowned_articles(package) 

636 

637 else: 

638 # skip the article if not found 

639 app.logger.error(f"Could not retrieve article for identifier(s) {identifiers}") 

640 articles_list.add_not_found_articles(package) 

641 

642 else: 

643 # did not find any identifier for article 

644 articles_list.add_no_identifier_articles(package) 

645 

646 def __has_article_files(self, files): 

647 """ 

648 Checks if any article files are available 

649 :param files: 

650 :return: True if files available otherwise returns False 

651 """ 

652 no_of_files = len(files) 
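# identifier.txt on its own does not count as article content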

653 if Preservation.IDENTIFIER_FILE in files: 

654 if no_of_files > 1: 

655 return True 

656 else: 

657 return False 

658 else: 

659 if no_of_files > 0: 

660 return True 

661 else: 

662 return False 

663 

664 def owner_of_article(self, article): 

665 """ 

666 Checks if the article is owned by the user 

667 :param article: 

668 :return: 

669 """ 

670 articleService = DOAJ.articleService() 

671 account = Account.pull(self.__owner) 

672 is_owner = articleService.has_permissions(account, article, True) 

673 return is_owner 

674 

675 def get_article(self, identifiers): 

676 """ 

677 Checks if the identifier is doi or full text 

678 Pulls article related to the identifier 

679 :param identifiers: 

680 :return: article 

681 """ 

682 article = None 
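# Note: as written, the loop returns after the first identifier is checked (matched or not), so later identifiers are never tried.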

683 for identifier in identifiers: 

684 if DOI_COMPILED.match(identifier): 

685 article = Article.pull_by_key("bibjson.identifier.id", identifier) 

686 elif HTTP_URL_COMPILED.match(identifier): 

687 article = Article.pull_by_key("bibjson.link.url", identifier) 

688 if article: 

689 return article 

690 else: 

691 return None 

692 

693 def get_article_info(self, article_json): 

694 """ 

695 Returns article info 

696 :param article_json: 

697 :return: issn, article id, metadata json 

698 """ 

699 

700 metadata_json = self.get_metadata_json(article_json) 

701 issn = article_json["bibjson"]["journal"]["issns"][0] 

702 article_id = article_json["id"] 

703 

704 return issn, article_id, metadata_json 

705 

706 def get_metadata_json(self, article_json): 

707 """ 

708 Returns metadata of article which is required for preservation 

709 :return: metadata 

710 """ 

711 # Remove unnecessary data 

712 metadata = deepcopy(article_json) 

713 metadata.pop("index") 

714 metadata.pop("admin") 

715 metadata.pop("es_type") 

716 

717 return metadata 

718 

719 

720 class PreservationPackage: 

721 """~~PreservationPackage:Feature~~ 

722 Creates the preservation package and uploads it to the Internet Archive server 

723 """ 

724 

725 def __init__(self, preservation_dir, journal_dir, owner): 

726 self.preservation_dir = preservation_dir 

727 self.journal_dir = journal_dir 

728 self.package_dir = os.path.join(self.preservation_dir, journal_dir) 

729 self.created_time = dates.now_str("%Y-%m-%d-%H-%M-%S") 
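# Package artefacts carry the creation timestamp: <journal_dir>_<created_time>.tar.gz and a matching _<created_time>.txt identifier file.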

730 self.tar_file = self.package_dir + "_" + self.created_time + ".tar.gz" 

731 self.tar_file_name = os.path.basename(self.tar_file) 

732 self.__owner = owner 

733 self.identifier_file = self.package_dir + "_" + self.created_time + ".txt" 

734 try: 

735 # Rename the identifier file to match the tar file 

736 shutil.move(self.package_dir + ".txt", self.identifier_file) 

737 except Exception as e: 

738 app.logger.exception(e) 

739 

740 def create_package(self): 

741 """ 

742 Creates tar file for the package directory 

743 :return: tar file name 

744 """ 

745 try: 

746 with tarfile.open(self.tar_file, "w:gz") as tar: 

747 tar.add(self.package_dir, arcname=os.path.basename(self.package_dir)) 

748 except Exception as exp: 

749 app.logger.exception("Error creating tar file") 

750 raise PreservationException("Error while creating the tar file") 

751 

752 return self.tar_file_name 

753 

754 def upload_package(self, sha256sum, file): 

755 

756 url = app.config.get("PRESERVATION_URL") 

757 username = app.config.get("PRESERVATION_USERNAME") 

758 password = app.config.get("PRESERVATION_PASSWD") 

759 collection_dict = app.config.get("PRESERVATION_COLLECTION") 
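# PRESERVATION_COLLECTION maps each owner to a (collection name, collection id) pair.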

760 params = collection_dict[self.__owner] 

761 collection = params[0] 

762 collection_id = params[1] 

763 

764 file_name = os.path.basename(file) 

765 

766 # payload for upload request 

767 payload = { 

768 'directories': file_name, 

769 'org': 'DOAJ', 

770 'client': 'DOAJ_CLI', 

771 'username': 'doaj_uploader', 

772 'size': '', 

773 'organization': '1', 

774 'orgname': 'DOAJ', 

775 'collection': collection_id, 

776 'collname': collection, 

777 'sha256sum': sha256sum 

778 } 

779 app.logger.info(payload) 

780 

781 headers = {} 

782 # get the file to upload 

783 try: 

784 with open(file, "rb") as f: 

785 files = {'file_field': (file_name, f)} 

786 response = requests.post(url, headers=headers, auth=(username, password), files=files, data=payload) 

787 except (IOError, Exception) as exp: 

788 app.logger.exception("Error opening the tar file") 

789 raise PreservationException("Error Uploading tar file to IA server") 

790 

791 return response 

792 

793 def sha256(self, file): 

794 """ 

795 Creates sha256 hash for the tar file 

796 """ 

797 sha256_hash = hashlib.sha256() 

798 

799 with open(file, "rb") as f: 

800 # Read and update hash string value in blocks of 64K 

801 for byte_block in iter(lambda: f.read(65536), b""): 

802 sha256_hash.update(byte_block) 

803 

804 return sha256_hash.hexdigest() 

805 

806 

807 class FailedReasons: 

808 no_identifier = "no_identifier" 

809 unknown = "unknown" 

810 checksum_doesnot_match = "checksum_doesnot_match" 

811 no_article_found = "no_article_found" 

812 no_valid_article_available = "no_valid_article_available" 

813 tar_filename_doesnot_match = "response_tar_filename_doesnot_match" 

814 error_response = "error_response" 

815 unknown_error_response = "unknown_error_response" 

816 collection_not_available = "collection_not_available"