Coverage for portality/tasks/preservation.py: 62%

472 statements  

coverage.py v7.13.5, created at 2026-05-05 00:09 +0100

1 import csv

2 import hashlib

3 import json

4 import os

5 import shutil

6 import tarfile

7 from copy import deepcopy

8 from zipfile import ZipFile

9

10 import requests

11 from bagit import make_bag, BagError

12

13 from portality.background import BackgroundTask, BackgroundApi

14 from portality.bll import DOAJ

15 from portality.core import app

16 from portality.lib import dates

17 from portality.models import Account, Article, BackgroundJob, PreservationState

18 from portality.regex import DOI_COMPILED, HTTP_URL_COMPILED

19 from portality.tasks.helpers import background_helper

20 from portality.tasks.redis_huey import events_queue as queue

21 

22 

23 class PreservationException(Exception): 

24 """~~PreservationException:Exception~~""" 

25 pass 

26 

27 

28 class PreservationStorageException(Exception): 

29 pass 

30 

31 

32 class ValidationError(Exception): 

33 pass 

34 

35 

36 class ArticlePackage: 

37 """ ~~ArticlePackage:Feature~~""" 

38 

39 def __init__(self, article_dir, files): 

40 self.issn = None 

41 self.article_id = None 

42 self.metadata = None 

43 self.article_dir = article_dir 

44 self.article_files = files 

45 self.package_dir = None 

46 self.has_error = False 

47 self.error_details = None 

48 

49 def create_article_bagit_structure(self): 

50 """ ~~-> BagIt:Library~~ 

51 Create directory structure for packaging 

52 Create required additional files 

53 Create bagit files 

54 """ 

55 # Validate if required data is available 

56 self.validate() 

57 

58 journal_dir = os.path.join(self.package_dir, self.issn) 

59 if not os.path.exists(journal_dir): 

60 os.mkdir(journal_dir) 

61 

62 dest_article_dir = os.path.join(journal_dir, self.article_id) 

63 if not os.path.exists(dest_article_dir): 

64 # Create article directory 

65 os.mkdir(dest_article_dir) 

66 

67 # Create metadata directory 

68 metada_dir = os.path.join(dest_article_dir, "metadata") 

69 if not os.path.exists(metada_dir): 

70 os.mkdir(metada_dir) 

71 

72 # Copy the files from user uploaded directory to the package 

73 for file in self.article_files: 

74 if not file == Preservation.IDENTIFIER_FILE: 

75 src = os.path.join(self.article_dir, file) 

76 dest = os.path.join(dest_article_dir, file) 

77 shutil.copy(src, dest) 

78 

79 # Create metadata file with article information 

80 with open(os.path.join(metada_dir, "metadata.json"), 'w+') as metadata_file: 

81 metadata_file.write(json.dumps(self.metadata, indent=4)) 

82 

83 # Create an identifier file with the UUID of the article 

84 with open(os.path.join(metada_dir, "identifier.txt"), 'w+') as metadata_file: 

85 metadata_file.write(self.article_id) 

86 

87 try: 

88 # Bag the article 

89 make_bag(dest_article_dir, checksums=["sha256"]) 

90 except BagError: 

91 app.logger.exception(f"Error while creating Bag for article {self.article_id}") 

92 raise PreservationException("Error while creating Bag") 

93 

94 def validate(self): 

95 variables_list = [] 

96 

97 if not self.package_dir: 

98 variables_list.append("package_dir") 

99 if not self.metadata: 

100 variables_list.append("metadata") 

101 if not self.article_dir: 

102 variables_list.append("article_dir") 

103 if not self.article_files or len(self.article_files) == 0: 

104 variables_list.append("article_files") 

105 if not self.article_id: 

106 variables_list.append("article_id") 

107 if not self.issn: 

108 variables_list.append("issn") 

109 

110 if len(variables_list) > 0: 

111 app.logger.debug(f"Validation Values : package_dir {self.package_dir} " 

112 f"metadata {self.metadata} article_dir {self.article_dir} " 

113 f"article_files {self.article_files} article_id {self.article_id} issn {self.issn}") 

114 raise ValidationError(f"Required fields cannot be empty {variables_list}") 

115 

116 

117 class ArticlesList: 

118 """This class contains different types of lists depending on the article state""" 

119 

120 def __init__(self): 

121 self.__successful_articles = [] 

122 self.__unowned_articles = [] 

123 self.__no_identifier_articles = [] 

124 self.__unbagged_articles = [] 

125 self.__not_found_articles = [] 

126 self.__no_files_articles = [] 

127 self.__uploaded_journals = [] 

128 self.has_errors = False 

129 

130 def add_successful_article(self, article: ArticlePackage): 

131 self.__successful_articles.append(os.path.basename(article.article_dir)) 

132 

133 def add_uploaded_journal(self, journal_package): 

134 self.__uploaded_journals.append(journal_package) 

135 

136 def add_unowned_articles(self, article: ArticlePackage): 

137 self.has_errors = True 

138 self.__unowned_articles.append(os.path.basename(article.article_dir)) 

139 

140 def add_no_identifier_articles(self, article: ArticlePackage): 

141 self.has_errors = True 

142 self.__no_identifier_articles.append(os.path.basename(article.article_dir)) 

143 

144 def add_unbagged_articles(self, article: ArticlePackage): 

145 self.has_errors = True 

146 self.__unbagged_articles.append(os.path.basename(article.article_dir)) 

147 

148 def add_not_found_articles(self, article: ArticlePackage): 

149 self.has_errors = True 

150 self.__not_found_articles.append(os.path.basename(article.article_dir)) 

151 

152 def add_no_files_articles(self, article: ArticlePackage): 

153 self.__no_files_articles.append(os.path.basename(article.article_dir)) 

154 

155 def successful_articles(self): 

156 return self.__successful_articles 

157 

158 def unowned_articles(self): 

159 return self.__unowned_articles 

160 

161 def no_identifier_articles(self): 

162 return self.__no_identifier_articles 

163 

164 def unbagged_articles(self): 

165 return self.__unbagged_articles 

166 

167 def not_found_articles(self): 

168 return self.__not_found_articles 

169 

170 def no_files_articles(self): 

171 return self.__no_files_articles 

172 

173 def uploaded_journals(self): 

174 return self.__uploaded_journals 

175 

176 def get_count(self): 
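# Total number of processed articles across all outcome lists (uploaded journals are tracked separately and not counted here).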

177 return len(self.__successful_articles) + \ 

178 len(self.__unowned_articles) + \ 

179 len(self.__no_identifier_articles) + \ 

180 len(self.__unbagged_articles) + \ 

181 len(self.__not_found_articles) + \ 

182 len(self.__no_files_articles) 

183 

184 def is_partial_success(self): 

185 if len(self.__successful_articles) > 0 and \ 

186 (len(self.__unbagged_articles) > 0 or 

187 len(self.__unowned_articles) > 0 or 

188 len(self.__not_found_articles) > 0 or 

189 len(self.__no_identifier_articles) > 0 or 

190 len(self.__no_files_articles) > 0): 

191 return True 

192 

193 return False 

194 

195 

196 class PreservationBackgroundTask(BackgroundTask): 

197 """~~PreservationBackground:Feature~~""" 

198 

199 __action__ = "preserve" 

200 

201 @classmethod 

202 def prepare(cls, username, **kwargs): 

203 """ 

204 Create necessary directories and save the file. 

205 Creates the background job 

206 :param username: 

207 :param kwargs: 

208 :return: background job 

209 """ 

210 

211 created_time = dates.now_str("%Y-%m-%d-%H-%M-%S") 

212 dir_name = username + "-" + created_time 

213 local_dir = os.path.join(Preservation.UPLOAD_DIR, dir_name) 

214 file = kwargs.get("upload_file") 

215 

216 preservation = Preservation(local_dir, username) 

217 preservation.save_file(file) 

218 

219 # prepare a job record 

220 params = {} 

221 cls.set_param(params, "local_dir", local_dir) 
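# Note: run() also reads a "model_id" param, which is not set in prepare().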

222 job = background_helper.create_job(username, cls.__action__, 

223 queue_id=huey_helper.queue_id, params=params) 

224 return job 

225 

226 def run(self): 
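# Overall flow: mark the PreservationState model as pending, extract the uploaded zip, build per-journal BagIt packages, tar and upload each journal plus its identifier file, then record success, partial success or failure on the model.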

227 

228 job = self.background_job 

229 

230 params = job.params 

231 local_dir = self.get_param(params, "local_dir") 

232 model_id = self.get_param(params, "model_id") 

233 app.logger.debug(f"Local dir {local_dir}") 

234 app.logger.debug(f"model_id {model_id}") 

235 

236 preserve_model = PreservationState.pull(model_id) 

237 preserve_model.background_task_id = job.id 

238 preserve_model.pending() 

239 preserve_model.save() 

240 

241 # ~~-> Preservation:Feature~~ 

242 preserv = Preservation(local_dir, job.user) 

243 preserv.upload_filename = preserve_model.filename 

244 try: 

245 job.add_audit_message("Extract zip file") 

246 preserv.extract_zip_file() 

247 app.logger.debug("Extracted zip file") 

248 

249 job.add_audit_message("Create Package structure") 

250 articles_list = preserv.create_package_structure() 

251 

252 app.logger.debug("Created package structure") 

253 

254 if len(articles_list.successful_articles()) > 0: 

255 # Each subdirectory is a journal and the directory name is the ISSN of the journal 

256 # iterate through the directories and upload each journal as an individual package 

257 dirs = [f.name for f in os.scandir(preserv.preservation_dir) if f.is_dir()] 

258 upload_failed = False 

259 for sub_dir in dirs: 

260 

261 package = PreservationPackage(preserv.preservation_dir, sub_dir, job.user) 

262 job.add_audit_message("Create preservation package for " + sub_dir) 

263 tar_file = package.create_package() 

264 

265 app.logger.debug(f"Created tar file {tar_file}") 

266 

267 job.add_audit_message("Create shasum for " + sub_dir) 

268 sha256 = package.sha256(package.tar_file) 

269 

270 job.add_audit_message("Upload package " + sub_dir) 

271 response = package.upload_package(sha256, package.tar_file) 

272 app.logger.debug(f"Uploaded. Response{response.text}") 

273 

274 job.add_audit_message("Validate response") 

275 self.validate_response(response, tar_file, sha256, preserve_model) 

276 

277 if preserve_model.status == 'failed': 

278 upload_failed = True 

279 break 

280 else: 

281 articles_list.add_uploaded_journal(package.tar_file_name) 

282 

283 # Upload the identifier file 

284 job.add_audit_message("Create shasum for identifier") 

285 sha256 = package.sha256(package.identifier_file) 

286 

287 identifier_file_name = os.path.basename(package.identifier_file) 

288 job.add_audit_message("Upload identifier file " + identifier_file_name) 

289 package.upload_package(sha256, package.identifier_file) 

290 articles_list.add_uploaded_journal(identifier_file_name) 

291 app.logger.debug(f"Uploaded identifier file " + identifier_file_name) 

292 

293 if not upload_failed: 

294 preserve_model.uploaded_to_ia() 

295 

296 # Check if only some of the articles were successful 

297 if articles_list.is_partial_success(): 

298 preserve_model.partial() 

299 preserve_model.save() 

300 else: 

301 # If no previous errors found, check other failure reasons 

302 if not preserve_model.error: 

303 # Check if any articles available 

304 if articles_list.get_count() == 0: 

305 preserve_model.failed(FailedReasons.no_article_found) 

306 preserve_model.save() 

307 # All the articles available are invalid 

308 else: 

309 preserve_model.failed(FailedReasons.no_valid_article_available) 

310 preserve_model.save() 

311 

312 self.save_articles_list(articles_list, preserve_model) 

313 

314 except (PreservationException, Exception) as exp: 

315 # ~~-> PreservationException:Exception~~ 

316 preserve_model.failed(str(exp)) 

317 preserve_model.save() 

318 app.logger.exception("Error at background task") 

319 raise 

320 

321 def save_articles_list(self, articles_list: ArticlesList, model: PreservationState): 

322 """ 

323 Saves articles info to the model 

324 :param articles_list: articles list 

325 :param model: model object 

326 """ 

327 if len(articles_list.successful_articles()) > 0: 

328 model.successful_articles(articles_list.successful_articles()) 

329 if len(articles_list.not_found_articles()) > 0: 

330 model.not_found_articles(articles_list.not_found_articles()) 

331 if len(articles_list.no_identifier_articles()) > 0: 

332 model.no_identifier_articles(articles_list.no_identifier_articles()) 

333 if len(articles_list.no_identifier_articles()) == articles_list.get_count(): 

334 model.failed(FailedReasons.no_identifier) 

335 if len(articles_list.unowned_articles()) > 0: 

336 model.unowned_articles(articles_list.unowned_articles()) 

337 if len(articles_list.unbagged_articles()) > 0: 

338 model.unbagged_articles(articles_list.unbagged_articles()) 

339 if len(articles_list.no_files_articles()) > 0: 

340 model.no_files_articles(articles_list.no_files_articles()) 

341 if len(articles_list.uploaded_journals()) > 0: 

342 model.uploaded_journals(articles_list.uploaded_journals()) 

343 model.save() 

344 

345 def cleanup(self): 

346 """ 

347 Cleanup any resources 

348 :return: 

349 """ 

350 job = self.background_job 

351 params = job.params 

352 local_dir = self.get_param(params, "local_dir") 

353 Preservation.delete_local_directory(local_dir) 

354 

355 def validate_response(self, response, tar_file, sha256, model): 

356 """ 

357 Validate the response from server 

358 :param response: response object 

359 :param tar_file: tar file name 

360 :param sha256: sha256sum value 

361 :param model: model object to update status 

362 """ 

363 if response.status_code == 200: 

364 res_json = json.loads(response.text) 

365 files = res_json["files"] 

366 # Success response 

367 # {"files": [{"name": "name_of_tarball.tar.gz", 

368 # "sha256": "decafbad"}]} 

369 if files: 

370 # Check if the response is type dict or list 

371 res_filename = None 

372 res_shasum = None 

373 if isinstance(files, dict): 

374 res_filename = files["name"] 

375 res_shasum = files["sha256"] 

376 elif isinstance(files, list): 

377 if len(files) > 0: 

378 res_filename = files[0]["name"] 

379 res_shasum = files[0]["sha256"] 

380 

381 if res_filename and res_filename == tar_file: 

382 if res_shasum and res_shasum == sha256: 

383 app.logger.info("successfully uploaded " + tar_file) 

384 else: 

385 model.failed(FailedReasons.checksum_doesnot_match) 

386 else: 

387 model.failed(FailedReasons.tar_filename_doesnot_match) 

388 

389 else: 

390 # Error response 

391 # {"result": "ERROR","manifest_type": "BagIt", 

392 # "manifests": [ 

393 # {"id": "033168cd016a49eb8c3097d800f1b85f", 

394 # "result": "SUCCESS"}, 

395 # {"id": "00003741594643f4996e2555a01e03c7", 

396 # "result": "ERROR", 

397 # "errors": [ 

398 # "missing_files": [], 

399 # "mismatch_hashes": [{ 

400 # "file": "path/to/file", 

401 # "expected": "decafbad", 

402 # "actual": "deadbeaf"}], 

403 # "manifest_parsing_errors": [ 

404 # "some weird error"]]}]} 

405 result = res_json["result"] 

406 if result and result == "ERROR": 

407 error_str = FailedReasons.error_response 

408 else: 

409 error_str = FailedReasons.unknown_error_response 

410 

411 app.logger.error(error_str) 

412 model.failed(error_str) 

413 

414 model.save() 

415 else: 

416 app.logger.error(f"Upload failed for {tar_file}. Reason - {response.text}") 

417 model.failed(response.text) 

418 model.save() 

419 

420 @classmethod 

421 def submit(cls, background_job): 

422 """ 

423 Submit Background job""" 

424 background_job.save(blocking=True) 

425 preserve.schedule(args=(background_job.id,), delay=app.config.get('HUEY_ASYNC_DELAY', 10)) 

426 

427 
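# Helper that binds this background task to the huey events queue; prepare() uses its queue_id and the decorator below registers the task for execution.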

428 huey_helper = PreservationBackgroundTask.create_huey_helper(queue) 

429 

430 

431 @huey_helper.register_execute(is_load_config=True) 

432 def preserve(job_id): 

433 """~~-> PreservationBackgroundTask:Queue""" 

434 job = BackgroundJob.pull(job_id) 

435 task = PreservationBackgroundTask(job) 

436 BackgroundApi.execute(task) 

437 

438 

439 class CSVReader: 

440 """~~CSVReader:Feature~~""" 

441 

442 # Column names for the csv file. 

443 # Extra identifier columns are included to tolerate user mistakes such as empty identifiers 

444 # At most 2 identifiers are expected (Full Text URL, DOI), in any order 

445 FIELD_DIR = "dir_name" 

446 FIELDS = (FIELD_DIR, "id_1", "id_2", "id_3", "id_4") 
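# Hypothetical example row: article_1,10.1234/example-doi,https://example.com/full-text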

447 

448 def __init__(self, csv_file): 

449 self.__csv_file = csv_file 

450 

451 def articles_info(self): 

452 """ 

453 Reads the csv file and returns dictionary with first column(directory name) as keys 

454 and remaining columns as array elements. 

455 

456 Ex: {'article_1': ['http://link.springer.com/article/10.1186/s40478-018-0619-9', 

457 '10.1136/bmjophth-2021-000774'], 'article_2': ['10.1136/bmjophth-2021-000775']} 

458 

459 :return: Dictionary with articles info 

460 """ 

461 data = {} 

462 

463 with open(self.__csv_file, mode='r', encoding='utf-8-sig') as file: 

464 reader = csv.DictReader(file, CSVReader.FIELDS) 

465 for row in reader: 

466 dir_name = row[CSVReader.FIELD_DIR] 

467 # Remove first column so it will not be part of iteration later 

468 row.pop(CSVReader.FIELD_DIR) 

469 data[dir_name] = [] 

470 for key in row.keys(): 

471 if row[key]: 

472 data[dir_name].append(row[key]) 

473 return data 

474 

475 

476 class Preservation: 

477 """~~Preservation:Feature~~""" 

478 

479 # Zip file name used when saving the uploaded zip to the temp directory 

480 ARTICLES_ZIP_NAME = "articles.zip" 

481 # Identifier file name 

482 IDENTIFIER_FILE = "identifier.txt" 

483 # CSV file for identifiers 

484 IDENTIFIERS_CSV = "identifiers.csv" 

485 # Temp directory 

486 UPLOAD_DIR = app.config.get("UPLOAD_DIR", ".") 

487 

488 def __init__(self, local_dir, owner): 

489 self.__dir_name = os.path.basename(local_dir) 

490 self.__local_dir = os.path.join(local_dir, "tmp") 

491 self.__preservation_dir = os.path.join(local_dir, self.__dir_name) 
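# Directory layout: <local_dir>/tmp holds the uploaded zip and its extracted contents, while <local_dir>/<dir_name> is where the preservation package structure is assembled.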

492 self.__csv_articles_dict = None 

493 self.__owner = owner 

494 self.upload_filename = None 

495 

496 @property 

497 def dir_name(self): 

498 return self.__dir_name 

499 

500 @property 

501 def preservation_dir(self): 

502 return self.__preservation_dir 

503 

504 def create_local_directories(self): 

505 """ 

506 Create local directories to download the files and 

507 to create preservation package 

508 """ 

509 try: 

510 os.makedirs(self.__local_dir, exist_ok=True) 

511 os.makedirs(self.__preservation_dir, exist_ok=True) 

512 except OSError: 

513 raise PreservationStorageException("Could not create temp directory") 

514 

515 @classmethod 

516 def delete_local_directory(cls, local_dir): 

517 """ 

518 Deletes the directory 

519 """ 

520 if os.path.exists(local_dir): 

521 try: 

522 shutil.rmtree(local_dir) 

523 except Exception: 

524 raise PreservationStorageException("Could not delete Temp directory") 

525 

526 def save_file(self, file): 

527 """ 

528 Save the file on to local directory 

529 :param file: File object 

530 """ 

531 self.create_local_directories() 

532 file_path = os.path.join(self.__local_dir, Preservation.ARTICLES_ZIP_NAME) 

533 try: 

534 file.save(file_path) 

535 except Exception: 

536 raise PreservationStorageException("Could not save file in Temp directory") 

537 

538 def extract_zip_file(self): 

539 """ 

540 Extracts zip file in the Temp directory 

541 """ 

542 file_path = os.path.join(self.__local_dir, Preservation.ARTICLES_ZIP_NAME) 

543 

544 if os.path.exists(file_path): 

545 with ZipFile(file_path, 'r') as zFile: 

546 zFile.extractall(self.__local_dir) 

547 else: 

548 raise PreservationException(f"Could not find zip file at Temp directory {file_path}") 

549 

550 def create_package_structure(self) -> ArticlesList: 

551 """ 

552 Create preservation package 

553 

554 Iterates through the sub directories. 

555 Retrieve article info for each article. 

556 Creates preservation directories 

557 

558 """ 

559 articles_list = ArticlesList() 

560 

561 for dir, subdirs, files in os.walk(self.__local_dir): 
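# Walk the extracted upload: the root itself is skipped, first-level directories are scanned for identifiers.csv, and deeper directories are processed as individual articles.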

562 

563 if dir == self.__local_dir: 

564 continue 

565 

566 app.logger.debug("Directory : " + dir) 

567 app.logger.debug("Sub Directories : " + str(subdirs)) 

568 app.logger.debug("Files : " + str(files)) 

569 

570 # Fetch identifiers at the root directory 

571 if os.path.dirname(dir) == self.__local_dir: 

572 for file in files: 

573 if Preservation.IDENTIFIERS_CSV.lower() == file.lower(): 

574 # Get articles info from csv file 

575 # ~~-> CSVReader:Feature~~ 

576 csv_reader = CSVReader(os.path.join(dir, file)) 

577 self.__csv_articles_dict = csv_reader.articles_info() 

578 break 

579 # process only the directories that have articles 

580 else: 

581 self.__process_article(dir, files, articles_list) 

582 

583 return articles_list 

584 

585 def __process_article(self, dir_path, files, articles_list): 

586 

587 identifiers = None 

588 dir_name = os.path.basename(dir_path) 

589 package = ArticlePackage(dir_path, files) 

590 

591 if not os.path.dirname(dir_path) == self.__local_dir: 

592 if not self.__has_article_files(files): 

593 articles_list.add_no_files_articles(package) 

594 return 

595 

596 # check if the identifier file exists 

597 for file in files: 

598 if Preservation.IDENTIFIER_FILE.lower() == file.lower(): 
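# identifier.txt may list several identifiers (DOI and/or full-text URL), one per line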

599 with open(os.path.join(dir_path, file)) as identifier_file: 

600 identifiers = identifier_file.read().splitlines() 

601 

602 if not identifiers and self.__csv_articles_dict: 

603 if dir_name in self.__csv_articles_dict: 

604 identifiers = self.__csv_articles_dict[dir_name] 

605 

606 if identifiers: 

607 article = self.get_article(identifiers) 

608 

609 if article: 

610 article_data = article.data 

611 

612 is_owner = self.owner_of_article(article) 

613 

614 if is_owner is True: 

615 issn, article_id, metadata_json = self.get_article_info(article_data) 

616 try: 

617 package = ArticlePackage(dir_path, files) 

618 package.issn = issn 

619 package.article_id = article_id 

620 package.metadata = metadata_json 

621 package.package_dir = self.__preservation_dir 

622 

623 package.create_article_bagit_structure() 

624 

625 # Create and update the identifier file for all articles in the journal 

626 with open(os.path.join(self.__preservation_dir, issn + ".txt"), 'a') as identifier_file: 

627 identifier_file.write(os.path.basename(dir_path) + "," + article_id + "," + 

628 ','.join(identifiers) + "\n") 

629 

630 articles_list.add_successful_article(package) 

631 except Exception: 

632 articles_list.add_unbagged_articles(package) 

633 app.logger.exception(f"Error while create article ( {article_id} ) package") 

634 else: 

635 articles_list.add_unowned_articles(package) 

636 

637 else: 

638 # skip the article if not found 

639 app.logger.error(f"Could not retrieve article for identifier(s) {identifiers}") 

640 articles_list.add_not_found_articles(package) 

641 

642 else: 

643 # did not find any identifier for article 

644 articles_list.add_no_identifier_articles(package) 

645 

646 def __has_article_files(self, files): 

647 """ 

648 Checks if any article files are available 

649 :param files: 

650 :return: True if files available otherwise returns False 

651 """ 

652 no_of_files = len(files) 
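# identifier.txt on its own does not count as article content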

653 if Preservation.IDENTIFIER_FILE in files: 

654 if no_of_files > 1: 

655 return True 

656 else: 

657 return False 

658 else: 

659 if no_of_files > 0: 

660 return True 

661 else: 

662 return False 

663 

664 def owner_of_article(self, article): 

665 """ 

666 Checks if the article is owned by the user 

667 :param article: 

668 :return: 

669 """ 

670 articleService = DOAJ.articleService() 

671 account = Account.pull(self.__owner) 

672 is_owner = articleService.has_permissions(account, article, True) 

673 return is_owner 

674 

675 def get_article(self, identifiers): 

676 """ 

677 Checks if the identifier is doi or full text 

678 Pulls article related to the identifier 

679 :param identifiers: 

680 :return: article 

681 """ 

682 article = None 
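# Note: as written, the loop returns after the first identifier is checked (matched or not), so later identifiers are never tried.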

683 for identifier in identifiers: 

684 if DOI_COMPILED.match(identifier): 

685 article = Article.pull_by_key("bibjson.identifier.id", identifier) 

686 elif HTTP_URL_COMPILED.match(identifier): 

687 article = Article.pull_by_key("bibjson.link.url", identifier) 

688 if article: 

689 return article 

690 else: 

691 return None 

692 

693 def get_article_info(self, article_json): 

694 """ 

695 Returns article info 

696 :param article_json: 

697 :return: issn, article id, metadata json 

698 """ 

699 

700 metadata_json = self.get_metadata_json(article_json) 

701 issn = article_json["bibjson"]["journal"]["issns"][0] 

702 article_id = article_json["id"] 

703 

704 return issn, article_id, metadata_json 

705 

706 def get_metadata_json(self, article_json): 

707 """ 

708 Returns metadata of article which is required for preservation 

709 :return: metadata 

710 """ 

711 # Remove unnecessary data 

712 metadata = deepcopy(article_json) 

713 metadata.pop("index") 

714 metadata.pop("admin") 

715 metadata.pop("es_type") 

716 

717 return metadata 

718 

719 

720 class PreservationPackage: 

721 """~~PreservationPackage:Feature~~ 

722 Creates the preservation package and uploads it to the Internet Archive server 

723 """ 

724 

725 def __init__(self, preservation_dir, journal_dir, owner): 

726 self.preservation_dir = preservation_dir 

727 self.journal_dir = journal_dir 

728 self.package_dir = os.path.join(self.preservation_dir, journal_dir) 

729 self.created_time = dates.now_str("%Y-%m-%d-%H-%M-%S") 
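# Package artefacts carry the creation timestamp: <journal_dir>_<created_time>.tar.gz and a matching _<created_time>.txt identifier file.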

730 self.tar_file = self.package_dir + "_" + self.created_time + ".tar.gz" 

731 self.tar_file_name = os.path.basename(self.tar_file) 

732 self.__owner = owner 

733 self.identifier_file = self.package_dir + "_" + self.created_time + ".txt" 

734 try: 

735 # Rename the identifier file to match the tar file 

736 shutil.move(self.package_dir + ".txt", self.identifier_file) 

737 except Exception as e: 

738 app.logger.exception(e) 

739 

740 def create_package(self): 

741 """ 

742 Creates tar file for the package directory 

743 :return: tar file name 

744 """ 

745 try: 

746 with tarfile.open(self.tar_file, "w:gz") as tar: 

747 tar.add(self.package_dir, arcname=os.path.basename(self.package_dir)) 

748 except Exception as exp: 

749 app.logger.exception("Error creating tar file") 

750 raise PreservationException("Error while creating the tar file") 

751 

752 return self.tar_file_name 

753 

754 def upload_package(self, sha256sum, file): 

755 

756 url = app.config.get("PRESERVATION_URL") 

757 username = app.config.get("PRESERVATION_USERNAME") 

758 password = app.config.get("PRESERVATION_PASSWD") 

759 collection_dict = app.config.get("PRESERVATION_COLLECTION") 
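# PRESERVATION_COLLECTION maps each owner to a (collection name, collection id) pair.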

760 params = collection_dict[self.__owner] 

761 collection = params[0] 

762 collection_id = params[1] 

763 

764 file_name = os.path.basename(file) 

765 

766 # payload for upload request 

767 payload = { 

768 'directories': file_name, 

769 'org': 'DOAJ', 

770 'client': 'DOAJ_CLI', 

771 'username': 'doaj_uploader', 

772 'size': '', 

773 'organization': '1', 

774 'orgname': 'DOAJ', 

775 'collection': collection_id, 

776 'collname': collection, 

777 'sha256sum': sha256sum 

778 } 

779 app.logger.info(payload) 

780 

781 headers = {} 

782 # get the file to upload 

783 try: 

784 with open(file, "rb") as f: 

785 files = {'file_field': (file_name, f)} 

786 response = requests.post(url, headers=headers, auth=(username, password), files=files, data=payload) 

787 except (IOError, Exception) as exp: 

788 app.logger.exception("Error opening the tar file") 

789 raise PreservationException("Error Uploading tar file to IA server") 

790 

791 return response 

792 

793 def sha256(self, file): 

794 """ 

795 Creates sha256 hash for the tar file 

796 """ 

797 sha256_hash = hashlib.sha256() 

798 

799 with open(file, "rb") as f: 

800 # Read and update hash string value in blocks of 64K 

801 for byte_block in iter(lambda: f.read(65536), b""): 

802 sha256_hash.update(byte_block) 

803 

804 return sha256_hash.hexdigest() 

805 

806 

807 class FailedReasons: 

808 no_identifier = "no_identifier" 

809 unknown = "unknown" 

810 checksum_doesnot_match = "checksum_doesnot_match" 

811 no_article_found = "no_article_found" 

812 no_valid_article_available = "no_valid_article_available" 

813 tar_filename_doesnot_match = "response_tar_filename_doesnot_match" 

814 error_response = "error_response" 

815 unknown_error_response = "unknown_error_response" 

816 collection_not_available = "collection_not_available"