Coverage for portality / tasks / preservation.py: 62%
472 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-04 09:41 +0100
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-04 09:41 +0100
1import csv
2import hashlib
3import json
4import os
5import shutil
6import tarfile
7from copy import deepcopy
8from zipfile import ZipFile
10import requests
11from bagit import make_bag, BagError
13from portality.background import BackgroundTask, BackgroundApi
14from portality.bll import DOAJ
15from portality.core import app
16from portality.lib import dates
17from portality.models import Account, Article, BackgroundJob, PreservationState
18from portality.regex import DOI_COMPILED, HTTP_URL_COMPILED
19from portality.tasks.helpers import background_helper
20from portality.tasks.redis_huey import events_queue as queue
class PreservationException(Exception):
    """~~PreservationException:Exception~~

    Raised when building or uploading a preservation package fails.
    """
class PreservationStorageException(Exception):
    """Raised when the local working directories for a preservation run
    cannot be created, written to, or removed."""
class ValidationError(Exception):
    """Raised when an ArticlePackage is missing fields required for packaging."""
class ArticlePackage:
    """ ~~ArticlePackage:Feature~~

    Holds the data for a single article to be preserved and builds its
    on-disk BagIt package structure.
    """

    def __init__(self, article_dir, files):
        # issn / article_id / metadata / package_dir are filled in by the
        # caller once the article has been looked up.
        self.issn = None
        self.article_id = None
        self.metadata = None
        self.article_dir = article_dir
        self.article_files = files
        self.package_dir = None
        self.has_error = False
        self.error_details = None

    def create_article_bagit_structure(self):
        """ ~~-> BagIt:Library~~
        Create directory structure for packaging
        Create required additional files
        Create bagit files

        :raises ValidationError: if any required field is not populated
        :raises PreservationException: if BagIt creation fails
        """
        # Validate if required data is available
        self.validate()

        journal_dir = os.path.join(self.package_dir, self.issn)
        os.makedirs(journal_dir, exist_ok=True)

        dest_article_dir = os.path.join(journal_dir, self.article_id)
        if not os.path.exists(dest_article_dir):
            # Create article directory
            os.mkdir(dest_article_dir)

        # Create metadata directory
        metadata_dir = os.path.join(dest_article_dir, "metadata")
        os.makedirs(metadata_dir, exist_ok=True)

        # Copy the files from user uploaded directory to the package,
        # skipping the identifier file which is handled separately
        for file in self.article_files:
            if file != Preservation.IDENTIFIER_FILE:
                src = os.path.join(self.article_dir, file)
                dest = os.path.join(dest_article_dir, file)
                shutil.copy(src, dest)

        # Create metadata file with article information
        with open(os.path.join(metadata_dir, "metadata.json"), 'w') as metadata_file:
            metadata_file.write(json.dumps(self.metadata, indent=4))

        # Create an identifier file with uuid of the article
        with open(os.path.join(metadata_dir, "identifier.txt"), 'w') as identifier_file:
            identifier_file.write(self.article_id)

        try:
            # Bag the article
            make_bag(dest_article_dir, checksums=["sha256"])
        except BagError:
            # BUG FIX: was app.logger.excception, which would raise
            # AttributeError and mask the original BagError
            app.logger.exception(f"Error while creating Bag for article {self.article_id}")
            raise PreservationException("Error while creating Bag")

    def validate(self):
        """Check that every field required for packaging is populated.

        :raises ValidationError: listing the names of the missing fields
        """
        variables_list = []

        if not self.package_dir:
            variables_list.append("package_dir")
        if not self.metadata:
            variables_list.append("metadata")
        if not self.article_dir:
            variables_list.append("article_dir")
        if not self.article_files:
            variables_list.append("article_files")
        if not self.article_id:
            variables_list.append("article_id")
        if not self.issn:
            variables_list.append("issn")

        if variables_list:
            app.logger.debug(f"Validation Values : package_dir {self.package_dir} "
                             f"metadata {self.metadata} article_dir {self.article_dir} "
                             f"article_files {self.article_files} article_id {self.article_id} issn {self.issn}")
            raise ValidationError(f"Required fields cannot be empty {variables_list}")
class ArticlesList:
    """Buckets the articles of a preservation run by processing outcome.

    Each bucket stores the base directory name of the article package.
    ``has_errors`` flips to True as soon as an article lands in one of
    the error buckets.
    """

    def __init__(self):
        self.__successes = []
        self.__unowned = []
        self.__missing_identifier = []
        self.__bagging_failed = []
        self.__missing_articles = []
        self.__empty_dirs = []
        self.__journals = []
        self.has_errors = False

    def add_successful_article(self, article: ArticlePackage):
        self.__successes.append(os.path.basename(article.article_dir))

    def add_uploaded_journal(self, journal_package):
        self.__journals.append(journal_package)

    def add_unowned_articles(self, article: ArticlePackage):
        self.has_errors = True
        self.__unowned.append(os.path.basename(article.article_dir))

    def add_no_identifier_articles(self, article: ArticlePackage):
        self.has_errors = True
        self.__missing_identifier.append(os.path.basename(article.article_dir))

    def add_unbagged_articles(self, article: ArticlePackage):
        self.has_errors = True
        self.__bagging_failed.append(os.path.basename(article.article_dir))

    def add_not_found_articles(self, article: ArticlePackage):
        self.has_errors = True
        self.__missing_articles.append(os.path.basename(article.article_dir))

    def add_no_files_articles(self, article: ArticlePackage):
        # NOTE(review): unlike the other error buckets this does not set
        # has_errors — preserved as-is; confirm whether that is intended.
        self.__empty_dirs.append(os.path.basename(article.article_dir))

    def successful_articles(self):
        return self.__successes

    def unowned_articles(self):
        return self.__unowned

    def no_identifier_articles(self):
        return self.__missing_identifier

    def unbagged_articles(self):
        return self.__bagging_failed

    def not_found_articles(self):
        return self.__missing_articles

    def no_files_articles(self):
        return self.__empty_dirs

    def uploaded_journals(self):
        return self.__journals

    def get_count(self):
        # Total articles seen — uploaded journals are packages, not articles,
        # so they are excluded from the count
        buckets = (self.__successes, self.__unowned, self.__missing_identifier,
                   self.__bagging_failed, self.__missing_articles, self.__empty_dirs)
        return sum(len(bucket) for bucket in buckets)

    def is_partial_success(self):
        # Partial success: at least one article succeeded AND at least one
        # landed in any failure bucket
        failures = (len(self.__bagging_failed) + len(self.__unowned) +
                    len(self.__missing_articles) + len(self.__missing_identifier) +
                    len(self.__empty_dirs))
        return bool(self.__successes) and failures > 0
class PreservationBackgroundTask(BackgroundTask):
    """~~PreservationBackground:Feature~~

    Background task which extracts a user-uploaded zip of article files,
    builds per-journal BagIt packages and uploads them to the
    preservation server, recording progress on a PreservationState model.
    """

    __action__ = "preserve"

    @classmethod
    def prepare(cls, username, **kwargs):
        """
        Create necessary directories and save the file.
        Creates the background job
        :param username: account id of the uploading user
        :param kwargs: expects "upload_file" — the file object to save
        :return: background job
        """

        created_time = dates.now_str("%Y-%m-%d-%H-%M-%S")
        dir_name = username + "-" + created_time
        local_dir = os.path.join(Preservation.UPLOAD_DIR, dir_name)
        file = kwargs.get("upload_file")

        preservation = Preservation(local_dir, username)
        preservation.save_file(file)

        # prepare a job record
        params = {}
        cls.set_param(params, "local_dir", local_dir)
        job = background_helper.create_job(username, cls.__action__,
                                           queue_id=huey_helper.queue_id, params=params)
        return job

    def run(self):
        """Execute the preservation workflow for this background job:
        extract the zip, build the package structure, upload each journal
        package plus its identifier file, then record the final status on
        the PreservationState model identified by the "model_id" param."""

        job = self.background_job

        params = job.params
        local_dir = self.get_param(params, "local_dir")
        model_id = self.get_param(params, "model_id")
        app.logger.debug(f"Local dir {local_dir}")
        app.logger.debug(f"model_id {model_id}")

        preserve_model = PreservationState.pull(model_id)
        preserve_model.background_task_id = job.id
        preserve_model.pending()
        preserve_model.save()

        # ~~-> Preservation:Feature~~
        preserv = Preservation(local_dir, job.user)
        preserv.upload_filename = preserve_model.filename
        try:
            job.add_audit_message("Extract zip file")
            preserv.extract_zip_file()
            app.logger.debug("Extracted zip file")

            job.add_audit_message("Create Package structure")
            articles_list = preserv.create_package_structure()

            app.logger.debug("Created package structure")

            if len(articles_list.successful_articles()) > 0:
                # Each subdirectory is a journal and the directory name is ISSN of the journal
                # iterate through the directories and upload each journal as an individual package
                dirs = [f.name for f in os.scandir(preserv.preservation_dir) if f.is_dir()]
                upload_failed = False
                for sub_dir in dirs:

                    package = PreservationPackage(preserv.preservation_dir, sub_dir, job.user)
                    job.add_audit_message("Create preservation package for " + sub_dir)
                    tar_file = package.create_package()

                    app.logger.debug(f"Created tar file {tar_file}")

                    job.add_audit_message("Create shasum for " + sub_dir)
                    sha256 = package.sha256(package.tar_file)

                    job.add_audit_message("Upload package " + sub_dir)
                    response = package.upload_package(sha256, package.tar_file)
                    app.logger.debug(f"Uploaded. Response{response.text}")

                    job.add_audit_message("Validate response")
                    # validate_response marks preserve_model failed on mismatch
                    self.validate_response(response, tar_file, sha256, preserve_model)

                    # Stop uploading remaining journals on the first failure
                    if preserve_model.status == 'failed':
                        upload_failed = True
                        break
                    else:
                        articles_list.add_uploaded_journal(package.tar_file_name)

                    # Upload the identifier file
                    job.add_audit_message("Create shasum for identifier")
                    sha256 = package.sha256(package.identifier_file)

                    identifier_file_name = os.path.basename(package.identifier_file)
                    job.add_audit_message("Upload identifier file " + identifier_file_name)
                    package.upload_package(sha256, package.identifier_file)
                    articles_list.add_uploaded_journal(identifier_file_name)
                    app.logger.debug(f"Uploaded identifier file " + identifier_file_name)

                if not upload_failed:
                    preserve_model.uploaded_to_ia()

                # Check if the only few articles are successful
                if articles_list.is_partial_success():
                    preserve_model.partial()
                    preserve_model.save()
            else:
                # If no previous errors found, check other failure reasons
                if not preserve_model.error:
                    # Check if any articles available
                    if articles_list.get_count() == 0:
                        preserve_model.failed(FailedReasons.no_article_found)
                        preserve_model.save()
                    # All the articles available are invalid
                    else:
                        preserve_model.failed(FailedReasons.no_valid_article_available)
                        preserve_model.save()

            self.save_articles_list(articles_list, preserve_model)

        # NOTE(review): listing PreservationException alongside Exception is
        # redundant — Exception already catches it
        except (PreservationException, Exception) as exp:
            # ~~-> PreservationException:Exception~~
            preserve_model.failed(str(exp))
            preserve_model.save()
            app.logger.exception("Error at background task")
            raise

    def save_articles_list(self, articles_list: ArticlesList, model: PreservationState):
        """
        Saves articles info to the model
        :param articles_list: articles list
        :param model: model object
        """
        if len(articles_list.successful_articles()) > 0:
            model.successful_articles(articles_list.successful_articles())
        if len(articles_list.not_found_articles()) > 0:
            model.not_found_articles(articles_list.not_found_articles())
        if len(articles_list.no_identifier_articles()) > 0:
            model.no_identifier_articles(articles_list.no_identifier_articles())
            # If every article lacked an identifier the whole run failed
            if len(articles_list.no_identifier_articles()) == articles_list.get_count():
                model.failed(FailedReasons.no_identifier)
        if len(articles_list.unowned_articles()) > 0:
            model.unowned_articles(articles_list.unowned_articles())
        if len(articles_list.unbagged_articles()) > 0:
            model.unbagged_articles(articles_list.unbagged_articles())
        if len(articles_list.no_files_articles()) > 0:
            model.no_files_articles(articles_list.no_files_articles())
        if len(articles_list.uploaded_journals()) > 0:
            model.uploaded_journals(articles_list.uploaded_journals())
        model.save()

    def cleanup(self):
        """
        Cleanup any resources
        :return:
        """
        job = self.background_job
        params = job.params
        local_dir = self.get_param(params, "local_dir")
        Preservation.delete_local_directory(local_dir)

    def validate_response(self, response, tar_file, sha256, model):
        """
        Validate the response from server
        :param response: response object
        :param tar_file: tar file name
        :param sha256: sha256sum value
        :param model: model object to update status
        """
        if response.status_code == 200:
            res_json = json.loads(response.text)
            files = res_json["files"]
            # Success response
            # {"files": [{"name": "name_of_tarball.tar.gz",
            #             "sha256": "decafbad"}]}
            if files:
                # Check if the response is type dict or list
                res_filename = None
                res_shasum = None
                if isinstance(files, dict):
                    res_filename = files["name"]
                    res_shasum = files["sha256"]
                elif isinstance(files, list):
                    if len(files) > 0:
                        res_filename = files[0]["name"]
                        res_shasum = files[0]["sha256"]

                # The echoed filename and checksum must both match what we sent
                if res_filename and res_filename == tar_file:
                    if res_shasum and res_shasum == sha256:
                        app.logger.info("successfully uploaded " + tar_file)
                    else:
                        model.failed(FailedReasons.checksum_doesnot_match)
                else:
                    model.failed(FailedReasons.tar_filename_doesnot_match)

            else:
                # Error response
                # {"result": "ERROR","manifest_type": "BagIt",
                #  "manifests": [
                #     {"id": "033168cd016a49eb8c3097d800f1b85f",
                #      "result": "SUCCESS"},
                #     {"id": "00003741594643f4996e2555a01e03c7",
                #      "result": "ERROR",
                #      "errors": [
                #         "missing_files": [],
                #         "mismatch_hashes": [{
                #             "file": "path/to/file",
                #             "expected": "decafbad",
                #             "actual": "deadbeaf"}],
                #         "manifest_parsing_errors": [
                #             "some weird error"]]}]}
                # NOTE(review): res_json["result"] will raise KeyError if the
                # key is absent — confirm the server always sends it here
                result = res_json["result"]
                if result and result == "ERROR":
                    error_str = FailedReasons.error_response
                else:
                    error_str = FailedReasons.unknown_error_response

                app.logger.error(error_str)
                model.failed(error_str)

            model.save()
        else:
            # Non-200: record the raw server response as the failure reason
            app.logger.error(f"Upload failed for {tar_file}. Reason - {response.text}")
            model.failed(response.text)
            model.save()

    @classmethod
    def submit(cls, background_job):
        """
        Submit Background job"""
        background_job.save(blocking=True)
        preserve.schedule(args=(background_job.id,), delay=app.config.get('HUEY_ASYNC_DELAY', 10))
huey_helper = PreservationBackgroundTask.create_huey_helper(queue)


@huey_helper.register_execute(is_load_config=True)
def preserve(job_id):
    """~~-> PreservationBackgroundTask:Queue

    Huey entry point: load the background job by id and hand the task to
    the background API for execution.
    """
    BackgroundApi.execute(PreservationBackgroundTask(BackgroundJob.pull(job_id)))
class CSVReader:
    """~~CSVReader:Feature~~

    Thin wrapper over csv.DictReader that maps each article directory
    name to the list of identifiers found on its row.
    """

    # column names for csv file.
    # Given more identifiers just to handle any mistakes by user like empty identifiers
    # Max expected identifier are 2 (Full Text URL, DOI) in any order
    FIELD_DIR = "dir_name"
    FIELDS = (FIELD_DIR, "id_1", "id_2", "id_3", "id_4")

    def __init__(self, csv_file):
        self.__csv_file = csv_file

    def articles_info(self):
        """
        Reads the csv file and returns dictionary with first column(directory name) as keys
        and remaining columns as array elements.

        Ex: {'article_1': ['http://link.springer.com/article/10.1186/s40478-018-0619-9',
        '10.1136/bmjophth-2021-000774'], 'article_2': ['10.1136/bmjophth-2021-000775']}

        :return: Dictionary with articles info
        """
        articles = {}

        # utf-8-sig strips a BOM if the file was exported from a spreadsheet
        with open(self.__csv_file, mode='r', encoding='utf-8-sig') as csv_handle:
            for row in csv.DictReader(csv_handle, CSVReader.FIELDS):
                # Drop the directory column; every remaining non-empty
                # cell is an identifier
                directory = row.pop(CSVReader.FIELD_DIR)
                articles[directory] = [value for value in row.values() if value]
        return articles
class Preservation:
    """~~Preservation:Feature~~

    Manages the local working area for one preservation run: saving the
    uploaded zip, extracting it, and building the per-article package
    structure that PreservationPackage later tars and uploads.
    """

    # Zip file name to download the zip file to temp directory
    ARTICLES_ZIP_NAME = "articles.zip"
    # Identifier file name
    IDENTIFIER_FILE = "identifier.txt"
    # CSV file for identifiers
    IDENTIFIERS_CSV = "identifiers.csv"
    # Temp directory
    UPLOAD_DIR = app.config.get("UPLOAD_DIR", ".")

    def __init__(self, local_dir, owner):
        # "tmp" holds the raw upload/extraction; a sibling directory named
        # after local_dir's basename holds the built packages
        self.__dir_name = os.path.basename(local_dir)
        self.__local_dir = os.path.join(local_dir, "tmp")
        self.__preservation_dir = os.path.join(local_dir, self.__dir_name)
        self.__csv_articles_dict = None
        self.__owner = owner
        self.upload_filename = None

    @property
    def dir_name(self):
        # Basename of the working directory (username-timestamp, per prepare())
        return self.__dir_name

    @property
    def preservation_dir(self):
        return self.__preservation_dir

    def create_local_directories(self):
        """
        Create local directories to download the files and
        to create preservation package
        :raises PreservationStorageException: if directory creation fails
        """
        try:
            os.makedirs(self.__local_dir, exist_ok=True)
            os.makedirs(self.__preservation_dir, exist_ok=True)
        except OSError:
            raise PreservationStorageException("Could not create temp directory")

    @classmethod
    def delete_local_directory(cls, local_dir):
        """
        Deletes the directory
        :raises PreservationStorageException: if the tree cannot be removed
        """
        if os.path.exists(local_dir):
            try:
                shutil.rmtree(local_dir)
            except Exception:
                raise PreservationStorageException("Could not delete Temp directory")

    def save_file(self, file):
        """
        Save the file on to local directory
        :param file: File object
        :raises PreservationStorageException: if the file cannot be saved
        """
        self.create_local_directories()
        file_path = os.path.join(self.__local_dir, Preservation.ARTICLES_ZIP_NAME)
        try:
            file.save(file_path)
        except Exception:
            raise PreservationStorageException("Could not save file in Temp directory")

    def extract_zip_file(self):
        """
        Extracts zip file in the Temp directory
        :raises PreservationException: if the saved zip file is missing
        """
        file_path = os.path.join(self.__local_dir, Preservation.ARTICLES_ZIP_NAME)

        if os.path.exists(file_path):
            with ZipFile(file_path, 'r') as zFile:
                zFile.extractall(self.__local_dir)
        else:
            raise PreservationException(f"Could not find zip file at Temp directory {file_path}")

    def create_package_structure(self) -> ArticlesList:
        """
        Create preservation package

        Iterates through the sub directories.
        Retrieve article info for each article.
        Creates preservation directories

        :return: ArticlesList summarising the outcome for every article seen
        """
        articles_list = ArticlesList()

        for dir, subdirs, files in os.walk(self.__local_dir):

            # skip the extraction root itself
            if dir == self.__local_dir:
                continue

            app.logger.debug("Directory : " + dir)
            app.logger.debug("Sub Directories : " + str(subdirs))
            app.logger.debug("Files : " + str(files))

            # Fetch identifiers at the root directory
            if os.path.dirname(dir) == self.__local_dir:
                for file in files:
                    if Preservation.IDENTIFIERS_CSV.lower() == file.lower():
                        # Get articles info from csv file
                        # ~~-> CSVReader:Feature~~
                        csv_reader = CSVReader(os.path.join(dir, file))
                        self.__csv_articles_dict = csv_reader.articles_info()
                        break
            # process only the directories that has articles
            else:
                self.__process_article(dir, files, articles_list)

        return articles_list

    def __process_article(self, dir_path, files, articles_list):
        """Resolve one article directory to a DOAJ article, verify ownership
        and build its BagIt package, recording the outcome in articles_list.

        :param dir_path: absolute path of the article directory
        :param files: file names within the directory
        :param articles_list: ArticlesList collecting per-article outcomes
        """
        identifiers = None
        dir_name = os.path.basename(dir_path)
        package = ArticlePackage(dir_path, files)

        if not os.path.dirname(dir_path) == self.__local_dir:
            # A directory with no article content (at most an identifier
            # file) is recorded and skipped
            if not self.__has_article_files(files):
                articles_list.add_no_files_articles(package)
                return

        # check if identifier file exist
        for file in files:
            if Preservation.IDENTIFIER_FILE.lower() == file.lower():
                with open(os.path.join(dir_path, file)) as identifier_file:
                    identifiers = identifier_file.read().splitlines()

        # Fall back to the identifiers CSV parsed at the root level
        if not identifiers and self.__csv_articles_dict:
            if dir_name in self.__csv_articles_dict:
                identifiers = self.__csv_articles_dict[dir_name]

        if identifiers:
            article = self.get_article(identifiers)

            if article:
                article_data = article.data

                is_owner = self.owner_of_article(article)

                # NOTE(review): has_permissions may evidently return a
                # non-bool; only an exact boolean True counts as owned
                if isinstance(is_owner, bool) and is_owner == True:
                    issn, article_id, metadata_json = self.get_article_info(article_data)
                    try:
                        package = ArticlePackage(dir_path, files)
                        package.issn = issn
                        package.article_id = article_id
                        package.metadata = metadata_json
                        package.package_dir = self.__preservation_dir

                        package.create_article_bagit_structure()

                        # Create and update the identifier file for all articles in the journal
                        with open(os.path.join(self.__preservation_dir, issn + ".txt"), 'a') as identifier_file:
                            identifier_file.write(os.path.basename(dir_path) + "," + article_id + "," +
                                                  ','.join(identifiers) + "\n")

                        articles_list.add_successful_article(package)
                    except Exception:
                        articles_list.add_unbagged_articles(package)
                        app.logger.exception(f"Error while create article ( {article_id} ) package")
                else:
                    articles_list.add_unowned_articles(package)

            else:
                # skip the article if not found
                app.logger.error(f"Could not retrieve article for identifier(s) {identifiers}")
                articles_list.add_not_found_articles(package)

        else:
            # did not find any identifier for article
            articles_list.add_no_identifier_articles(package)

    def __has_article_files(self, files):
        """
        Checks if any article files available
        :param files:
        :return: True if files available otherwise returns False
        """
        # The identifier file alone does not count as article content
        no_of_files = len(files)
        if Preservation.IDENTIFIER_FILE in files:
            if no_of_files > 1:
                return True
            else:
                return False
        else:
            if no_of_files > 0:
                return True
            else:
                return False

    def owner_of_article(self, article):
        """
        Checks if the article is owned by the user
        :param article:
        :return: result of articleService.has_permissions for this owner
        """
        articleService = DOAJ.articleService()
        account = Account.pull(self.__owner)
        is_owner = articleService.has_permissions(account, article, True)
        return is_owner

    def get_article(self, identifiers):
        """
        Checks if the identifier is doi or full text
        Pulls article related to the identifier
        :param identifiers:
        :return: article
        """
        article = None
        for identifier in identifiers:
            if DOI_COMPILED.match(identifier):
                article = Article.pull_by_key("bibjson.identifier.id", identifier)
            elif HTTP_URL_COMPILED.match(identifier):
                article = Article.pull_by_key("bibjson.link.url", identifier)
            # NOTE(review): this returns inside the loop, so only the first
            # identifier is ever tried — confirm whether later identifiers
            # were meant to be fallbacks
            if article:
                return article
            else:
                return None

    def get_article_info(self, article_json):
        """
        Returns article info
        :param article_json:
        :return: issn, article id, metadata json
        """

        metadata_json = self.get_metadata_json(article_json)
        # First ISSN of the journal is used as the package directory name
        issn = article_json["bibjson"]["journal"]["issns"][0]
        article_id = article_json["id"]

        return issn, article_id, metadata_json

    def get_metadata_json(self, article_json):
        """
        Returns metadata of article which is required for preservation
        :return: metadata
        """
        # Remove unnecessary data
        # NOTE(review): pop without a default raises KeyError if any of
        # these keys is absent — confirm article documents always have them
        metadata = deepcopy(article_json)
        metadata.pop("index")
        metadata.pop("admin")
        metadata.pop("es_type")

        return metadata
class PreservationPackage:
    """~~PreservationPackage:Feature~~
    Creates preservation package and upload to Internet Server
    """

    def __init__(self, preservation_dir, journal_dir, owner):
        """
        :param preservation_dir: root directory containing the journal sub directories
        :param journal_dir: name of the journal sub directory to package
        :param owner: account id used to select the preservation collection
        """
        self.preservation_dir = preservation_dir
        self.journal_dir = journal_dir
        self.package_dir = os.path.join(self.preservation_dir, journal_dir)
        self.created_time = dates.now_str("%Y-%m-%d-%H-%M-%S")
        self.tar_file = self.package_dir + "_" + self.created_time + ".tar.gz"
        self.tar_file_name = os.path.basename(self.tar_file)
        self.__owner = owner
        self.identifier_file = self.package_dir + "_" + self.created_time + ".txt"
        try:
            # Rename the identifier file to match the tar file
            shutil.move(self.package_dir + ".txt", self.identifier_file)
        except Exception as e:
            # Best effort: a missing identifier file is logged, not fatal
            app.logger.exception(e)

    def create_package(self):
        """
        Creates tar file for the package directory
        :return: tar file name
        :raises PreservationException: if the tar file cannot be created
        """
        try:
            with tarfile.open(self.tar_file, "w:gz") as tar:
                tar.add(self.package_dir, arcname=os.path.basename(self.package_dir))
        except Exception as exp:
            app.logger.exception("Error creating tar file")
            # chain the original error so the root cause is preserved
            raise PreservationException("Error while creating the tar file") from exp

        return self.tar_file_name

    def upload_package(self, sha256sum, file):
        """
        Upload the given file to the preservation server.

        :param sha256sum: sha256 checksum of the file
        :param file: path of the file to upload
        :return: requests response object from the server
        :raises PreservationException: if the file cannot be read or uploaded
        """
        url = app.config.get("PRESERVATION_URL")
        username = app.config.get("PRESERVATION_USERNAME")
        password = app.config.get("PRESERVATION_PASSWD")
        collection_dict = app.config.get("PRESERVATION_COLLECTION")
        params = collection_dict[self.__owner]
        collection = params[0]
        collection_id = params[1]

        file_name = os.path.basename(file)

        # payload for upload request
        payload = {
            'directories': file_name,
            'org': 'DOAJ',
            'client': 'DOAJ_CLI',
            'username': 'doaj_uploader',
            'size': '',
            'organization': '1',
            'orgname': 'DOAJ',
            'collection': collection_id,
            'collname': collection,
            'sha256sum': sha256sum
        }
        app.logger.info(payload)

        headers = {}
        # get the file to upload
        try:
            with open(file, "rb") as f:
                files = {'file_field': (file_name, f)}
                response = requests.post(url, headers=headers, auth=(username, password), files=files, data=payload)
        except Exception as exp:
            # was `except (IOError, Exception)` — IOError is a subclass of
            # Exception, so the extra name was redundant; also chain the cause
            app.logger.exception("Error opening the tar file")
            raise PreservationException("Error Uploading tar file to IA server") from exp

        return response

    def sha256(self, file):
        """
        Creates sha256 hash for the tar file

        :param file: path of the file to hash
        :return: hex digest string of the file contents
        """
        sha256_hash = hashlib.sha256()

        with open(file, "rb") as f:
            # Read and update hash string value in blocks of 64K
            for byte_block in iter(lambda: f.read(65536), b""):
                sha256_hash.update(byte_block)

        return sha256_hash.hexdigest()
class FailedReasons:
    """Machine-readable failure-reason codes recorded on the
    PreservationState model when a preservation run fails."""
    # all processed articles lacked an identifier (see save_articles_list)
    no_identifier = "no_identifier"
    # not referenced in this file's visible code — presumably used elsewhere
    unknown = "unknown"
    # server-echoed sha256 did not match the uploaded checksum (validate_response)
    checksum_doesnot_match = "checksum_doesnot_match"
    # no article directories were found in the upload (run)
    no_article_found = "no_article_found"
    # articles were found but none could be packaged successfully (run)
    no_valid_article_available = "no_valid_article_available"
    # server-echoed filename did not match the uploaded tar file (validate_response)
    tar_filename_doesnot_match = "response_tar_filename_doesnot_match"
    # server replied with result == "ERROR" (validate_response)
    error_response = "error_response"
    # server replied with an unrecognised error body (validate_response)
    unknown_error_response = "unknown_error_response"
    # not referenced in this file's visible code — presumably used elsewhere
    collection_not_available = "collection_not_available"