Wait for the expected derived formats before treating an upload as processed.

Summary (leading text like this is skipped by patch tools, which start
reading at the first file header):

* __main__.py: raise the polling timeout from 60 to 300 seconds, load the
  per-extension expected formats once, and pass the set for each file's
  extension through to api.file_upload.
* data/supported_file_types.csv: the expected conversions for spreadsheet,
  presentation and document inputs become "<ext>,pdf" (plain "pdf" for pdf
  inputs); the html, htm, csv, m4v and wmv rows are removed.
* permanent.py: _measure_post_upload_processing keeps polling until the
  record status is "status.generic.ok" and every expected format matches the
  last dot-separated component of some FileVO "type", or until the timeout.
* validation.py: CSV loading moves into _load_validation_dataset, and
  load_expected_formats maps each extension to its set of conversions.

NOTE(review): an extension missing from the CSV yields an empty expected set,
  so processing_complete can never become true and that file is polled for
  the full timeout. Confirm this is intended.
NOTE(review): the extension lookup is case-sensitive while the CSV keys shown
  are lowercase, so a name such as "photo.JPG" would presumably get the empty
  set as well. Verify against real inputs.
NOTE(review): validate_supported_types indexes validation_dataset[extension]
  directly, so files with one of the removed extensions would raise KeyError
  there. Confirm those types are no longer uploaded.
NOTE(review): line breaks, indentation and blank context lines were
  reconstructed from the hunk line counts and Python syntax. Verify against
  the target tree; whitespace-tolerant apply options may be needed.

diff --git a/permanent_upload/__main__.py b/permanent_upload/__main__.py
index e6fc664..7b21d9f 100644
--- a/permanent_upload/__main__.py
+++ b/permanent_upload/__main__.py
@@ -9,7 +9,7 @@
 from tabulate import tabulate
 
 from .permanent import PermanentAPI
-from .validation import validate_supported_types
+from .validation import load_expected_formats, validate_supported_types
 
 
 def get_file_list(path):
@@ -46,7 +46,7 @@ def main(environment, path):
     email = f"engineers+prmnttstr{unix_timestamp}@permanent.org"
     print("User account email:", email)
     password = "".join(random.choice(string.ascii_letters) for i in range(12))
-    timeout = 60
+    timeout = 300
     print(f"Current timeout is {timeout} seconds")
 
     api = PermanentAPI(
@@ -61,10 +61,13 @@ def main(environment, path):
     api.logged_in()
     parent_folder_id, parent_folder_link_id = api.get_folder_info()
     files = get_file_list(path)
+    formats_by_extension = load_expected_formats()
     results = []
     headers = ["File Name", "Type", "Status", "File Formats", "Time"]
     for f in files:
         logging.info("Processing %s", f)
+        extension = os.path.splitext(f)[1].lstrip(".")
+        expected_formats = formats_by_extension.get(extension, set())
         results.append(
             api.file_upload(
                 f,
@@ -73,6 +76,7 @@ def main(environment, path):
                 archive["archiveId"],
                 login_result.response["SimpleVO"]["value"],
                 timeout,
+                expected_formats,
             )
         )
     print(tabulate(results, headers, tablefmt="github"))
diff --git a/permanent_upload/data/supported_file_types.csv b/permanent_upload/data/supported_file_types.csv
index 1f97a57..2909fd4 100644
--- a/permanent_upload/data/supported_file_types.csv
+++ b/permanent_upload/data/supported_file_types.csv
@@ -16,30 +16,25 @@ heic,image,ok,"heic,jpg"
 jpg,image,ok,"jpg"
 tif,image,ok,"tif,jpg"
 jpeg,image,ok,"jpeg,jpg"
-html,document,ok,"html,pdfa,txt"
-xlsx,spreadsheet,ok,"xlsx,ods,pdfa,csv"
-csv,spreadsheet,ok,"csv"
-ods,spreadsheet,ok,"ods,pdfa,csv"
-htm,document,ok,"htm,pdfa,txt"
-xls,spreadsheet,ok,"xls,ods,pdfa,csv"
-key,presentation,ok,"key,odp,pdfa,txt"
-pptx,presentation,ok,"pptx,odp,pdfa,txt"
-odp,presentation,ok,"odp,pdfa,txt"
-ppt,presentation,ok,"ppt,odp,pdfa,txt"
+xlsx,spreadsheet,ok,"xlsx,pdf"
+ods,spreadsheet,ok,"ods,pdf"
+xls,spreadsheet,ok,"xls,pdf"
+key,presentation,ok,"key,pdf"
+pptx,presentation,ok,"pptx,pdf"
+odp,presentation,ok,"odp,pdf"
+ppt,presentation,ok,"ppt,pdf"
 txt,document,ok,"txt"
-docx,document,ok,"docx,odt,pdfa,txt"
-rtf,document,ok,"rtf,odt,pdfa,txt"
-eml,document,ok,"eml,pdfa,txt"
-odt,document,ok,"odt,pdfa,txt"
-pdf,pdf,ok,"pdf,pdfa,txt"
+docx,document,ok,"docx,pdf"
+rtf,document,ok,"rtf,pdf"
+eml,document,ok,"eml,pdf"
+odt,document,ok,"odt,pdf"
+pdf,pdf,ok,"pdf"
 zip,archive,ok,"zip"
-doc,document,ok,"doc,odt,pdfa,txt"
+doc,document,ok,"doc,pdf"
 3gp,video,ok,"3gp,mp4"
 webm,video,ok,"webm,mp4"
 mkv,video,ok,"mkv,mp4"
 avi,video,ok,"avi,mp4"
 ogv,video,ok,"ogv,mp4"
-m4v,video,ok,"m4v,mp4"
-wmv,video,manual_review,"wmv,mp4"
 mov,video,ok,"mov,mp4"
 mp4,video,ok,"mp4"
diff --git a/permanent_upload/permanent.py b/permanent_upload/permanent.py
index 342ca7d..ff7b628 100644
--- a/permanent_upload/permanent.py
+++ b/permanent_upload/permanent.py
@@ -76,16 +76,25 @@ def __init__(self, base_url=None):
             raise Exception("Need `base_url` upon object creation")
         self.base_url = base_url
 
-    def _measure_post_upload_processing(self, record_vo, timeout):
+    def _measure_post_upload_processing(self, record_vo, expected_formats, timeout):
         record_id = record_vo["recordId"]
         archive_number = record_vo["archiveNbr"]
 
         i = 0
-        status = ""
         record = ""
+        status = ""
+        processing_complete = False
 
-        while i < timeout and status != "status.generic.ok":
+        while i < timeout and (
+            not processing_complete or status != "status.generic.ok"
+        ):
             record = self._get_record(record_id, archive_number)
+            actual_formats = {
+                vo["type"].split(".")[-1] for vo in (record.get("FileVOs") or [])
+            }
+            processing_complete = bool(expected_formats) and expected_formats.issubset(
+                actual_formats
+            )
             status = record["status"]
             time.sleep(1)
             i += 1
@@ -135,6 +144,7 @@ def file_upload(
         archive_id,
         auth_token,
         timeout,
+        expected_formats,
     ):
         """
         Perform the file upload requests, and then poll for status until the timeout.
@@ -166,7 +176,7 @@ def file_upload(
         created_record_vo = self._register_record(auth_token, request)
 
         attempts, processed_record = self._measure_post_upload_processing(
-            created_record_vo, timeout
+            created_record_vo, expected_formats, timeout
         )
         result = [
             filename,
diff --git a/permanent_upload/validation.py b/permanent_upload/validation.py
index fdc8b4c..4a9c1bd 100644
--- a/permanent_upload/validation.py
+++ b/permanent_upload/validation.py
@@ -2,15 +2,26 @@
 import os
 
 
-def validate_supported_types(results, data_file="data/supported_file_types.csv"):
-    validation_dataset = {}
+def _load_validation_dataset(data_file="data/supported_file_types.csv"):
+    dataset = {}
     data_file_path = os.path.join(
         os.path.dirname(os.path.realpath(__file__)), data_file
     )
     with open(data_file_path, "r") as csvfile:
-        validation_reader = csv.DictReader(csvfile)
-        for row in validation_reader:
-            validation_dataset[row["file_extension"]] = row
+        for row in csv.DictReader(csvfile):
+            dataset[row["file_extension"]] = row
+    return dataset
+
+
+def load_expected_formats(data_file="data/supported_file_types.csv"):
+    return {
+        ext: set(row["conversions"].split(","))
+        for ext, row in _load_validation_dataset(data_file).items()
+    }
+
+
+def validate_supported_types(results, data_file="data/supported_file_types.csv"):
+    validation_dataset = _load_validation_dataset(data_file)
     for result in results:
         extension = result[0].split(".")[-1]
         assert validation_dataset[extension]