From bc27a394c6aae36fc3907360f86ae712eb4fbe68 Mon Sep 17 00:00:00 2001 From: Liam Lloyd-Tucker Date: Wed, 17 Jun 2026 10:11:59 -0700 Subject: [PATCH] Update supported file list to match new Archivematica processing This commit updates the supported file list that these tests use to check conversions against. It removes the intermediate file types that were generated by the legacy process but are not generated by the new Archivematica pipeline. It also removes some file types that we don't officially support from the list, increases the time the tests wait for files to finish processing, and changes how the tests determine whether a file is done processing to reflect the behavior of the new pipeline. --- permanent_upload/__main__.py | 8 +++-- .../data/supported_file_types.csv | 31 ++++++++----------- permanent_upload/permanent.py | 18 ++++++++--- permanent_upload/validation.py | 21 ++++++++++--- 4 files changed, 49 insertions(+), 29 deletions(-) diff --git a/permanent_upload/__main__.py b/permanent_upload/__main__.py index e6fc664..7b21d9f 100644 --- a/permanent_upload/__main__.py +++ b/permanent_upload/__main__.py @@ -9,7 +9,7 @@ from tabulate import tabulate from .permanent import PermanentAPI -from .validation import validate_supported_types +from .validation import load_expected_formats, validate_supported_types def get_file_list(path): @@ -46,7 +46,7 @@ def main(environment, path): email = f"engineers+prmnttstr{unix_timestamp}@permanent.org" print("User account email:", email) password = "".join(random.choice(string.ascii_letters) for i in range(12)) - timeout = 60 + timeout = 300 print(f"Current timeout is {timeout} seconds") api = PermanentAPI( @@ -61,10 +61,13 @@ def main(environment, path): api.logged_in() parent_folder_id, parent_folder_link_id = api.get_folder_info() files = get_file_list(path) + formats_by_extension = load_expected_formats() results = [] headers = ["File Name", "Type", "Status", "File Formats", "Time"] for f in files: 
logging.info("Processing %s", f) + extension = os.path.splitext(f)[1].lstrip(".") + expected_formats = formats_by_extension.get(extension, set()) results.append( api.file_upload( f, @@ -73,6 +76,7 @@ def main(environment, path): archive["archiveId"], login_result.response["SimpleVO"]["value"], timeout, + expected_formats, ) ) print(tabulate(results, headers, tablefmt="github")) diff --git a/permanent_upload/data/supported_file_types.csv b/permanent_upload/data/supported_file_types.csv index 1f97a57..2909fd4 100644 --- a/permanent_upload/data/supported_file_types.csv +++ b/permanent_upload/data/supported_file_types.csv @@ -16,30 +16,25 @@ heic,image,ok,"heic,jpg" jpg,image,ok,"jpg" tif,image,ok,"tif,jpg" jpeg,image,ok,"jpeg,jpg" -html,document,ok,"html,pdfa,txt" -xlsx,spreadsheet,ok,"xlsx,ods,pdfa,csv" -csv,spreadsheet,ok,"csv" -ods,spreadsheet,ok,"ods,pdfa,csv" -htm,document,ok,"htm,pdfa,txt" -xls,spreadsheet,ok,"xls,ods,pdfa,csv" -key,presentation,ok,"key,odp,pdfa,txt" -pptx,presentation,ok,"pptx,odp,pdfa,txt" -odp,presentation,ok,"odp,pdfa,txt" -ppt,presentation,ok,"ppt,odp,pdfa,txt" +xlsx,spreadsheet,ok,"xlsx,pdf" +ods,spreadsheet,ok,"ods,pdf" +xls,spreadsheet,ok,"xls,pdf" +key,presentation,ok,"key,pdf" +pptx,presentation,ok,"pptx,pdf" +odp,presentation,ok,"odp,pdf" +ppt,presentation,ok,"ppt,pdf" txt,document,ok,"txt" -docx,document,ok,"docx,odt,pdfa,txt" -rtf,document,ok,"rtf,odt,pdfa,txt" -eml,document,ok,"eml,pdfa,txt" -odt,document,ok,"odt,pdfa,txt" -pdf,pdf,ok,"pdf,pdfa,txt" +docx,document,ok,"docx,pdf" +rtf,document,ok,"rtf,pdf" +eml,document,ok,"eml,pdf" +odt,document,ok,"odt,pdf" +pdf,pdf,ok,"pdf" zip,archive,ok,"zip" -doc,document,ok,"doc,odt,pdfa,txt" +doc,document,ok,"doc,pdf" 3gp,video,ok,"3gp,mp4" webm,video,ok,"webm,mp4" mkv,video,ok,"mkv,mp4" avi,video,ok,"avi,mp4" ogv,video,ok,"ogv,mp4" -m4v,video,ok,"m4v,mp4" -wmv,video,manual_review,"wmv,mp4" mov,video,ok,"mov,mp4" mp4,video,ok,"mp4" diff --git a/permanent_upload/permanent.py 
b/permanent_upload/permanent.py index 342ca7d..ff7b628 100644 --- a/permanent_upload/permanent.py +++ b/permanent_upload/permanent.py @@ -76,16 +76,25 @@ def __init__(self, base_url=None): raise Exception("Need `base_url` upon object creation") self.base_url = base_url - def _measure_post_upload_processing(self, record_vo, timeout): + def _measure_post_upload_processing(self, record_vo, expected_formats, timeout): record_id = record_vo["recordId"] archive_number = record_vo["archiveNbr"] i = 0 - status = "" record = "" + status = "" + processing_complete = False - while i < timeout and status != "status.generic.ok": + while i < timeout and ( + not processing_complete or status != "status.generic.ok" + ): record = self._get_record(record_id, archive_number) + actual_formats = { + vo["type"].split(".")[-1] for vo in (record.get("FileVOs") or []) + } + processing_complete = bool(expected_formats) and expected_formats.issubset( + actual_formats + ) status = record["status"] time.sleep(1) i += 1 @@ -135,6 +144,7 @@ def file_upload( archive_id, auth_token, timeout, + expected_formats, ): """ Perform the file upload requests, and then poll for status until the timeout. 
@@ -166,7 +176,7 @@ def file_upload( created_record_vo = self._register_record(auth_token, request) attempts, processed_record = self._measure_post_upload_processing( - created_record_vo, timeout + created_record_vo, expected_formats, timeout ) result = [ filename, diff --git a/permanent_upload/validation.py b/permanent_upload/validation.py index fdc8b4c..4a9c1bd 100644 --- a/permanent_upload/validation.py +++ b/permanent_upload/validation.py @@ -2,15 +2,26 @@ import os -def validate_supported_types(results, data_file="data/supported_file_types.csv"): - validation_dataset = {} +def _load_validation_dataset(data_file="data/supported_file_types.csv"): + dataset = {} data_file_path = os.path.join( os.path.dirname(os.path.realpath(__file__)), data_file ) with open(data_file_path, "r") as csvfile: - validation_reader = csv.DictReader(csvfile) - for row in validation_reader: - validation_dataset[row["file_extension"]] = row + for row in csv.DictReader(csvfile): + dataset[row["file_extension"]] = row + return dataset + + +def load_expected_formats(data_file="data/supported_file_types.csv"): + return { + ext: set(row["conversions"].split(",")) + for ext, row in _load_validation_dataset(data_file).items() + } + + +def validate_supported_types(results, data_file="data/supported_file_types.csv"): + validation_dataset = _load_validation_dataset(data_file) for result in results: extension = result[0].split(".")[-1] assert validation_dataset[extension]