From bc27a394c6aae36fc3907360f86ae712eb4fbe68 Mon Sep 17 00:00:00 2001
From: Liam Lloyd-Tucker <liam@permanent.org>
Date: Wed, 17 Jun 2026 10:11:59 -0700
Subject: [PATCH] Update supported file list to match new Archivematica
 processing

This commit updates the supported file list that these tests use to
check conversions against. It removes the intermediate file types that
were generated by the legacy process but are not generated by the new
Archivematica pipeline. It also removes some file types that we don't
officially support from the list, increases the time the tests wait
for files to finish processing, and changes how the tests determine
whether a file is done processing to reflect the behavior of the new
pipeline.
---
 permanent_upload/__main__.py                  |  8 +++--
 .../data/supported_file_types.csv             | 31 ++++++++-----------
 permanent_upload/permanent.py                 | 18 ++++++++---
 permanent_upload/validation.py                | 21 ++++++++++---
 4 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/permanent_upload/__main__.py b/permanent_upload/__main__.py
index e6fc664..7b21d9f 100644
--- a/permanent_upload/__main__.py
+++ b/permanent_upload/__main__.py
@@ -9,7 +9,7 @@
 from tabulate import tabulate
 
 from .permanent import PermanentAPI
-from .validation import validate_supported_types
+from .validation import load_expected_formats, validate_supported_types
 
 
 def get_file_list(path):
@@ -46,7 +46,7 @@ def main(environment, path):
     email = f"engineers+prmnttstr{unix_timestamp}@permanent.org"
     print("User account email:", email)
     password = "".join(random.choice(string.ascii_letters) for i in range(12))
-    timeout = 60
+    timeout = 300
     print(f"Current timeout is {timeout} seconds")
 
     api = PermanentAPI(
@@ -61,10 +61,13 @@ def main(environment, path):
     api.logged_in()
     parent_folder_id, parent_folder_link_id = api.get_folder_info()
     files = get_file_list(path)
+    formats_by_extension = load_expected_formats()
     results = []
     headers = ["File Name", "Type", "Status", "File Formats", "Time"]
     for f in files:
         logging.info("Processing %s", f)
+        extension = os.path.splitext(f)[1].lstrip(".")
+        expected_formats = formats_by_extension.get(extension, set())
         results.append(
             api.file_upload(
                 f,
@@ -73,6 +76,7 @@ def main(environment, path):
                 archive["archiveId"],
                 login_result.response["SimpleVO"]["value"],
                 timeout,
+                expected_formats,
             )
         )
     print(tabulate(results, headers, tablefmt="github"))
diff --git a/permanent_upload/data/supported_file_types.csv b/permanent_upload/data/supported_file_types.csv
index 1f97a57..2909fd4 100644
--- a/permanent_upload/data/supported_file_types.csv
+++ b/permanent_upload/data/supported_file_types.csv
@@ -16,30 +16,25 @@ heic,image,ok,"heic,jpg"
 jpg,image,ok,"jpg"
 tif,image,ok,"tif,jpg"
 jpeg,image,ok,"jpeg,jpg"
-html,document,ok,"html,pdfa,txt"
-xlsx,spreadsheet,ok,"xlsx,ods,pdfa,csv"
-csv,spreadsheet,ok,"csv"
-ods,spreadsheet,ok,"ods,pdfa,csv"
-htm,document,ok,"htm,pdfa,txt"
-xls,spreadsheet,ok,"xls,ods,pdfa,csv"
-key,presentation,ok,"key,odp,pdfa,txt"
-pptx,presentation,ok,"pptx,odp,pdfa,txt"
-odp,presentation,ok,"odp,pdfa,txt"
-ppt,presentation,ok,"ppt,odp,pdfa,txt"
+xlsx,spreadsheet,ok,"xlsx,pdf"
+ods,spreadsheet,ok,"ods,pdf"
+xls,spreadsheet,ok,"xls,pdf"
+key,presentation,ok,"key,pdf"
+pptx,presentation,ok,"pptx,pdf"
+odp,presentation,ok,"odp,pdf"
+ppt,presentation,ok,"ppt,pdf"
 txt,document,ok,"txt"
-docx,document,ok,"docx,odt,pdfa,txt"
-rtf,document,ok,"rtf,odt,pdfa,txt"
-eml,document,ok,"eml,pdfa,txt"
-odt,document,ok,"odt,pdfa,txt"
-pdf,pdf,ok,"pdf,pdfa,txt"
+docx,document,ok,"docx,pdf"
+rtf,document,ok,"rtf,pdf"
+eml,document,ok,"eml,pdf"
+odt,document,ok,"odt,pdf"
+pdf,pdf,ok,"pdf"
 zip,archive,ok,"zip"
-doc,document,ok,"doc,odt,pdfa,txt"
+doc,document,ok,"doc,pdf"
 3gp,video,ok,"3gp,mp4"
 webm,video,ok,"webm,mp4"
 mkv,video,ok,"mkv,mp4"
 avi,video,ok,"avi,mp4"
 ogv,video,ok,"ogv,mp4"
-m4v,video,ok,"m4v,mp4"
-wmv,video,manual_review,"wmv,mp4"
 mov,video,ok,"mov,mp4"
 mp4,video,ok,"mp4"
diff --git a/permanent_upload/permanent.py b/permanent_upload/permanent.py
index 342ca7d..ff7b628 100644
--- a/permanent_upload/permanent.py
+++ b/permanent_upload/permanent.py
@@ -76,16 +76,25 @@ def __init__(self, base_url=None):
             raise Exception("Need `base_url` upon object creation")
         self.base_url = base_url
 
-    def _measure_post_upload_processing(self, record_vo, timeout):
+    def _measure_post_upload_processing(self, record_vo, expected_formats, timeout):
         record_id = record_vo["recordId"]
         archive_number = record_vo["archiveNbr"]
 
         i = 0
-        status = ""
         record = ""
+        status = ""
+        processing_complete = False
 
-        while i < timeout and status != "status.generic.ok":
+        while i < timeout and (
+            not processing_complete or status != "status.generic.ok"
+        ):
             record = self._get_record(record_id, archive_number)
+            actual_formats = {
+                vo["type"].split(".")[-1] for vo in (record.get("FileVOs") or [])
+            }
+            processing_complete = bool(expected_formats) and expected_formats.issubset(
+                actual_formats
+            )
             status = record["status"]
             time.sleep(1)
             i += 1
@@ -135,6 +144,7 @@ def file_upload(
         archive_id,
         auth_token,
         timeout,
+        expected_formats,
     ):
         """
         Perform the file upload requests, and then poll for status until the timeout.
@@ -166,7 +176,7 @@ def file_upload(
         created_record_vo = self._register_record(auth_token, request)
 
         attempts, processed_record = self._measure_post_upload_processing(
-            created_record_vo, timeout
+            created_record_vo, expected_formats, timeout
         )
         result = [
             filename,
diff --git a/permanent_upload/validation.py b/permanent_upload/validation.py
index fdc8b4c..4a9c1bd 100644
--- a/permanent_upload/validation.py
+++ b/permanent_upload/validation.py
@@ -2,15 +2,26 @@
 import os
 
 
-def validate_supported_types(results, data_file="data/supported_file_types.csv"):
-    validation_dataset = {}
+def _load_validation_dataset(data_file="data/supported_file_types.csv"):
+    dataset = {}
     data_file_path = os.path.join(
         os.path.dirname(os.path.realpath(__file__)), data_file
     )
     with open(data_file_path, "r") as csvfile:
-        validation_reader = csv.DictReader(csvfile)
-        for row in validation_reader:
-            validation_dataset[row["file_extension"]] = row
+        for row in csv.DictReader(csvfile):
+            dataset[row["file_extension"]] = row
+    return dataset
+
+
+def load_expected_formats(data_file="data/supported_file_types.csv"):
+    return {
+        ext: set(row["conversions"].split(","))
+        for ext, row in _load_validation_dataset(data_file).items()
+    }
+
+
+def validate_supported_types(results, data_file="data/supported_file_types.csv"):
+    validation_dataset = _load_validation_dataset(data_file)
     for result in results:
         extension = result[0].split(".")[-1]
         assert validation_dataset[extension]