Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions permanent_upload/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from tabulate import tabulate

from .permanent import PermanentAPI
from .validation import validate_supported_types
from .validation import load_expected_formats, validate_supported_types


def get_file_list(path):
Expand Down Expand Up @@ -46,7 +46,7 @@ def main(environment, path):
email = f"engineers+prmnttstr{unix_timestamp}@permanent.org"
print("User account email:", email)
password = "".join(random.choice(string.ascii_letters) for i in range(12))
timeout = 60
timeout = 300
print(f"Current timeout is {timeout} seconds")

api = PermanentAPI(
Expand All @@ -61,10 +61,13 @@ def main(environment, path):
api.logged_in()
parent_folder_id, parent_folder_link_id = api.get_folder_info()
files = get_file_list(path)
formats_by_extension = load_expected_formats()
results = []
headers = ["File Name", "Type", "Status", "File Formats", "Time"]
for f in files:
logging.info("Processing %s", f)
extension = os.path.splitext(f)[1].lstrip(".")
expected_formats = formats_by_extension.get(extension, set())
Comment on lines +69 to +70
results.append(
api.file_upload(
f,
Expand All @@ -73,6 +76,7 @@ def main(environment, path):
archive["archiveId"],
login_result.response["SimpleVO"]["value"],
timeout,
expected_formats,
)
)
print(tabulate(results, headers, tablefmt="github"))
Expand Down
31 changes: 13 additions & 18 deletions permanent_upload/data/supported_file_types.csv
Original file line number Diff line number Diff line change
Expand Up @@ -16,30 +16,25 @@ heic,image,ok,"heic,jpg"
jpg,image,ok,"jpg"
tif,image,ok,"tif,jpg"
jpeg,image,ok,"jpeg,jpg"
html,document,ok,"html,pdfa,txt"
xlsx,spreadsheet,ok,"xlsx,ods,pdfa,csv"
csv,spreadsheet,ok,"csv"
ods,spreadsheet,ok,"ods,pdfa,csv"
htm,document,ok,"htm,pdfa,txt"
xls,spreadsheet,ok,"xls,ods,pdfa,csv"
key,presentation,ok,"key,odp,pdfa,txt"
pptx,presentation,ok,"pptx,odp,pdfa,txt"
odp,presentation,ok,"odp,pdfa,txt"
ppt,presentation,ok,"ppt,odp,pdfa,txt"
xlsx,spreadsheet,ok,"xlsx,pdf"
ods,spreadsheet,ok,"ods,pdf"
xls,spreadsheet,ok,"xls,pdf"
key,presentation,ok,"key,pdf"
pptx,presentation,ok,"pptx,pdf"
odp,presentation,ok,"odp,pdf"
ppt,presentation,ok,"ppt,pdf"
txt,document,ok,"txt"
docx,document,ok,"docx,odt,pdfa,txt"
rtf,document,ok,"rtf,odt,pdfa,txt"
eml,document,ok,"eml,pdfa,txt"
odt,document,ok,"odt,pdfa,txt"
pdf,pdf,ok,"pdf,pdfa,txt"
docx,document,ok,"docx,pdf"
rtf,document,ok,"rtf,pdf"
eml,document,ok,"eml,pdf"
odt,document,ok,"odt,pdf"
pdf,pdf,ok,"pdf"
zip,archive,ok,"zip"
doc,document,ok,"doc,odt,pdfa,txt"
doc,document,ok,"doc,pdf"
3gp,video,ok,"3gp,mp4"
webm,video,ok,"webm,mp4"
mkv,video,ok,"mkv,mp4"
avi,video,ok,"avi,mp4"
ogv,video,ok,"ogv,mp4"
m4v,video,ok,"m4v,mp4"
wmv,video,manual_review,"wmv,mp4"
mov,video,ok,"mov,mp4"
mp4,video,ok,"mp4"
18 changes: 14 additions & 4 deletions permanent_upload/permanent.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,16 +76,25 @@ def __init__(self, base_url=None):
raise Exception("Need `base_url` upon object creation")
self.base_url = base_url

def _measure_post_upload_processing(self, record_vo, timeout):
def _measure_post_upload_processing(self, record_vo, expected_formats, timeout):
record_id = record_vo["recordId"]
archive_number = record_vo["archiveNbr"]

i = 0
status = ""
record = ""
status = ""
processing_complete = False

while i < timeout and status != "status.generic.ok":
while i < timeout and (
not processing_complete or status != "status.generic.ok"
):
record = self._get_record(record_id, archive_number)
actual_formats = {
vo["type"].split(".")[-1] for vo in (record.get("FileVOs") or [])
}
processing_complete = bool(expected_formats) and expected_formats.issubset(
actual_formats
)
Comment on lines +92 to +97
status = record["status"]
time.sleep(1)
i += 1
Expand Down Expand Up @@ -135,6 +144,7 @@ def file_upload(
archive_id,
auth_token,
timeout,
expected_formats,
):
"""
Perform the file upload requests, and then poll for status until the timeout.
Expand Down Expand Up @@ -166,7 +176,7 @@ def file_upload(
created_record_vo = self._register_record(auth_token, request)

attempts, processed_record = self._measure_post_upload_processing(
created_record_vo, timeout
created_record_vo, expected_formats, timeout
)
result = [
filename,
Expand Down
21 changes: 16 additions & 5 deletions permanent_upload/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,26 @@
import os


def validate_supported_types(results, data_file="data/supported_file_types.csv"):
validation_dataset = {}
def _load_validation_dataset(data_file="data/supported_file_types.csv"):
dataset = {}
data_file_path = os.path.join(
os.path.dirname(os.path.realpath(__file__)), data_file
)
with open(data_file_path, "r") as csvfile:
validation_reader = csv.DictReader(csvfile)
for row in validation_reader:
validation_dataset[row["file_extension"]] = row
for row in csv.DictReader(csvfile):
dataset[row["file_extension"]] = row
return dataset


def load_expected_formats(data_file="data/supported_file_types.csv"):
    """Map each supported file extension to its set of expected formats.

    Takes the rows returned by ``_load_validation_dataset`` and splits each
    row's comma-separated ``conversions`` cell into a set of format names.
    Whitespace around each name is stripped and empty entries are dropped,
    so a blank cell yields an empty set rather than ``{""}``.

    Args:
        data_file: Path of the CSV data file, passed through unchanged to
            ``_load_validation_dataset``.

    Returns:
        A dict keyed by file extension whose values are sets of format
        names.

    Raises:
        KeyError: If a row has no ``conversions`` column at all.
    """
    expected = {}
    for ext, row in _load_validation_dataset(data_file).items():
        # csv.DictReader fills cells missing from a short row with None;
        # treat that the same as a blank cell instead of crashing on .split.
        cell = row["conversions"] or ""
        names = (name.strip() for name in cell.split(","))
        # "".split(",") yields [""], so filter out empty names explicitly.
        expected[ext] = {name for name in names if name}
    return expected
Comment on lines +16 to +20


def validate_supported_types(results, data_file="data/supported_file_types.csv"):
validation_dataset = _load_validation_dataset(data_file)
for result in results:
extension = result[0].split(".")[-1]
assert validation_dataset[extension]
Comment on lines +23 to 27
Expand Down
Loading