Skip to content

Commit 0bfc953

Browse files
committed
add function to export mismatches
1 parent 7e07c37 commit 0bfc953

3 files changed

Lines changed: 94 additions & 11 deletions

File tree

digitization/cli.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import click
2-
from .file_import.file_import import create_import_xml_files
2+
from .file_import.file_import import create_import_xml_files, get_matching_errors
33
from .xml_collect.xml_collect import records_collection
44
from .xml_collect.utils import (
55
download_files_from_ftp,
@@ -8,6 +8,7 @@
88
records_collection_creation,
99
)
1010
import os
11+
import logging
1112

1213
@click.group()
1314
def digitization():
@@ -83,6 +84,35 @@ def create_import_xml(data_path, output_path):
8384
click.echo("✅ XML files created successfully.")
8485

8586

87+
@digitization.command("get-s3-matching-errors")
@click.option("-d", "--data-path", type=str, required=True, help="Path to the boite data files folder.")
@click.option("-o", "--log-path", type=str, required=True, help="Path to save the log file.")
def get_s3_matching_errors(data_path, log_path):
    """Log missing files in S3 and Excel.

    Every regular file found in ``data_path`` is passed to
    ``get_matching_errors``; for each file type in its result a WARNING line
    lists the mismatching names, or an INFO line states that none are missing.
    The report is written to ``matching_errors.log`` inside ``log_path``.

    Args:
        data_path: Folder containing the box data files.
        log_path: Folder in which the log file is written (created if absent).
    """
    # logging.basicConfig opens the file right away and fails when the target
    # directory does not exist, so create it first.
    os.makedirs(log_path, exist_ok=True)
    log_file = os.path.join(log_path, "matching_errors.log")
    logging.basicConfig(
        filename=log_file,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )

    failed = 0
    # Sorted so the report has the same order on every run.
    for box_file in sorted(os.listdir(data_path)):
        # Sub-folders, hidden files and Office lock files ("~$...") are not
        # box spreadsheets and would make the Excel reader fail.
        if box_file.startswith((".", "~$")):
            continue
        if not os.path.isfile(os.path.join(data_path, box_file)):
            continue

        try:
            matching_errors = get_matching_errors(data_path, box_file, False)
        except Exception:
            # CLI boundary: record the failure and keep going so a single
            # unreadable file does not discard the report for all the others.
            logging.exception(f"[{box_file}] Could not compare file with S3.")
            failed += 1
            continue

        for label, key in (("Excel", "missing_in_excel"), ("S3", "missing_in_s3")):
            for filetype, missing_files in matching_errors[key].items():
                if missing_files:
                    # Names coming from the spreadsheet are not guaranteed to
                    # be strings; str() keeps join() from raising TypeError.
                    names = ", ".join(sorted(str(name) for name in missing_files))
                    logging.warning(
                        f"[{box_file}] Missing in {label} ({filetype}): {names}"
                    )
                else:
                    logging.info(f"[{box_file}] No missing files in {label} for {filetype}.")

    if failed:
        click.echo(f"⚠️ {failed} file(s) could not be processed; see the log for details.", err=True)
    click.echo(f"✅ Log file created: {log_file}")
115+
86116

87117
if __name__ == "__main__":
88118
digitization()

digitization/file_import/file_import.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import logging
33
import pandas as pd
44
from tqdm import tqdm
5-
from .utils import generate_s3_url, get_s3_client, get_s3_file_path, list_s3_files, create_custom_xml
5+
from .utils import generate_s3_url, get_s3_client, get_s3_file_path, list_s3_files_and_folders, create_custom_xml, transform_box_file_name
66

77

88

@@ -19,7 +19,7 @@ def process_row(row, box_file, s3_client):
1919

2020
for filetype in ['PDF', 'PDF_LATEX', 'TIFF']:
2121
s3_prefix = get_s3_file_path(filetype=filetype, box_file=box_file, filename=record_name)
22-
files = list_s3_files('cern-archives', s3_prefix, s3_client)
22+
files = list_s3_files_and_folders('cern-archives', s3_prefix, s3_client)['files']
2323

2424
if not files:
2525
logging.info(f"[MISSING] {filetype} for record {record_name} (ID: {record_id}) in {s3_prefix}")
@@ -52,3 +52,41 @@ def create_import_xml_files(data_path, output_path):
5252
xml_path = os.path.join(xml_output_path, xml_filename)
5353
create_custom_xml(records_data, xml_path)
5454
print(f"✅ XML written: {xml_path}")
55+
56+
57+
def get_matching_errors(boite_data_path, box_file, corrections_folder=False):
    """
    Compare the names listed in a box Excel file with what exists in S3.

    The second column of the Excel sheet (read without a header row) is
    compared, per file type, with the names found under
    ``raw/<filetype>/<box>/`` in the ``cern-archives`` bucket, or under
    ``raw/CORRECTIONS/<filetype>/<box>/`` when ``corrections_folder`` is true.
    For ``PDF_LATEX`` the S3 names are file names cut at the first dot; for
    the other types they are the sub-folder names.

    Args:
        boite_data_path: Folder containing the Excel file.
        box_file: File name of the Excel file inside ``boite_data_path``.
        corrections_folder: Look in the ``raw/CORRECTIONS`` tree instead of ``raw``.

    Returns:
        A dict with sorted lists of names:
        {
            'missing_in_excel': {filetype: [...], ...},
            'missing_in_s3': {filetype: [...], ...}
        }
    """
    s3_client = get_s3_client()
    boite_data = pd.read_excel(os.path.join(boite_data_path, box_file), header=None)
    box_file_s3 = transform_box_file_name(box_file)
    filetypes = ['PDF', 'PDF_LATEX', 'TIFF']

    # Blank cells are read as NaN and numeric cells as numbers. Comparing
    # those raw values with S3 key strings reports them all as "missing", so
    # drop the blanks and compare on stripped text instead.
    excel_names = {
        str(value).strip()
        for value in boite_data[boite_data.columns[1]].dropna()
    }
    excel_names.discard('')

    root = 'raw/CORRECTIONS' if corrections_folder else 'raw'
    missing_in_excel_dict = {}
    missing_in_s3_dict = {}

    for ft in filetypes:
        prefix = f'{root}/{ft}/{box_file_s3}/'
        listing = list_s3_files_and_folders('cern-archives', prefix, s3_client)
        if ft == 'PDF_LATEX':
            # Flat files: keep the file name up to its first dot.
            s3_names = {key.split('/')[-1].split('.')[0] for key in listing['files']}
        else:
            # One sub-folder per record: 'raw/PDF/BOX/NAME/' -> 'NAME'.
            s3_names = {key.split('/')[-2] for key in listing['folders']}

        # An entry named after the box itself is not a record.
        s3_names.discard(box_file_s3)

        # Sorted so callers get a stable order instead of set ordering.
        missing_in_excel_dict[ft] = sorted(s3_names - excel_names)
        missing_in_s3_dict[ft] = sorted(excel_names - s3_names)

    return {
        'missing_in_excel': missing_in_excel_dict,
        'missing_in_s3': missing_in_s3_dict
    }

digitization/file_import/utils.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@
66
load_dotenv()
77

88

9+
def transform_box_file_name(box_file):
    """Return the S3 folder name for a box file name.

    The name is cut at its first dot, upper-cased, and hyphens are replaced
    by underscores (``'boite-12.xlsx'`` -> ``'BOITE_12'``). Any leading
    directory part is ignored, so a full path gives the same result as the
    bare file name.
    """
    import os  # local import: this module's import block is not visible here

    # Without this, a dot or separator in a parent folder ('./data/x.xlsx')
    # would end up in, or completely empty, the resulting name.
    base_name = os.path.basename(box_file)
    return base_name.split('.')[0].upper().replace('-', '_')
11+
912
def get_s3_file_path(filetype='', box_file='', filename=''):
10-
box_file = box_file.split('.')[0].upper().replace('-', '_')
13+
box_file = transform_box_file_name(box_file)
1114
if filetype == 'PDF' or filetype == 'TIFF':
1215
return f"raw/{filetype}/{box_file}/{filename}/"
1316
elif filetype == 'PDF_LATEX':
@@ -22,20 +25,32 @@ def get_s3_client():
2225
endpoint_url='https://s3.cern.ch',
2326
)
2427

25-
def list_s3_files(bucket_name, prefix, s3_client=None):
28+
def list_s3_files_and_folders(bucket_name, prefix, s3_client=None):
    """List the objects and sub-prefixes found directly under an S3 prefix.

    The listing uses ``'/'`` as delimiter, so only the first level below
    ``prefix`` is returned. All result pages are fetched.

    Args:
        bucket_name: Name of the bucket to list.
        prefix: Key prefix to list under.
        s3_client: Client exposing ``list_objects_v2``; a default boto3 S3
            client is created when omitted.

    Returns:
        ``{'files': [...], 'folders': [...]}`` where ``files`` holds the
        object keys that do not end with ``'/'`` and ``folders`` holds the
        common prefixes. Both lists are empty if the listing fails; the error
        is printed rather than raised.
    """
    if s3_client is None:
        # NOTE(review): this default client does not set the endpoint_url that
        # get_s3_client() in this module configures - confirm this is intended.
        s3_client = boto3.client('s3')

    files = []
    folders = []
    request = {'Bucket': bucket_name, 'Prefix': prefix, 'Delimiter': '/'}

    try:
        # A single call returns one page only; keep following the continuation
        # token, otherwise large prefixes are silently truncated.
        while True:
            response = s3_client.list_objects_v2(**request)
            folders.extend(cp['Prefix'] for cp in response.get('CommonPrefixes', []))
            files.extend(
                obj['Key']
                for obj in response.get('Contents', [])
                if not obj['Key'].endswith('/')
            )

            next_token = response.get('NextContinuationToken')
            if not response.get('IsTruncated') or not next_token:
                break
            request['ContinuationToken'] = next_token
    except Exception as e:
        # Best effort, as before: report the problem and return an empty
        # listing rather than a partial one.
        print(f"Error listing S3 path: {e}")
        return {'files': [], 'folders': []}

    return {'files': files, 'folders': folders}
53+
3954

4055
def generate_s3_url(bucket_name, file_key, expiration=31556952, s3_client=None):
4156
return f"{bucket_name}/{file_key}/{expiration}"

0 commit comments

Comments
 (0)