22import logging
33import pandas as pd
44from tqdm import tqdm
5- from .utils import generate_s3_url , get_s3_client , get_s3_file_path , list_s3_files , create_custom_xml
5+ from .utils import generate_s3_url , get_s3_client , get_s3_file_path , list_s3_files_and_folders , create_custom_xml , transform_box_file_name
66
77
88
@@ -19,7 +19,7 @@ def process_row(row, box_file, s3_client):
1919
2020 for filetype in ['PDF' , 'PDF_LATEX' , 'TIFF' ]:
2121 s3_prefix = get_s3_file_path (filetype = filetype , box_file = box_file , filename = record_name )
22- files = list_s3_files ('cern-archives' , s3_prefix , s3_client )
22+ files = list_s3_files_and_folders ('cern-archives' , s3_prefix , s3_client )[ 'files' ]
2323
2424 if not files :
2525 logging .info (f"[MISSING] { filetype } for record { record_name } (ID: { record_id } ) in { s3_prefix } " )
@@ -52,3 +52,41 @@ def create_import_xml_files(data_path, output_path):
5252 xml_path = os .path .join (xml_output_path , xml_filename )
5353 create_custom_xml (records_data , xml_path )
5454 print (f"✅ XML written: { xml_path } " )
55+
56+
def get_matching_errors(boite_data_path, box_file, corrections_folder=False):
    """
    Compare the names listed in a box Excel file with what is stored in S3.

    The Excel file is read without a header row and the values of its second
    column are compared, per filetype, with the names found under the
    matching S3 prefix of the 'cern-archives' bucket.

    Args:
        boite_data_path: Directory that contains the box Excel files.
        box_file: File name of the Excel file, relative to boite_data_path.
        corrections_folder: When true, look under
            'raw/CORRECTIONS/<filetype>/<box>/' instead of
            'raw/<filetype>/<box>/'.

    Returns:
        A dict of the form:
        {
            'missing_in_excel': {filetype: [...], ...},
            'missing_in_s3': {filetype: [...], ...}
        }
        'missing_in_excel' holds names found in S3 but not in the Excel
        column; 'missing_in_s3' holds the opposite. Each list is sorted so
        the output is deterministic.
    """
    s3_client = get_s3_client()
    boite_data = pd.read_excel(os.path.join(boite_data_path, box_file), header=None)
    box_file_s3 = transform_box_file_name(box_file)
    filetypes = ['PDF', 'PDF_LATEX', 'TIFF']

    # Empty cells are read back as NaN; drop them so blank rows are not
    # reported as records missing in S3. Built once, reused for every filetype.
    excel_names = set(boite_data[boite_data.columns[1]].dropna().tolist())

    missing_in_excel_dict = {}
    missing_in_s3_dict = {}

    for ft in filetypes:
        if corrections_folder:
            prefix = f'raw/CORRECTIONS/{ft}/{box_file_s3}/'
        else:
            prefix = f'raw/{ft}/{box_file_s3}/'
        listing = list_s3_files_and_folders('cern-archives', prefix, s3_client)

        if ft == 'PDF_LATEX':
            # Names come from the file entries: last path component, cut at
            # its first dot.
            s3_names = {key.split('/')[-1].split('.')[0] for key in listing['files']}
        else:
            # Names come from the folder entries: the component just before
            # the trailing '/'.
            s3_names = {key.split('/')[-2] for key in listing['folders']}

        # The listing can contain the box prefix itself; it is not a record.
        s3_names.discard(box_file_s3)

        # key=str keeps the sort safe if the Excel column mixes value types.
        missing_in_excel_dict[ft] = sorted(s3_names - excel_names, key=str)
        missing_in_s3_dict[ft] = sorted(excel_names - s3_names, key=str)

    return {
        'missing_in_excel': missing_in_excel_dict,
        'missing_in_s3': missing_in_s3_dict,
    }
0 commit comments