
Commit 7e07c37

add cli for xml import file creation

Parent: 84c60f9

10 files changed: 315 additions & 220 deletions

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -5,10 +5,11 @@ build/
 dist/
 wheels/
 *.egg-info
-
+*.log
 .DS_Store
 .env
 
 data/
+import_xml_files/
 # Virtual environments
 .venv

README.md

Lines changed: 1 addition & 1 deletion
@@ -21,5 +21,5 @@ The location to download the files.
 
 ```bash
 poetry install
-poetry run python cli.py
+poetry run digitization
 ```
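The install step now exposes a console script, so the CLI is invoked as `poetry run digitization` rather than by calling the module file directly. A minimal sketch of the equivalent manual entry point, assuming the Poetry script mapping resolves to the click group in `digitization/cli.py` (the same guard the module itself keeps):

```python
# Equivalent of `poetry run digitization`, assuming the console script
# points at the click group defined in digitization/cli.py.
from digitization.cli import digitization

if __name__ == "__main__":
    digitization()
```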

digitization/cli.py

Lines changed: 18 additions & 196 deletions
@@ -1,207 +1,18 @@
 import click
-import logging
-import os
-import pysftp
-import re
-import shutil
-import xml.etree.ElementTree as ET
+from .file_import.file_import import create_import_xml_files
 from .xml_collect.xml_collect import records_collection
-
-URL = "https://digitization.web.cern.ch"
-
-main_directory = (
-    "/eos/project-p/psdigitization/public/CERN-Project-Files/CERN-Project-Files/www"
+from .xml_collect.utils import (
+    download_files_from_ftp,
+    fix_white_spaces_in_directory,
+    find_all_xmls,
+    records_collection_creation,
 )
-ERROR = []
-MISSING_XMLS = []
-REGEXP = r"(?!original)([\w\W]+)\.(xml)"
-MAX_NUMBER_OF_RECORDS_COLLECT = 500
-
-
-def url_from_eos_path(path):
-    return path.replace(main_directory, URL)
-
-
-def file_list_chunker(files, chunk_size=MAX_NUMBER_OF_RECORDS_COLLECT):
-    for i in range(0, len(files), chunk_size):
-        yield files[i : i + chunk_size]
-
-
-def records_collection_creation(input_dir, output_dir):
-    logging.info(f"Creating collection file for {input_dir}")
-    file_list = [
-        os.path.join(root, _file)
-        for root, _, files in os.walk(input_dir, topdown=False)
-        for _file in files
-        if re.match(REGEXP, _file)
-    ]
-
-    logging.info(
-        f"All files to be combined found: {len(file_list)}. Will generate {len(file_list) // MAX_NUMBER_OF_RECORDS_COLLECT} collection files."
-    )
-
-    chunks = list(file_list_chunker(file_list))
-
-    os.makedirs(output_dir, exist_ok=True)
-
-    for collection_file_name, chunk in enumerate(chunks, start=1):
-        filename = f"{output_dir}/{collection_file_name}.xml"
-
-        with open(filename, "w") as nf:
-            nf.write("<collection>")
-            for file_path in chunk:
-                logging.info(f"Processing {file_path}")
-                try:
-                    with open(file_path, "r") as f:
-                        data = f.read()
-                except Exception as e:
-                    logging.error(f"Error while reading file {file_path}: {e}")
-                    continue
-                try:
-                    data = data.replace("<collection>", "").replace("</collection>", "")
-                    nf.write(data)
-                except Exception as e:
-                    logging.error(f"Error while writing file {file_path}: {e}")
-                    continue
-
-            nf.write("</collection>")
-
-        logging.info(f"Collection {collection_file_name} written successfully.")
-
-
-def fix_white_spaces_in_directory(start_dir):
-    for root, dirs, files in os.walk(start_dir, topdown=False):
-        for directory in dirs:
-            if " " in directory:
-                new_directory = directory.replace(" ", "_")
-                os.rename(
-                    os.path.join(root, directory), os.path.join(root, new_directory)
-                )
-                print(f"Renamed directory: {directory} -> {new_directory}")
-        for filename in files:
-            if " " in filename:
-                new_filename = filename.replace(" ", "_")
-                os.rename(
-                    os.path.join(root, filename), os.path.join(root, new_filename)
-                )
-                print(f"Renamed file: {filename} -> {new_filename}")
-
-
-def download_files_from_ftp(force=False):
-    host = os.getenv("FTP_HOST")
-    username = os.getenv("FTP_USERNAME")
-    password = os.getenv("FTP_PASSWORD")
-
-    main_directory = os.getenv("FTP_ROOT_PATH", "/CERN-Project-Files")
-    download_directory = os.getenv("DOWNLOAD_DIR", "/tmp/")
-
-    cnopts = pysftp.CnOpts()
-    cnopts.hostkeys = None
-    downloaded_directories = []
-    with pysftp.Connection(
-        host=host, username=username, password=password, cnopts=cnopts
-    ) as sftp:
-        sftp.cwd(main_directory)
-        directory_structure = sftp.listdir_attr()
-        for attr in directory_structure:
-            if force or not os.path.isdir(
-                os.path.join(download_directory, attr.filename)
-            ):
-                click.echo(f"Downloading `{attr.filename}`.")
-                remoteFilePath = attr.filename
-                localFilePath = download_directory
-                sftp.get_r(remoteFilePath, localFilePath, preserve_mtime=True)
-                downloaded_directories.append(attr.filename)
-            else:
-                click.echo(
-                    f"Directory already exists`{attr.filename}`. Skip downloading..."
-                )
-    return downloaded_directories
-
-
-def fix_xml(root, xml_path, tif_path, pdf_path):
-    xml_file_name = os.path.basename(xml_path)
-    xml_file_name_original = "original_{}".format(xml_file_name)
-    xml_path_original = os.path.join(root, xml_file_name_original)
-
-    if os.path.isfile(xml_path):
-        if os.path.isfile(xml_path_original):
-            click.echo("We have original")
-            os.remove(xml_path)
-            shutil.copy2(
-                os.path.join(root, "original_{}".format(xml_file_name)), xml_path
-            )
-        else:
-            click.echo("Saving the original file")
-            shutil.copy2(
-                xml_path, os.path.join(root, "original_{}".format(xml_file_name))
-            )
-    else:
-        click.echo("XML files are missing completely!")
-        MISSING_XMLS.append(xml_path)
-
-    pdf_url = url_from_eos_path(pdf_path)
-    tif_url = url_from_eos_path(tif_path)
-
-    try:
-        tree = ET.parse(xml_path)
-        xml_root = tree.getroot()
-        for x in xml_root.findall('.//datafield[@tag="FFT"]'):
-            if x.find('.//subfield[@code="a"]').text.startswith("[PATH]"):
-                x.find('.//subfield[@code="a"]').text = pdf_url
-
-            elif x.find('.//subfield[@code="a"]').text.startswith("[EOS_PATH]"):
-                x.attrib["tag"] = "856"
-                x.attrib["ind1"] = "4"
-                for child in x.getchildren():
-                    if child.attrib["code"] == "d":
-                        x.remove(child)
-                    if child.attrib["code"] == "a":
-                        child.attrib["code"] = "u"
-                        child.text = tif_url
-                    if child.attrib["code"] == "t":
-                        child.attrib["code"] = "q"
-                        child.text = "TIFF"
-        tree.write(xml_path, encoding="utf-8")
-    except Exception:
-        ERROR.append(xml_path)
-
-
-def find_all_xmls():
-    os.chdir(main_directory)
-    for root, dirs, files in os.walk(main_directory, topdown=False):
-        try:
-            xml_path = os.path.join(
-                root,
-                next(
-                    filter(
-                        lambda x: not x.startswith("original_") and x.endswith(".xml"),
-                        files,
-                    )
-                ),
-            )
-            tif_path = os.path.join(
-                root, next(filter(lambda x: x.endswith(".tif"), files))
-            )
-            pdf_path = os.path.join(
-                root, next(filter(lambda x: x.endswith(".pdf"), files))
-            )
-        except StopIteration:
-            continue
-        fix_xml(root, xml_path, tif_path, pdf_path)
-        click.echo(xml_path)
-
-        test_file = os.path.join(root, "test.xml")
-        if os.path.isfile(test_file):
-            os.remove(test_file)
-            click.echo(test_file)
-
+import os
 
 @click.group()
 def digitization():
     pass
 
-
 @digitization.command()
 @click.option("--force", default=False, show_default=True, is_flag=True)
 @click.option("--fix-eos-paths", default=False, show_default=True, is_flag=True)
@@ -262,5 +73,16 @@ def create_collection_file(start_from_dir, output_dir):
     records_collection(start_from_dir, output_dir)
 
 
+@digitization.command("create-import-xml-files")
+@click.option("-d", "--data-path", type=str, required=True, help="Path to the boite data files folder.")
+@click.option("-o", "--output-path", type=str, required=True, help="Path to save the output logs and XML files.")
+def create_import_xml(data_path, output_path):
+    """Create XML files from the given data path. And logfile with missing files."""
+    click.echo(f"Creating import XML files from {data_path} to {output_path}.")
+    create_import_xml_files(data_path, output_path)
+    click.echo("✅ XML files created successfully.")
+
+
+
 if __name__ == "__main__":
     digitization()
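The new `create-import-xml-files` subcommand hangs off the existing `digitization` click group, so it can also be exercised programmatically. A minimal sketch using click's built-in test runner, assuming the package is importable; the `./data` and `./output` paths are placeholders:

```python
# Sketch: driving the new subcommand through click's CliRunner.
# Paths are hypothetical; output_path must already exist, since the
# command opens a missing_records.log inside it.
from click.testing import CliRunner

from digitization.cli import digitization

runner = CliRunner()
result = runner.invoke(
    digitization,
    ["create-import-xml-files", "-d", "./data", "-o", "./output"],
)
print(result.exit_code)  # 0 on success
print(result.output)     # progress echoes and the final ✅ message
```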

digitization/file_import/README.md

Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,3 @@
 # WIP
 
 # Script to match and upload files to CDS records
-
-
digitization/file_import/__init__.py

Whitespace-only changes.
digitization/file_import/file_import.py

Lines changed: 44 additions & 16 deletions

@@ -1,26 +1,54 @@
 import os
-
+import logging
 import pandas as pd
+from tqdm import tqdm
+from .utils import generate_s3_url, get_s3_client, get_s3_file_path, list_s3_files, create_custom_xml
+
+
 
-from utils import generate_s3_url, get_s3_client, get_s3_file_path, list_s3_files
+def process_row(row, box_file, s3_client):
+    record_id = str(row[0])
+    record_name = str(row[1])
 
+    record_data = {
+        'record_id': record_id,
+        'pdf_url': None,
+        'pdf_latex_url': None,
+        'tiff_urls': [],
+    }
 
-def process_row(row, box_file):
     for filetype in ['PDF', 'PDF_LATEX', 'TIFF']:
-        s3_prefix_ = get_s3_file_path(filetype=filetype,
-                                      box_file=box_file, filename=row[1])
-        files = list_s3_files('cern-archives', s3_prefix_, s3_client)
-        for file in files:
-            s3_url = generate_s3_url('cern-archives', file, s3_client=s3_client)
-            #print(f"S3 URL: {s3_url}")
+        s3_prefix = get_s3_file_path(filetype=filetype, box_file=box_file, filename=record_name)
+        files = list_s3_files('cern-archives', s3_prefix, s3_client)
+
         if not files:
-            print(f"No files found for {filetype} in {s3_prefix_}")
+            logging.info(f"[MISSING] {filetype} for record {record_name} (ID: {record_id}) in {s3_prefix}")
            continue
 
-s3_client = get_s3_client()
+        for file in files:
+            s3_url = generate_s3_url('cern-archives', file, s3_client=s3_client)
+            if filetype == 'PDF':
+                record_data['pdf_url'] = s3_url
+            elif filetype == 'PDF_LATEX':
+                record_data['pdf_latex_url'] = s3_url
+            elif filetype == 'TIFF':
+                record_data['tiff_urls'].append(s3_url)
+
+    return record_data
+
 
-data_path = os.path.join(os.getcwd(), 'data')
-for box_file in os.listdir(data_path):
-    print(f"Box file: {box_file}")
-    data = pd.read_excel(os.path.join(data_path, box_file))
-    data.apply(lambda x, box_file=box_file: process_row(x, box_file), axis=1, raw=True)
+def create_import_xml_files(data_path, output_path):
+    logging.basicConfig(filename=os.path.join(output_path, 'missing_records.log'), level=logging.INFO,
+                        format='%(asctime)s - %(message)s')
+    s3_client = get_s3_client()
+    xml_output_path = os.path.join(output_path, 'import_xml_files')
+    os.makedirs(xml_output_path, exist_ok=True)
+    for box_file in os.listdir(data_path):
+        df = pd.read_excel(os.path.join(data_path, box_file), header=None)
+        records_data = []
+        for _, row in tqdm(df.iterrows(), total=df.shape[0], desc=f"Processing {box_file}"):
+            records_data.append(process_row(row, box_file, s3_client))
+        xml_filename = os.path.splitext(box_file)[0] + ".xml"
+        xml_path = os.path.join(xml_output_path, xml_filename)
+        create_custom_xml(records_data, xml_path)
+        print(f"✅ XML written: {xml_path}")
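Because the spreadsheets are read with `header=None`, `process_row` relies on a positional contract: column 0 is the CDS record ID and column 1 is the filename stem used to build the S3 prefix. A small sketch of that contract with made-up values; the commented dict mirrors the `record_data` shape returned above:

```python
# Sketch of the positional row contract; all values are invented.
import pandas as pd

df = pd.DataFrame([[123456, "CERN-ARCH-BOX-001-doc"]])  # [record_id, filename]
_, row = next(df.iterrows())

record_id, record_name = str(row[0]), str(row[1])
# With an S3 client wired up, process_row(row, box_file, s3_client)
# would return something like:
# {
#     "record_id": "123456",
#     "pdf_url": "<presigned URL or None>",
#     "pdf_latex_url": None,
#     "tiff_urls": ["<one presigned URL per TIFF>"],
# }
```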

digitization/file_import/utils.py

Lines changed: 30 additions & 3 deletions
@@ -1,8 +1,8 @@
 import os
-
 import boto3
 from dotenv import load_dotenv
-
+import xml.etree.ElementTree as ET
+from xml.dom import minidom
 load_dotenv()
 
 
@@ -30,7 +30,7 @@ def list_s3_files(bucket_name, prefix, s3_client=None):
         Delimiter='/'
     )
     if 'Contents' in response:
-        return [obj['Key'] for obj in response['Contents'] if 
+        return [obj['Key'] for obj in response['Contents'] if
                 not obj['Key'].endswith('/')]
     else:
         return []
@@ -44,3 +44,30 @@ def generate_s3_url(bucket_name, file_key, expiration=31556952, s3_client=None):
         Params={"Bucket": bucket_name, "Key": file_key},
         ExpiresIn=expiration,
     )
+
+
+def create_custom_xml(records_data, output_file_path):
+    collection = ET.Element("collection")
+    for rec in records_data:
+        if not rec.get('pdf_url') and not rec.get('pdf_latex_url') and not rec.get('tiff_urls'):
+            continue
+        record_elem = ET.SubElement(collection, "record")
+        controlfield = ET.SubElement(record_elem, "controlfield", tag="001")
+        controlfield.text = str(rec['record_id'])
+        if rec.get('pdf_url'):
+            datafield = ET.SubElement(record_elem, "datafield", tag="856", ind1="4", ind2=" ")
+            ET.SubElement(datafield, "subfield", code="u").text = rec['pdf_url']
+            ET.SubElement(datafield, "subfield", code="q").text = "PDF"
+        if rec.get('pdf_latex_url'):
+            datafield = ET.SubElement(record_elem, "datafield", tag="856", ind1="4", ind2=" ")
+            ET.SubElement(datafield, "subfield", code="u").text = rec['pdf_latex_url']
+            ET.SubElement(datafield, "subfield", code="q").text = "PDF_LATEX"
+        for tiff_url in rec.get('tiff_urls', []):
+            datafield = ET.SubElement(record_elem, "datafield", tag="856", ind1="4", ind2=" ")
+            ET.SubElement(datafield, "subfield", code="u").text = tiff_url
+            ET.SubElement(datafield, "subfield", code="q").text = "TIFF"
+    rough_string = ET.tostring(collection, encoding="utf-8")
+    reparsed = minidom.parseString(rough_string)
+    pretty_xml = reparsed.toprettyxml(indent=" ")
+    with open(output_file_path, "w", encoding="utf-8") as f:
+        f.write(pretty_xml)
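For reference, a minimal usage sketch of `create_custom_xml`; the record values are invented, and in real use the URLs come from `generate_s3_url`. Records with no resolved files are skipped, and each URL becomes its own MARC 856 datafield under a single `<collection>` root:

```python
# Sketch: one record with a PDF and one TIFF page; values are made up.
from digitization.file_import.utils import create_custom_xml

records = [
    {
        "record_id": "1234567",                       # hypothetical record ID
        "pdf_url": "https://example.org/doc.pdf",     # presigned URL in real use
        "pdf_latex_url": None,                        # falsy -> no 856 written
        "tiff_urls": ["https://example.org/p1.tiff"],
    }
]
create_custom_xml(records, "sample.xml")
# sample.xml now holds a pretty-printed <collection> with one <record>:
# a 001 controlfield plus an 856 datafield (subfields u and q) per URL.
```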
