
Commit 7e07c37

add cli for xml import file creation

Parent: 84c60f9

10 files changed: 315 additions & 220 deletions

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -5,10 +5,11 @@ build/
 dist/
 wheels/
 *.egg-info
-
+*.log
 .DS_Store
 .env
 
 data/
+import_xml_files/
 # Virtual environments
 .venv

README.md

Lines changed: 1 addition & 1 deletion
@@ -21,5 +21,5 @@ The location to download the files.
 
 ```bash
 poetry install
-poetry run python cli.py
+poetry run digitization
 ```
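The install step now exposes a console script, so the CLI is invoked as `poetry run digitization` rather than by calling the module file directly. A minimal sketch of the equivalent manual entry point, assuming the Poetry script mapping resolves to the click group in `digitization/cli.py` (the same guard the module itself keeps):

```python
# Equivalent of `poetry run digitization`, assuming the console script
# points at the click group defined in digitization/cli.py.
from digitization.cli import digitization

if __name__ == "__main__":
    digitization()
```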

digitization/cli.py

Lines changed: 18 additions & 196 deletions
@@ -1,207 +1,18 @@
 import click
-import logging
-import os
-import pysftp
-import re
-import shutil
-import xml.etree.ElementTree as ET
+from .file_import.file_import import create_import_xml_files
 from .xml_collect.xml_collect import records_collection
-
-URL = "https://digitization.web.cern.ch"
-
-main_directory = (
-    "/eos/project-p/psdigitization/public/CERN-Project-Files/CERN-Project-Files/www"
+from .xml_collect.utils import (
+    download_files_from_ftp,
+    fix_white_spaces_in_directory,
+    find_all_xmls,
+    records_collection_creation,
 )
-ERROR = []
-MISSING_XMLS = []
-REGEXP = r"(?!original)([\w\W]+)\.(xml)"
-MAX_NUMBER_OF_RECORDS_COLLECT = 500
-
-
-def url_from_eos_path(path):
-    return path.replace(main_directory, URL)
-
-
-def file_list_chunker(files, chunk_size=MAX_NUMBER_OF_RECORDS_COLLECT):
-    for i in range(0, len(files), chunk_size):
-        yield files[i : i + chunk_size]
-
-
-def records_collection_creation(input_dir, output_dir):
-    logging.info(f"Creating collection file for {input_dir}")
-    file_list = [
-        os.path.join(root, _file)
-        for root, _, files in os.walk(input_dir, topdown=False)
-        for _file in files
-        if re.match(REGEXP, _file)
-    ]
-
-    logging.info(
-        f"All files to be combined found: {len(file_list)}. Will generate {len(file_list) // MAX_NUMBER_OF_RECORDS_COLLECT} collection files."
-    )
-
-    chunks = list(file_list_chunker(file_list))
-
-    os.makedirs(output_dir, exist_ok=True)
-
-    for collection_file_name, chunk in enumerate(chunks, start=1):
-        filename = f"{output_dir}/{collection_file_name}.xml"
-
-        with open(filename, "w") as nf:
-            nf.write("<collection>")
-            for file_path in chunk:
-                logging.info(f"Processing {file_path}")
-                try:
-                    with open(file_path, "r") as f:
-                        data = f.read()
-                except Exception as e:
-                    logging.error(f"Error while reading file {file_path}: {e}")
-                    continue
-                try:
-                    data = data.replace("<collection>", "").replace("</collection>", "")
-                    nf.write(data)
-                except Exception as e:
-                    logging.error(f"Error while writing file {file_path}: {e}")
-                    continue
-
-            nf.write("</collection>")
-
-        logging.info(f"Collection {collection_file_name} written successfully.")
-
-
-def fix_white_spaces_in_directory(start_dir):
-    for root, dirs, files in os.walk(start_dir, topdown=False):
-        for directory in dirs:
-            if " " in directory:
-                new_directory = directory.replace(" ", "_")
-                os.rename(
-                    os.path.join(root, directory), os.path.join(root, new_directory)
-                )
-                print(f"Renamed directory: {directory} -> {new_directory}")
-        for filename in files:
-            if " " in filename:
-                new_filename = filename.replace(" ", "_")
-                os.rename(
-                    os.path.join(root, filename), os.path.join(root, new_filename)
-                )
-                print(f"Renamed file: {filename} -> {new_filename}")
-
-
-def download_files_from_ftp(force=False):
-    host = os.getenv("FTP_HOST")
-    username = os.getenv("FTP_USERNAME")
-    password = os.getenv("FTP_PASSWORD")
-
-    main_directory = os.getenv("FTP_ROOT_PATH", "/CERN-Project-Files")
-    download_directory = os.getenv("DOWNLOAD_DIR", "/tmp/")
-
-    cnopts = pysftp.CnOpts()
-    cnopts.hostkeys = None
-    downloaded_directories = []
-    with pysftp.Connection(
-        host=host, username=username, password=password, cnopts=cnopts
-    ) as sftp:
-        sftp.cwd(main_directory)
-        directory_structure = sftp.listdir_attr()
-        for attr in directory_structure:
-            if force or not os.path.isdir(
-                os.path.join(download_directory, attr.filename)
-            ):
-                click.echo(f"Downloading `{attr.filename}`.")
-                remoteFilePath = attr.filename
-                localFilePath = download_directory
-                sftp.get_r(remoteFilePath, localFilePath, preserve_mtime=True)
-                downloaded_directories.append(attr.filename)
-            else:
-                click.echo(
-                    f"Directory already exists`{attr.filename}`. Skip downloading..."
-                )
-    return downloaded_directories
-
-
-def fix_xml(root, xml_path, tif_path, pdf_path):
-    xml_file_name = os.path.basename(xml_path)
-    xml_file_name_original = "original_{}".format(xml_file_name)
-    xml_path_original = os.path.join(root, xml_file_name_original)
-
-    if os.path.isfile(xml_path):
-        if os.path.isfile(xml_path_original):
-            click.echo("We have original")
-            os.remove(xml_path)
-            shutil.copy2(
-                os.path.join(root, "original_{}".format(xml_file_name)), xml_path
-            )
-        else:
-            click.echo("Saving the original file")
-            shutil.copy2(
-                xml_path, os.path.join(root, "original_{}".format(xml_file_name))
-            )
-    else:
-        click.echo("XML files are missing completely!")
-        MISSING_XMLS.append(xml_path)
-
-    pdf_url = url_from_eos_path(pdf_path)
-    tif_url = url_from_eos_path(tif_path)
-
-    try:
-        tree = ET.parse(xml_path)
-        xml_root = tree.getroot()
-        for x in xml_root.findall('.//datafield[@tag="FFT"]'):
-            if x.find('.//subfield[@code="a"]').text.startswith("[PATH]"):
-                x.find('.//subfield[@code="a"]').text = pdf_url
-
-            elif x.find('.//subfield[@code="a"]').text.startswith("[EOS_PATH]"):
-                x.attrib["tag"] = "856"
-                x.attrib["ind1"] = "4"
-                for child in x.getchildren():
-                    if child.attrib["code"] == "d":
-                        x.remove(child)
-                    if child.attrib["code"] == "a":
-                        child.attrib["code"] = "u"
-                        child.text = tif_url
-                    if child.attrib["code"] == "t":
-                        child.attrib["code"] = "q"
-                        child.text = "TIFF"
-        tree.write(xml_path, encoding="utf-8")
-    except Exception:
-        ERROR.append(xml_path)
-
-
-def find_all_xmls():
-    os.chdir(main_directory)
-    for root, dirs, files in os.walk(main_directory, topdown=False):
-        try:
-            xml_path = os.path.join(
-                root,
-                next(
-                    filter(
-                        lambda x: not x.startswith("original_") and x.endswith(".xml"),
-                        files,
-                    )
-                ),
-            )
-            tif_path = os.path.join(
-                root, next(filter(lambda x: x.endswith(".tif"), files))
-            )
-            pdf_path = os.path.join(
-                root, next(filter(lambda x: x.endswith(".pdf"), files))
-            )
-        except StopIteration:
-            continue
-        fix_xml(root, xml_path, tif_path, pdf_path)
-        click.echo(xml_path)
-
-        test_file = os.path.join(root, "test.xml")
-        if os.path.isfile(test_file):
-            os.remove(test_file)
-            click.echo(test_file)
-
+import os
 
 @click.group()
 def digitization():
     pass
 
-
 @digitization.command()
 @click.option("--force", default=False, show_default=True, is_flag=True)
 @click.option("--fix-eos-paths", default=False, show_default=True, is_flag=True)
@@ -262,5 +73,16 @@ def create_collection_file(start_from_dir, output_dir):
     records_collection(start_from_dir, output_dir)
 
 
+@digitization.command("create-import-xml-files")
+@click.option("-d", "--data-path", type=str, required=True, help="Path to the boite data files folder.")
+@click.option("-o", "--output-path", type=str, required=True, help="Path to save the output logs and XML files.")
+def create_import_xml(data_path, output_path):
+    """Create XML files from the given data path. And logfile with missing files."""
+    click.echo(f"Creating import XML files from {data_path} to {output_path}.")
+    create_import_xml_files(data_path, output_path)
+    click.echo("✅ XML files created successfully.")
+
+
+
 if __name__ == "__main__":
     digitization()
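The new `create-import-xml-files` subcommand hangs off the existing `digitization` click group, so it can also be exercised programmatically. A minimal sketch using click's built-in test runner, assuming the package is importable; the `./data` and `./output` paths are placeholders:

```python
# Sketch: driving the new subcommand through click's CliRunner.
# Paths are hypothetical; output_path must already exist, since the
# command opens a missing_records.log inside it.
from click.testing import CliRunner

from digitization.cli import digitization

runner = CliRunner()
result = runner.invoke(
    digitization,
    ["create-import-xml-files", "-d", "./data", "-o", "./output"],
)
print(result.exit_code)  # 0 on success
print(result.output)     # progress echoes and the final ✅ message
```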

digitization/file_import/README.md

Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,3 @@
 # WIP
 
 # Script to match and upload files to CDS records
-
-
digitization/file_import/__init__.py

Whitespace-only changes.
digitization/file_import/file_import.py

Lines changed: 44 additions & 16 deletions

@@ -1,26 +1,54 @@
 import os
-
+import logging
 import pandas as pd
+from tqdm import tqdm
+from .utils import generate_s3_url, get_s3_client, get_s3_file_path, list_s3_files, create_custom_xml
+
+
 
-from utils import generate_s3_url, get_s3_client, get_s3_file_path, list_s3_files
+def process_row(row, box_file, s3_client):
+    record_id = str(row[0])
+    record_name = str(row[1])
 
+    record_data = {
+        'record_id': record_id,
+        'pdf_url': None,
+        'pdf_latex_url': None,
+        'tiff_urls': [],
+    }
 
-def process_row(row, box_file):
     for filetype in ['PDF', 'PDF_LATEX', 'TIFF']:
-        s3_prefix_ = get_s3_file_path(filetype=filetype,
-                                      box_file=box_file, filename=row[1])
-        files = list_s3_files('cern-archives', s3_prefix_, s3_client)
-        for file in files:
-            s3_url = generate_s3_url('cern-archives', file, s3_client=s3_client)
-            #print(f"S3 URL: {s3_url}")
+        s3_prefix = get_s3_file_path(filetype=filetype, box_file=box_file, filename=record_name)
+        files = list_s3_files('cern-archives', s3_prefix, s3_client)
+
         if not files:
-            print(f"No files found for {filetype} in {s3_prefix_}")
+            logging.info(f"[MISSING] {filetype} for record {record_name} (ID: {record_id}) in {s3_prefix}")
            continue
 
-s3_client = get_s3_client()
+        for file in files:
+            s3_url = generate_s3_url('cern-archives', file, s3_client=s3_client)
+            if filetype == 'PDF':
+                record_data['pdf_url'] = s3_url
+            elif filetype == 'PDF_LATEX':
+                record_data['pdf_latex_url'] = s3_url
+            elif filetype == 'TIFF':
+                record_data['tiff_urls'].append(s3_url)
+
+    return record_data
+
 
-data_path = os.path.join(os.getcwd(), 'data')
-for box_file in os.listdir(data_path):
-    print(f"Box file: {box_file}")
-    data = pd.read_excel(os.path.join(data_path, box_file))
-    data.apply(lambda x, box_file=box_file: process_row(x, box_file), axis=1, raw=True)
+def create_import_xml_files(data_path, output_path):
+    logging.basicConfig(filename=os.path.join(output_path, 'missing_records.log'), level=logging.INFO,
+                        format='%(asctime)s - %(message)s')
+    s3_client = get_s3_client()
+    xml_output_path = os.path.join(output_path, 'import_xml_files')
+    os.makedirs(xml_output_path, exist_ok=True)
+    for box_file in os.listdir(data_path):
+        df = pd.read_excel(os.path.join(data_path, box_file), header=None)
+        records_data = []
+        for _, row in tqdm(df.iterrows(), total=df.shape[0], desc=f"Processing {box_file}"):
+            records_data.append(process_row(row, box_file, s3_client))
+        xml_filename = os.path.splitext(box_file)[0] + ".xml"
+        xml_path = os.path.join(xml_output_path, xml_filename)
+        create_custom_xml(records_data, xml_path)
+        print(f"✅ XML written: {xml_path}")
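Because the spreadsheets are read with `header=None`, `process_row` relies on a positional contract: column 0 is the CDS record ID and column 1 is the filename stem used to build the S3 prefix. A small sketch of that contract with made-up values; the commented dict mirrors the `record_data` shape returned above:

```python
# Sketch of the positional row contract; all values are invented.
import pandas as pd

df = pd.DataFrame([[123456, "CERN-ARCH-BOX-001-doc"]])  # [record_id, filename]
_, row = next(df.iterrows())

record_id, record_name = str(row[0]), str(row[1])
# With an S3 client wired up, process_row(row, box_file, s3_client)
# would return something like:
# {
#     "record_id": "123456",
#     "pdf_url": "<presigned URL or None>",
#     "pdf_latex_url": None,
#     "tiff_urls": ["<one presigned URL per TIFF>"],
# }
```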

digitization/file_import/utils.py

Lines changed: 30 additions & 3 deletions
@@ -1,8 +1,8 @@
 import os
-
 import boto3
 from dotenv import load_dotenv
-
+import xml.etree.ElementTree as ET
+from xml.dom import minidom
 load_dotenv()
 
 
@@ -30,7 +30,7 @@ def list_s3_files(bucket_name, prefix, s3_client=None):
         Delimiter='/'
     )
     if 'Contents' in response:
-        return [obj['Key'] for obj in response['Contents'] if 
+        return [obj['Key'] for obj in response['Contents'] if
                 not obj['Key'].endswith('/')]
     else:
         return []
@@ -44,3 +44,30 @@ def generate_s3_url(bucket_name, file_key, expiration=31556952, s3_client=None):
         Params={"Bucket": bucket_name, "Key": file_key},
         ExpiresIn=expiration,
     )
+
+
+def create_custom_xml(records_data, output_file_path):
+    collection = ET.Element("collection")
+    for rec in records_data:
+        if not rec.get('pdf_url') and not rec.get('pdf_latex_url') and not rec.get('tiff_urls'):
+            continue
+        record_elem = ET.SubElement(collection, "record")
+        controlfield = ET.SubElement(record_elem, "controlfield", tag="001")
+        controlfield.text = str(rec['record_id'])
+        if rec.get('pdf_url'):
+            datafield = ET.SubElement(record_elem, "datafield", tag="856", ind1="4", ind2=" ")
+            ET.SubElement(datafield, "subfield", code="u").text = rec['pdf_url']
+            ET.SubElement(datafield, "subfield", code="q").text = "PDF"
+        if rec.get('pdf_latex_url'):
+            datafield = ET.SubElement(record_elem, "datafield", tag="856", ind1="4", ind2=" ")
+            ET.SubElement(datafield, "subfield", code="u").text = rec['pdf_latex_url']
+            ET.SubElement(datafield, "subfield", code="q").text = "PDF_LATEX"
+        for tiff_url in rec.get('tiff_urls', []):
+            datafield = ET.SubElement(record_elem, "datafield", tag="856", ind1="4", ind2=" ")
+            ET.SubElement(datafield, "subfield", code="u").text = tiff_url
+            ET.SubElement(datafield, "subfield", code="q").text = "TIFF"
+    rough_string = ET.tostring(collection, encoding="utf-8")
+    reparsed = minidom.parseString(rough_string)
+    pretty_xml = reparsed.toprettyxml(indent=" ")
+    with open(output_file_path, "w", encoding="utf-8") as f:
+        f.write(pretty_xml)
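For reference, a minimal usage sketch of `create_custom_xml`; the record values are invented, and in real use the URLs come from `generate_s3_url`. Records with no resolved files are skipped, and each URL becomes its own MARC 856 datafield under a single `<collection>` root:

```python
# Sketch: one record with a PDF and one TIFF page; values are made up.
from digitization.file_import.utils import create_custom_xml

records = [
    {
        "record_id": "1234567",                       # hypothetical record ID
        "pdf_url": "https://example.org/doc.pdf",     # presigned URL in real use
        "pdf_latex_url": None,                        # falsy -> no 856 written
        "tiff_urls": ["https://example.org/p1.tiff"],
    }
]
create_custom_xml(records, "sample.xml")
# sample.xml now holds a pretty-printed <collection> with one <record>:
# a 001 controlfield plus an 856 datafield (subfields u and q) per URL.
```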
