1 | 1 | import click |
2 | | -import logging |
3 | | -import os |
4 | | -import pysftp |
5 | | -import re |
6 | | -import shutil |
7 | | -import xml.etree.ElementTree as ET |
| 2 | +from .file_import.file_import import create_import_xml_files |
8 | 3 | from .xml_collect.xml_collect import records_collection |
9 | | - |
10 | | -URL = "https://digitization.web.cern.ch" |
11 | | - |
12 | | -main_directory = ( |
13 | | - "/eos/project-p/psdigitization/public/CERN-Project-Files/CERN-Project-Files/www" |
| 4 | +from .xml_collect.utils import ( |
| 5 | + download_files_from_ftp, |
| 6 | + fix_white_spaces_in_directory, |
| 7 | + find_all_xmls, |
| 8 | + records_collection_creation, |
14 | 9 | ) |
15 | | -ERROR = [] |
16 | | -MISSING_XMLS = [] |
17 | | -REGEXP = r"(?!original)([\w\W]+)\.(xml)" |
18 | | -MAX_NUMBER_OF_RECORDS_COLLECT = 500 |
19 | | - |
20 | | - |
21 | | -def url_from_eos_path(path): |
22 | | - return path.replace(main_directory, URL) |
23 | | - |
24 | | - |
25 | | -def file_list_chunker(files, chunk_size=MAX_NUMBER_OF_RECORDS_COLLECT): |
26 | | - for i in range(0, len(files), chunk_size): |
27 | | - yield files[i : i + chunk_size] |
28 | | - |
29 | | - |
30 | | -def records_collection_creation(input_dir, output_dir): |
31 | | - logging.info(f"Creating collection file for {input_dir}") |
32 | | - file_list = [ |
33 | | - os.path.join(root, _file) |
34 | | - for root, _, files in os.walk(input_dir, topdown=False) |
35 | | - for _file in files |
36 | | - if re.match(REGEXP, _file) |
37 | | - ] |
38 | | - |
39 | | - logging.info( |
40 | | - f"All files to be combined found: {len(file_list)}. Will generate {len(file_list) // MAX_NUMBER_OF_RECORDS_COLLECT} collection files." |
41 | | - ) |
42 | | - |
43 | | - chunks = list(file_list_chunker(file_list)) |
44 | | - |
45 | | - os.makedirs(output_dir, exist_ok=True) |
46 | | - |
47 | | - for collection_file_name, chunk in enumerate(chunks, start=1): |
48 | | - filename = f"{output_dir}/{collection_file_name}.xml" |
49 | | - |
50 | | - with open(filename, "w") as nf: |
51 | | - nf.write("<collection>") |
52 | | - for file_path in chunk: |
53 | | - logging.info(f"Processing {file_path}") |
54 | | - try: |
55 | | - with open(file_path, "r") as f: |
56 | | - data = f.read() |
57 | | - except Exception as e: |
58 | | - logging.error(f"Error while reading file {file_path}: {e}") |
59 | | - continue |
60 | | - try: |
61 | | - data = data.replace("<collection>", "").replace("</collection>", "") |
62 | | - nf.write(data) |
63 | | - except Exception as e: |
64 | | - logging.error(f"Error while writing file {file_path}: {e}") |
65 | | - continue |
66 | | - |
67 | | - nf.write("</collection>") |
68 | | - |
69 | | - logging.info(f"Collection {collection_file_name} written successfully.") |
70 | | - |
71 | | - |
72 | | -def fix_white_spaces_in_directory(start_dir): |
73 | | - for root, dirs, files in os.walk(start_dir, topdown=False): |
74 | | - for directory in dirs: |
75 | | - if " " in directory: |
76 | | - new_directory = directory.replace(" ", "_") |
77 | | - os.rename( |
78 | | - os.path.join(root, directory), os.path.join(root, new_directory) |
79 | | - ) |
80 | | - print(f"Renamed directory: {directory} -> {new_directory}") |
81 | | - for filename in files: |
82 | | - if " " in filename: |
83 | | - new_filename = filename.replace(" ", "_") |
84 | | - os.rename( |
85 | | - os.path.join(root, filename), os.path.join(root, new_filename) |
86 | | - ) |
87 | | - print(f"Renamed file: {filename} -> {new_filename}") |
88 | | - |
89 | | - |
90 | | -def download_files_from_ftp(force=False): |
91 | | - host = os.getenv("FTP_HOST") |
92 | | - username = os.getenv("FTP_USERNAME") |
93 | | - password = os.getenv("FTP_PASSWORD") |
94 | | - |
95 | | - main_directory = os.getenv("FTP_ROOT_PATH", "/CERN-Project-Files") |
96 | | - download_directory = os.getenv("DOWNLOAD_DIR", "/tmp/") |
97 | | - |
98 | | - cnopts = pysftp.CnOpts() |
99 | | - cnopts.hostkeys = None |
100 | | - downloaded_directories = [] |
101 | | - with pysftp.Connection( |
102 | | - host=host, username=username, password=password, cnopts=cnopts |
103 | | - ) as sftp: |
104 | | - sftp.cwd(main_directory) |
105 | | - directory_structure = sftp.listdir_attr() |
106 | | - for attr in directory_structure: |
107 | | - if force or not os.path.isdir( |
108 | | - os.path.join(download_directory, attr.filename) |
109 | | - ): |
110 | | - click.echo(f"Downloading `{attr.filename}`.") |
111 | | - remoteFilePath = attr.filename |
112 | | - localFilePath = download_directory |
113 | | - sftp.get_r(remoteFilePath, localFilePath, preserve_mtime=True) |
114 | | - downloaded_directories.append(attr.filename) |
115 | | - else: |
116 | | - click.echo( |
117 | | - f"Directory already exists`{attr.filename}`. Skip downloading..." |
118 | | - ) |
119 | | - return downloaded_directories |
120 | | - |
121 | | - |
122 | | -def fix_xml(root, xml_path, tif_path, pdf_path): |
123 | | - xml_file_name = os.path.basename(xml_path) |
124 | | - xml_file_name_original = "original_{}".format(xml_file_name) |
125 | | - xml_path_original = os.path.join(root, xml_file_name_original) |
126 | | - |
127 | | - if os.path.isfile(xml_path): |
128 | | - if os.path.isfile(xml_path_original): |
129 | | - click.echo("We have original") |
130 | | - os.remove(xml_path) |
131 | | - shutil.copy2( |
132 | | - os.path.join(root, "original_{}".format(xml_file_name)), xml_path |
133 | | - ) |
134 | | - else: |
135 | | - click.echo("Saving the original file") |
136 | | - shutil.copy2( |
137 | | - xml_path, os.path.join(root, "original_{}".format(xml_file_name)) |
138 | | - ) |
139 | | - else: |
140 | | - click.echo("XML files are missing completely!") |
141 | | - MISSING_XMLS.append(xml_path) |
142 | | - |
143 | | - pdf_url = url_from_eos_path(pdf_path) |
144 | | - tif_url = url_from_eos_path(tif_path) |
145 | | - |
146 | | - try: |
147 | | - tree = ET.parse(xml_path) |
148 | | - xml_root = tree.getroot() |
149 | | - for x in xml_root.findall('.//datafield[@tag="FFT"]'): |
150 | | - if x.find('.//subfield[@code="a"]').text.startswith("[PATH]"): |
151 | | - x.find('.//subfield[@code="a"]').text = pdf_url |
152 | | - |
153 | | - elif x.find('.//subfield[@code="a"]').text.startswith("[EOS_PATH]"): |
154 | | - x.attrib["tag"] = "856" |
155 | | - x.attrib["ind1"] = "4" |
156 | | - for child in x.getchildren(): |
157 | | - if child.attrib["code"] == "d": |
158 | | - x.remove(child) |
159 | | - if child.attrib["code"] == "a": |
160 | | - child.attrib["code"] = "u" |
161 | | - child.text = tif_url |
162 | | - if child.attrib["code"] == "t": |
163 | | - child.attrib["code"] = "q" |
164 | | - child.text = "TIFF" |
165 | | - tree.write(xml_path, encoding="utf-8") |
166 | | - except Exception: |
167 | | - ERROR.append(xml_path) |
168 | | - |
169 | | - |
170 | | -def find_all_xmls(): |
171 | | - os.chdir(main_directory) |
172 | | - for root, dirs, files in os.walk(main_directory, topdown=False): |
173 | | - try: |
174 | | - xml_path = os.path.join( |
175 | | - root, |
176 | | - next( |
177 | | - filter( |
178 | | - lambda x: not x.startswith("original_") and x.endswith(".xml"), |
179 | | - files, |
180 | | - ) |
181 | | - ), |
182 | | - ) |
183 | | - tif_path = os.path.join( |
184 | | - root, next(filter(lambda x: x.endswith(".tif"), files)) |
185 | | - ) |
186 | | - pdf_path = os.path.join( |
187 | | - root, next(filter(lambda x: x.endswith(".pdf"), files)) |
188 | | - ) |
189 | | - except StopIteration: |
190 | | - continue |
191 | | - fix_xml(root, xml_path, tif_path, pdf_path) |
192 | | - click.echo(xml_path) |
193 | | - |
194 | | - test_file = os.path.join(root, "test.xml") |
195 | | - if os.path.isfile(test_file): |
196 | | - os.remove(test_file) |
197 | | - click.echo(test_file) |
198 | | - |
| 10 | +import os |
199 | 11 | |
200 | 12 | @click.group() |
201 | 13 | def digitization(): |
202 | 14 | pass |
203 | 15 | |
204 | | - |
205 | 16 | @digitization.command() |
206 | 17 | @click.option("--force", default=False, show_default=True, is_flag=True) |
207 | 18 | @click.option("--fix-eos-paths", default=False, show_default=True, is_flag=True) |
@@ -262,5 +73,16 @@ def create_collection_file(start_from_dir, output_dir): |
262 | 73 | records_collection(start_from_dir, output_dir) |
263 | 74 | |
264 | 75 | |
| 76 | +@digitization.command("create-import-xml-files") |
| 77 | +@click.option("-d", "--data-path", type=str, required=True, help="Path to the boite data files folder.") |
| 78 | +@click.option("-o", "--output-path", type=str, required=True, help="Path to save the output logs and XML files.") |
| 79 | +def create_import_xml(data_path, output_path): |
| 80 | + """Create XML files from the given data path, plus a log file listing missing files.""" |
| 81 | + click.echo(f"Creating import XML files from {data_path} to {output_path}.") |
| 82 | + create_import_xml_files(data_path, output_path) |
| 83 | + click.echo("✅ XML files created successfully.") |
| 84 | + |
| 85 | + |
| 86 | + |
265 | 87 | if __name__ == "__main__": |
266 | 88 | digitization() |
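
For reference, a minimal sketch of exercising the new create-import-xml-files command in-process with click's test runner. The import path digitization.cli and the data/output paths below are assumptions for illustration only, not part of this change:

# Minimal sketch: invoke the new command via click's CliRunner (no shell needed).
# Assumptions: the import path of the CLI group and both directories are placeholders.
from click.testing import CliRunner

from digitization.cli import digitization  # assumed location of the click group

runner = CliRunner()
result = runner.invoke(
    digitization,
    ["create-import-xml-files", "-d", "/path/to/boite-data", "-o", "/path/to/output"],
)
print(result.exit_code)
print(result.output)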