22import glob
33import os
44import shutil
5+ from pathlib import Path
6+ from typing import List
57from tqdm import tqdm
8+ from loguru import logger
69
7-
8- from DocParser .logger import logger
910from DocParser .vrdu import utils
1011from DocParser .vrdu import renderer
1112from DocParser .vrdu import preprocess
1415from DocParser .vrdu .config import config
1516from DocParser .vrdu .quality_check import generate_quality_report
1617
# Send log records to a debug file; mode "w" truncates the file on every run.
logger.add("vrdu_debug.log", mode="w")
1819
1920
def transform_tex_to_images(main_directory: Path) -> None:
    """
    Transforms TeX files with pattern paper_*.tex in the specified directory into jpg images.

    Each matching TeX file is compiled with ``utils.compile_latex``; the PDF
    named ``<stem>.pdf`` in ``main_directory`` is then converted by
    ``utils.pdf2jpg`` into ``<main_directory>/output/<stem>/``.

    Args:
        main_directory (Path): The main directory where the TeX files are located.

    Returns:
        None
    """
    # Escape the directory so glob metacharacters in the path ([, ], *, ?)
    # are matched literally and only the file-name part acts as a pattern.
    tex_files = glob.glob(f"{glob.escape(str(main_directory))}/paper_*.tex")
    output_directory = Path(main_directory) / "output"
    for tex_file in tqdm(tex_files, desc="Converting TeX files to images"):
        logger.debug(f"[VRDU] file: {tex_file}, start transforming into images.")
        # Set colored flag based on the file name only, so a parent directory
        # whose name contains "paper_colored.tex" cannot trigger it.
        colored = Path(tex_file).name == "paper_colored.tex"
        utils.compile_latex(tex_file, colored=colored)

        # get the pdf file name
        filename_without_extension = Path(tex_file).stem
        pdf_file = Path(main_directory) / f"{filename_without_extension}.pdf"

        # convert into images
        image_directory = output_directory / filename_without_extension
        image_directory.mkdir(parents=True, exist_ok=True)
        utils.pdf2jpg(str(pdf_file), str(image_directory))
47+
48+
def get_redundant_folders(main_directory: Path) -> List[str]:
    """
    Get list of redundant folders to remove.

    The list contains every existing path matching
    ``<main_directory>/output/paper_<config.folder_prefix>*`` followed by the
    two fixed folders ``output/paper_white`` and ``output/paper_original``.
    The fixed folders are listed whether or not they exist.

    Args:
        main_directory (Path): The path of the main directory.

    Returns:
        List[str]: Folder paths as strings.
    """
    # Escape only the directory part so glob metacharacters in the path are
    # matched literally; the prefix and trailing "*" remain a pattern.
    pattern = (
        f"{glob.escape(str(main_directory))}/output/paper_{config.folder_prefix}*"
    )
    redundant_folders = glob.glob(pattern)
    redundant_folders.extend(
        [
            f"{main_directory}/output/paper_white",
            f"{main_directory}/output/paper_original",
        ]
    )
    return redundant_folders
4460
4561
def remove_redundant_stuff(main_directory: Path) -> None:
    """
    Remove redundant files and folders from the main directory.

    Deletes every ``paper_*`` entry directly inside ``main_directory`` that is
    not a real directory, then removes each folder reported by
    ``get_redundant_folders`` that exists.

    Args:
        main_directory (Path): The path of the main directory.

    Returns:
        None
    """
    # remove generated tex related files
    # (directory part escaped so glob metacharacters in the path stay literal)
    for file in glob.glob(f"{glob.escape(str(main_directory))}/paper_*"):
        # os.remove raises on a real directory; skip those, but still unlink
        # symlinks even when they point at a directory.
        if os.path.isdir(file) and not os.path.islink(file):
            continue
        os.remove(file)

    # remove useless pdf and image files
    for folder in get_redundant_folders(main_directory):
        if os.path.exists(folder):
            shutil.rmtree(folder)
7380
7481
75- def process_one_file (file_name : str ) -> None :
def check_if_already_processed(main_directory: Path) -> bool:
    """
    Check whether a quality report already exists for this paper.

    Args:
        main_directory (Path): The path of the main directory. A plain string
            path is accepted as well.

    Returns:
        bool: True if ``output/result/quality_report.json`` exists under
        ``main_directory``, False otherwise.
    """
    # Coerce to Path so string paths work, as the sibling functions do.
    quality_report_file = Path(main_directory) / "output/result/quality_report.json"
    return quality_report_file.exists()
85+
86+
87+ def process_one_file (file_name : Path ) -> None :
7688 """
7789 Process a file through multiple steps including preprocessing, rendering,
7890 transforming into images, generating annotations, and handling exceptions.
@@ -83,33 +95,32 @@ def process_one_file(file_name: str) -> None:
8395 Returns:
8496 None
8597 """
86- main_directory = os . path . dirname (file_name )
87- log .info (f"[VRDU] file: { file_name } , start processing." )
98+ main_directory = Path (file_name ). parent
99+ logger .info (f"[VRDU] file: { file_name } , start processing." )
88100
89101 # check if this paper has been processed
90- quality_report_file = os .path .join (
91- main_directory , "output/result/quality_report.json"
92- )
93- if os .path .exists (quality_report_file ):
94- log .info (f"[VRDU] file: { file_name } , paper has been processed" )
102+ if check_if_already_processed (main_directory ):
103+ logger .info (f"[VRDU] file: { file_name } , paper has been processed" )
95104 return
96105
97106 # make a copy of the original tex file
98- original_tex = os . path . join ( main_directory , "paper_original.tex" )
107+ original_tex = main_directory / "paper_original.tex"
99108 shutil .copyfile (file_name , original_tex )
100109
101110 # remove the output folder if it exists
102- output_directory = os . path . join ( main_directory , "output" )
103- if os . path . exists (output_directory ):
111+ output_directory = main_directory / "output"
112+ if output_directory . exists ():
104113 shutil .rmtree (output_directory )
105114
115+ # change the working directory to the main directory of the paper
106116 cwd = os .getcwd ()
107117
108118 try :
109119 # change the working directory to the main directory of the paper
110120 os .chdir (main_directory )
111- # create output folder
112- os .makedirs (os .path .join (main_directory , "output/result" ))
121+ # create output folder and output/result folder
122+ result_dir = output_directory / "result"
123+ result_dir .mkdir (parents = True )
113124
114125 # step 1: preprocess the paper
115126 preprocess .run (original_tex )
@@ -118,14 +129,14 @@ def process_one_file(file_name: str) -> None:
118129 vrdu_renderer = renderer .Renderer ()
119130 vrdu_renderer .render (original_tex )
120131
121- # step 2.2: compling tex into PDFs
122- log .info (
132+ # step 2.2: compiling tex into PDFs
133+ logger .info (
123134 f"[VRDU] file: { original_tex } , start transforming into images, this may take a while..."
124135 )
125136 transform_tex_to_images (main_directory )
126137
127138 # Step 3: generate annotations
128- log .info (
139+ logger .info (
129140 f"[VRDU] file: { original_tex } , start generating annotations, this may take a while..."
130141 )
131142 vrdu_layout_annotation = layout .LayoutAnnotation (original_tex )
@@ -137,12 +148,12 @@ def process_one_file(file_name: str) -> None:
137148 # generate quality report for simple debugging
138149 generate_quality_report (main_directory )
139150
140- log .info (f"[VRDU] file: { original_tex } , successfully processed." )
151+ logger .info (f"[VRDU] file: { original_tex } , successfully processed." )
141152
142153 except Exception as e :
143154 # error_type = e.__class__.__name__
144155 # error_info = str(e)
145- # log .error(
156+ # logger .error(
146157 # f"[VRDU] file: {file_name}, type: {error_type}, message: {error_info}"
147158 # )
148159 raise e
@@ -180,18 +191,18 @@ def main() -> None:
180191 Returns:
181192 None
182193 """
183- parser = argparse .ArgumentParser ()
194+ parser = argparse .ArgumentParser (
195+ description = "Process TeX files to generate annotations and images"
196+ )
184197 parser .add_argument (
185198 "-f" ,
186199 "--file_name" ,
187- type = str ,
200+ type = Path ,
188201 required = True ,
189- help = "The name of the tex file will full path " ,
202+ help = "The path to the TeX file to process " ,
190203 )
191204 args = parser .parse_args ()
192- file_name = args .file_name
193-
194- process_one_file (file_name )
205+ process_one_file (Path (args .file_name ))
195206
196207
197208if __name__ == "__main__" :
0 commit comments