Skip to content

Commit 4c92883

Browse files
committed
refactor(all): refactor the whole project for open-source
1 parent e4c3713 commit 4c92883

14 files changed

Lines changed: 1790 additions & 1955 deletions

DocParser/main.py

Lines changed: 62 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -2,10 +2,11 @@
22
import glob
33
import os
44
import shutil
5+
from pathlib import Path
6+
from typing import List
57
from tqdm import tqdm
8+
from loguru import logger
69

7-
8-
from DocParser.logger import logger
910
from DocParser.vrdu import utils
1011
from DocParser.vrdu import renderer
1112
from DocParser.vrdu import preprocess
@@ -14,65 +15,76 @@
1415
from DocParser.vrdu.config import config
1516
from DocParser.vrdu.quality_check import generate_quality_report
1617

17-
log = logger.setup_app_level_logger(file_name="vrdu_debug.log")
18+
logger.add("vrdu_debug.log", mode="w")
1819

1920

20-
def transform_tex_to_images(main_directory: str) -> None:
21+
def transform_tex_to_images(main_directory: Path) -> None:
2122
"""
2223
Transforms TeX files matching the pattern paper_*.tex in the specified directory into JPG images.
2324
2425
Args:
25-
main_directory (str): The main directory where the TeX files are located.
26+
main_directory (Path): The main directory where the TeX files are located.
2627
2728
Returns:
2829
None
2930
"""
3031
tex_files = glob.glob(f"{main_directory}/paper_*.tex")
31-
output_directory = os.path.join(main_directory, "output")
32-
for tex_file in tqdm(tex_files):
33-
log.debug(f"[VRDU] file: {tex_file}, start transforming into images.")
34-
utils.compile_latex(tex_file)
32+
output_directory = Path(main_directory) / "output"
33+
for tex_file in tqdm(tex_files, desc="Converting TeX files to images"):
34+
logger.debug(f"[VRDU] file: {tex_file}, start transforming into images.")
35+
# Set colored flag based on filename
36+
colored = "paper_colored.tex" in tex_file
37+
utils.compile_latex(tex_file, colored=colored)
3538

3639
# get the pdf file name
37-
filename_without_extension = os.path.splitext(os.path.basename(tex_file))[0]
38-
pdf_file = os.path.join(main_directory, f"{filename_without_extension}.pdf")
40+
filename_without_extension = Path(tex_file).stem
41+
pdf_file = Path(main_directory) / f"{filename_without_extension}.pdf"
3942

4043
# convert into images
41-
image_directory = os.path.join(output_directory, filename_without_extension)
42-
os.makedirs(image_directory)
43-
utils.pdf2jpg(pdf_file, image_directory)
44+
image_directory = output_directory / filename_without_extension
45+
image_directory.mkdir(parents=True, exist_ok=True)
46+
utils.pdf2jpg(str(pdf_file), str(image_directory))
47+
48+
49+
def get_redundant_folders(main_directory: Path) -> List[str]:
50+
"""Get list of redundant folders to remove."""
51+
pattern = f"{main_directory}/output/paper_{config.folder_prefix}*"
52+
redundant_folders = glob.glob(pattern)
53+
redundant_folders.extend(
54+
[
55+
f"{main_directory}/output/paper_white",
56+
f"{main_directory}/output/paper_original",
57+
]
58+
)
59+
return redundant_folders
4460

4561

46-
def remove_redundant_stuff(main_directory: str) -> None:
62+
def remove_redundant_stuff(main_directory: Path) -> None:
4763
"""
4864
Remove redundant files and folders from the main directory.
4965
5066
Args:
51-
main_directory (str): The path of the main directory.
67+
main_directory (Path): The path of the main directory.
5268
5369
Returns:
5470
None
5571
"""
5672
# remove generated tex related files
57-
redundant_files = glob.glob(f"{main_directory}/paper_*")
58-
for file in redundant_files:
73+
for file in glob.glob(f"{main_directory}/paper_*"):
5974
os.remove(file)
6075

6176
# remove useless pdf and image files
62-
# TODO: move this name pattern into config
63-
redundant_folders = glob.glob(
64-
f"{main_directory}/output/paper_{config.folder_prefix}*"
65-
)
66-
redundant_folders += [
67-
f"{main_directory}/output/paper_white",
68-
f"{main_directory}/output/paper_original",
69-
]
70-
for folder in redundant_folders:
77+
for folder in get_redundant_folders(main_directory):
7178
if os.path.exists(folder):
7279
shutil.rmtree(folder)
7380

7481

75-
def process_one_file(file_name: str) -> None:
82+
def check_if_already_processed(main_directory: Path) -> bool:
83+
quality_report_file = main_directory / "output/result/quality_report.json"
84+
return quality_report_file.exists()
85+
86+
87+
def process_one_file(file_name: Path) -> None:
7688
"""
7789
Process a file through multiple steps including preprocessing, rendering,
7890
transforming into images, generating annotations, and handling exceptions.
@@ -83,33 +95,32 @@ def process_one_file(file_name: str) -> None:
8395
Returns:
8496
None
8597
"""
86-
main_directory = os.path.dirname(file_name)
87-
log.info(f"[VRDU] file: {file_name}, start processing.")
98+
main_directory = Path(file_name).parent
99+
logger.info(f"[VRDU] file: {file_name}, start processing.")
88100

89101
# check if this paper has been processed
90-
quality_report_file = os.path.join(
91-
main_directory, "output/result/quality_report.json"
92-
)
93-
if os.path.exists(quality_report_file):
94-
log.info(f"[VRDU] file: {file_name}, paper has been processed")
102+
if check_if_already_processed(main_directory):
103+
logger.info(f"[VRDU] file: {file_name}, paper has been processed")
95104
return
96105

97106
# make a copy of the original tex file
98-
original_tex = os.path.join(main_directory, "paper_original.tex")
107+
original_tex = main_directory / "paper_original.tex"
99108
shutil.copyfile(file_name, original_tex)
100109

101110
# remove the output folder if it exists
102-
output_directory = os.path.join(main_directory, "output")
103-
if os.path.exists(output_directory):
111+
output_directory = main_directory / "output"
112+
if output_directory.exists():
104113
shutil.rmtree(output_directory)
105114

115+
# save the current working directory before changing into the paper's main directory
106116
cwd = os.getcwd()
107117

108118
try:
109119
# change the working directory to the main directory of the paper
110120
os.chdir(main_directory)
111-
# create output folder
112-
os.makedirs(os.path.join(main_directory, "output/result"))
121+
# create output folder and output/result folder
122+
result_dir = output_directory / "result"
123+
result_dir.mkdir(parents=True)
113124

114125
# step 1: preprocess the paper
115126
preprocess.run(original_tex)
@@ -118,14 +129,14 @@ def process_one_file(file_name: str) -> None:
118129
vrdu_renderer = renderer.Renderer()
119130
vrdu_renderer.render(original_tex)
120131

121-
# step 2.2: compling tex into PDFs
122-
log.info(
132+
# step 2.2: compiling tex into PDFs
133+
logger.info(
123134
f"[VRDU] file: {original_tex}, start transforming into images, this may take a while..."
124135
)
125136
transform_tex_to_images(main_directory)
126137

127138
# Step 3: generate annotations
128-
log.info(
139+
logger.info(
129140
f"[VRDU] file: {original_tex}, start generating annotations, this may take a while..."
130141
)
131142
vrdu_layout_annotation = layout.LayoutAnnotation(original_tex)
@@ -137,12 +148,12 @@ def process_one_file(file_name: str) -> None:
137148
# generate quality report for simple debugging
138149
generate_quality_report(main_directory)
139150

140-
log.info(f"[VRDU] file: {original_tex}, successfully processed.")
151+
logger.info(f"[VRDU] file: {original_tex}, successfully processed.")
141152

142153
except Exception as e:
143154
# error_type = e.__class__.__name__
144155
# error_info = str(e)
145-
# log.error(
156+
# logger.error(
146157
# f"[VRDU] file: {file_name}, type: {error_type}, message: {error_info}"
147158
# )
148159
raise e
@@ -180,18 +191,18 @@ def main() -> None:
180191
Returns:
181192
None
182193
"""
183-
parser = argparse.ArgumentParser()
194+
parser = argparse.ArgumentParser(
195+
description="Process TeX files to generate annotations and images"
196+
)
184197
parser.add_argument(
185198
"-f",
186199
"--file_name",
187-
type=str,
200+
type=Path,
188201
required=True,
189-
help="The name of the tex file will full path",
202+
help="The path to the TeX file to process",
190203
)
191204
args = parser.parse_args()
192-
file_name = args.file_name
193-
194-
process_one_file(file_name)
205+
process_one_file(Path(args.file_name))
195206

196207

197208
if __name__ == "__main__":

0 commit comments

Comments
 (0)