forked from Travvy88/DocumentGenerator_DoGe
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
152 lines (132 loc) · 6.71 KB
/
main.py
File metadata and controls
152 lines (132 loc) · 6.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import argparse
import json
import os
from pathlib import Path
from src.manager import Manager
def create_parser():
    """Build and return the CLI argument parser for the document generator."""
    usage_examples = """
Examples:
# FineWeb mode (recommended): Generate from FineWeb dataset and upload to HuggingFace
python main.py --out_dir ./output \\
--source_dataset karpathy/fineweb-edu-100b-shuffle \\
--target_dataset albertklorer/safedocs-v2 \\
--hf_token YOUR_HF_TOKEN \\
--num_documents 1000 \\
--upload_batch_size 1000
# Wikipedia mode (legacy): Generate from Wikipedia
python main.py --out_dir ./output \\
--start_page https://en.wikipedia.org/wiki/Main_Page \\
--max_urls 100
"""
    parser = argparse.ArgumentParser(
        description="Document Generator - Generate synthetic document images from text sources",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Output settings (top-level, not grouped).
    parser.add_argument('--out_dir', type=str, required=True,
                        help='Output directory for saving results')
    parser.add_argument('--remove_existing_dir', action='store_true',
                        help='If out_dir exists, delete the folder and files before creating a new one')
    parser.add_argument('--debug', action='store_true',
                        help='Enable debug mode (saves colored images with bboxes)')
    parser.add_argument('--image_size', type=int, default=244,
                        help='Size of the final images in pixels (default: 244)')

    # FineWeb/HuggingFace mode settings.
    hf_opts = parser.add_argument_group('HuggingFace Dataset Options (FineWeb mode)')
    hf_opts.add_argument('--source_dataset', type=str, default=None,
                         help='Source HuggingFace dataset name (e.g., karpathy/fineweb-edu-100b-shuffle). '
                              'If specified, uses FineWeb mode instead of Wikipedia mode.')
    hf_opts.add_argument('--target_dataset', type=str, default=None,
                         help='Target HuggingFace dataset name for upload (e.g., albertklorer/safedocs-v2)')
    hf_opts.add_argument('--hf_token', type=str, default=None,
                         help='HuggingFace API token for authentication. Can also be set via HF_TOKEN env var.')
    hf_opts.add_argument('--num_documents', type=int, default=100,
                         help='Number of documents to process from source dataset (default: 100)')
    hf_opts.add_argument('--upload_batch_size', type=int, default=1000,
                         help='Number of images to upload in each batch (default: 1000)')
    hf_opts.add_argument('--skip_documents', type=int, default=0,
                         help='Number of documents to skip from the beginning of the dataset (default: 0)')
    hf_opts.add_argument('--private_dataset', action='store_true',
                         help='Make the target dataset private on HuggingFace')

    # Wikipedia mode settings (legacy crawler).
    wiki_opts = parser.add_argument_group('Wikipedia Options (legacy mode)')
    wiki_opts.add_argument('--start_page', type=str, default='https://en.wikipedia.org/wiki/Main_Page',
                           help='Starting page URL for Wikipedia crawling (default: Wikipedia main page)')
    wiki_opts.add_argument('--languages', type=str, nargs='+', default=['en'],
                           help='Permitted Wikipedia languages. Other languages will be ignored (default: en)')
    wiki_opts.add_argument('--max_urls', type=int, default=16,
                           help='Maximum number of Wikipedia URLs to process (default: 16)')

    # Parallelism / unoserver settings.
    proc_opts = parser.add_argument_group('Processing Options')
    proc_opts.add_argument('--num_processes', type=int, default=1,
                           help='Number of parallel processes to use (default: 1)')
    proc_opts.add_argument('--max_threads', type=int, default=3,
                           help='Maximum threads inside each process (default: 3)')
    proc_opts.add_argument('--ports', type=int, nargs='+', default=[8145, 8146],
                           help='List of ports for unoserver. Need 2x num_processes ports (default: [8145, 8146])')

    return parser
def main():
    """Entry point: parse CLI args, load the docx config, and run generation.

    Runs in FineWeb mode when --source_dataset is supplied; otherwise falls
    back to the legacy Wikipedia crawling mode. Either way a Manager is
    built from the parsed options and Manager.generate() does the work.
    """
    parser = create_parser()
    args = parser.parse_args()

    # The CLI flag takes precedence over the HF_TOKEN environment variable.
    hf_token = args.hf_token or os.environ.get('HF_TOKEN')

    # A missing token only blocks the upload step, not generation -> warn, don't abort.
    if args.source_dataset and args.target_dataset and not hf_token:
        print("Warning: No HuggingFace token provided. Set --hf_token or HF_TOKEN env var for upload.")

    # Each worker process needs two unoserver ports (see the --ports help text).
    if args.num_processes * 2 > len(args.ports):
        parser.error(f"Need at least {args.num_processes * 2} ports for {args.num_processes} processes. "
                     f"Got {len(args.ports)} ports.")

    # docx_config.json is expected to live next to this script.
    config_path = Path(__file__).parent / 'docx_config.json'
    try:
        # Explicit encoding avoids locale-dependent decoding of the config file.
        with open(config_path, 'r', encoding='utf-8') as f:
            docx_config = json.load(f)
    except FileNotFoundError:
        # Exit with a clean usage error instead of an unhandled traceback.
        parser.error(f"Config file not found: {config_path}")

    # Presence of --source_dataset selects FineWeb mode; otherwise Wikipedia mode.
    if args.source_dataset:
        print("Running in FineWeb mode")
        print(f" Source: {args.source_dataset}")
        if args.target_dataset:
            print(f" Target: {args.target_dataset}")
        print(f" Documents to process: {args.num_documents}")
        print(f" Upload batch size: {args.upload_batch_size}")
        manager = Manager(
            docx_config=docx_config,
            out_dir=Path(args.out_dir),
            remove_existing_dir=args.remove_existing_dir,
            debug=args.debug,
            image_size=args.image_size,
            # FineWeb mode settings
            source_dataset=args.source_dataset,
            target_dataset=args.target_dataset,
            hf_token=hf_token,
            num_documents=args.num_documents,
            upload_batch_size=args.upload_batch_size,
            skip_documents=args.skip_documents,
            private_dataset=args.private_dataset,
            # Processing settings
            num_processes=args.num_processes,
            max_threads=args.max_threads,
            ports=tuple(args.ports)
        )
    else:
        print("Running in Wikipedia mode")
        print(f" Start page: {args.start_page}")
        print(f" Max URLs: {args.max_urls}")
        manager = Manager(
            docx_config=docx_config,
            out_dir=Path(args.out_dir),
            remove_existing_dir=args.remove_existing_dir,
            debug=args.debug,
            image_size=args.image_size,
            # Wikipedia mode settings
            start_page=args.start_page,
            languages=tuple(args.languages),
            max_urls=args.max_urls,
            # Processing settings
            num_processes=args.num_processes,
            max_threads=args.max_threads,
            ports=tuple(args.ports)
        )

    manager.generate()
if __name__ == "__main__":
main()