forked from Travvy88/DocumentGenerator_DoGe
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
152 lines (132 loc) · 6.71 KB
/
main.py
File metadata and controls
152 lines (132 loc) · 6.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import argparse
import json
import os
from pathlib import Path
from src.manager import Manager
def create_parser():
    """Build and return the CLI argument parser for the document generator."""
    usage_examples = """
Examples:
# FineWeb mode (recommended): Generate from FineWeb dataset and upload to HuggingFace
python main.py --out_dir ./output \\
--source_dataset karpathy/fineweb-edu-100b-shuffle \\
--target_dataset albertklorer/safedocs-v2 \\
--hf_token YOUR_HF_TOKEN \\
--num_documents 1000 \\
--upload_batch_size 1000
# Wikipedia mode (legacy): Generate from Wikipedia
python main.py --out_dir ./output \\
--start_page https://en.wikipedia.org/wiki/Main_Page \\
--max_urls 100
"""
    parser = argparse.ArgumentParser(
        description="Document Generator - Generate synthetic document images from text sources",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Output settings (top-level, not grouped).
    parser.add_argument('--out_dir', type=str, required=True,
                        help='Output directory for saving results')
    parser.add_argument('--remove_existing_dir', action='store_true',
                        help='If out_dir exists, delete the folder and files before creating a new one')
    parser.add_argument('--debug', action='store_true',
                        help='Enable debug mode (saves colored images with bboxes)')
    parser.add_argument('--image_size', type=int, default=244,
                        help='Size of the final images in pixels (default: 244)')

    # FineWeb/HuggingFace mode settings.
    hf_opts = parser.add_argument_group('HuggingFace Dataset Options (FineWeb mode)')
    hf_opts.add_argument('--source_dataset', type=str, default=None,
                         help='Source HuggingFace dataset name (e.g., karpathy/fineweb-edu-100b-shuffle). '
                              'If specified, uses FineWeb mode instead of Wikipedia mode.')
    hf_opts.add_argument('--target_dataset', type=str, default=None,
                         help='Target HuggingFace dataset name for upload (e.g., albertklorer/safedocs-v2)')
    hf_opts.add_argument('--hf_token', type=str, default=None,
                         help='HuggingFace API token for authentication. Can also be set via HF_TOKEN env var.')
    hf_opts.add_argument('--num_documents', type=int, default=100,
                         help='Number of documents to process from source dataset (default: 100)')
    hf_opts.add_argument('--upload_batch_size', type=int, default=1000,
                         help='Number of images to upload in each batch (default: 1000)')
    hf_opts.add_argument('--skip_documents', type=int, default=0,
                         help='Number of documents to skip from the beginning of the dataset (default: 0)')
    hf_opts.add_argument('--private_dataset', action='store_true',
                         help='Make the target dataset private on HuggingFace')

    # Wikipedia mode settings (legacy crawler).
    wiki_opts = parser.add_argument_group('Wikipedia Options (legacy mode)')
    wiki_opts.add_argument('--start_page', type=str, default='https://en.wikipedia.org/wiki/Main_Page',
                           help='Starting page URL for Wikipedia crawling (default: Wikipedia main page)')
    wiki_opts.add_argument('--languages', type=str, nargs='+', default=['en'],
                           help='Permitted Wikipedia languages. Other languages will be ignored (default: en)')
    wiki_opts.add_argument('--max_urls', type=int, default=16,
                           help='Maximum number of Wikipedia URLs to process (default: 16)')

    # Parallelism / unoserver settings.
    proc_opts = parser.add_argument_group('Processing Options')
    proc_opts.add_argument('--num_processes', type=int, default=1,
                           help='Number of parallel processes to use (default: 1)')
    proc_opts.add_argument('--max_threads', type=int, default=3,
                           help='Maximum threads inside each process (default: 3)')
    proc_opts.add_argument('--ports', type=int, nargs='+', default=[8145, 8146],
                           help='List of ports for unoserver. Need 2x num_processes ports (default: [8145, 8146])')

    return parser
def main():
    """Entry point: parse CLI args, load the docx config, and run generation.

    Runs in FineWeb mode when --source_dataset is supplied; otherwise falls
    back to the legacy Wikipedia crawling mode. Either way a Manager is
    built from the parsed options and Manager.generate() does the work.
    """
    parser = create_parser()
    args = parser.parse_args()

    # The CLI flag takes precedence over the HF_TOKEN environment variable.
    hf_token = args.hf_token or os.environ.get('HF_TOKEN')

    # A missing token only blocks the upload step, not generation -> warn, don't abort.
    if args.source_dataset and args.target_dataset and not hf_token:
        print("Warning: No HuggingFace token provided. Set --hf_token or HF_TOKEN env var for upload.")

    # Each worker process needs two unoserver ports (see the --ports help text).
    if args.num_processes * 2 > len(args.ports):
        parser.error(f"Need at least {args.num_processes * 2} ports for {args.num_processes} processes. "
                     f"Got {len(args.ports)} ports.")

    # docx_config.json is expected to live next to this script.
    config_path = Path(__file__).parent / 'docx_config.json'
    try:
        # Explicit encoding avoids locale-dependent decoding of the config file.
        with open(config_path, 'r', encoding='utf-8') as f:
            docx_config = json.load(f)
    except FileNotFoundError:
        # Exit with a clean usage error instead of an unhandled traceback.
        parser.error(f"Config file not found: {config_path}")

    # Presence of --source_dataset selects FineWeb mode; otherwise Wikipedia mode.
    if args.source_dataset:
        print("Running in FineWeb mode")
        print(f" Source: {args.source_dataset}")
        if args.target_dataset:
            print(f" Target: {args.target_dataset}")
        print(f" Documents to process: {args.num_documents}")
        print(f" Upload batch size: {args.upload_batch_size}")
        manager = Manager(
            docx_config=docx_config,
            out_dir=Path(args.out_dir),
            remove_existing_dir=args.remove_existing_dir,
            debug=args.debug,
            image_size=args.image_size,
            # FineWeb mode settings
            source_dataset=args.source_dataset,
            target_dataset=args.target_dataset,
            hf_token=hf_token,
            num_documents=args.num_documents,
            upload_batch_size=args.upload_batch_size,
            skip_documents=args.skip_documents,
            private_dataset=args.private_dataset,
            # Processing settings
            num_processes=args.num_processes,
            max_threads=args.max_threads,
            ports=tuple(args.ports)
        )
    else:
        print("Running in Wikipedia mode")
        print(f" Start page: {args.start_page}")
        print(f" Max URLs: {args.max_urls}")
        manager = Manager(
            docx_config=docx_config,
            out_dir=Path(args.out_dir),
            remove_existing_dir=args.remove_existing_dir,
            debug=args.debug,
            image_size=args.image_size,
            # Wikipedia mode settings
            start_page=args.start_page,
            languages=tuple(args.languages),
            max_urls=args.max_urls,
            # Processing settings
            num_processes=args.num_processes,
            max_threads=args.max_threads,
            ports=tuple(args.ports)
        )

    manager.generate()
if __name__ == "__main__":
main()