diff --git a/README.md b/README.md
index 40c1429..3dea499 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,8 @@ This Python project focuses on detecting and recognizing Tibetan text in images.
 3. **Inference**: Detect Tibetan text blocks in new images, including support for Staatsbibliothek zu Berlin digital collections
 4. **OCR**: Apply Tesseract OCR to the detected text blocks to extract the actual text content
 
-![Validation results](res/results_val_1.png)
+## Example of synthetic data
+![generated synthetic data](res/results_val_1.jpg)
 
 ## Quick Start Guide
 
@@ -17,8 +18,8 @@
 
 ```bash
 # Clone the repository
-git clone https://github.com/nih23/Tibetan-NLP.git
-cd Tibetan-NLP
+git clone https://github.com/CodexAITeam/PechaBridge
+cd PechaBridge
 
 # Install dependencies
 pip install -r requirements.txt
@@ -33,7 +34,37 @@
 
 ```bash
 # 1. Generate dataset
+python generate_training_data.py --train_samples 10 --val_samples 10 --font_path_tibetan ext/Microsoft\ Himalaya.ttf --font_path_chinese ext/simkai.ttf --dataset_name tibetan-yolo
-python generate_training_data.py --train_samples 1000 --val_samples 200 --image_size 1024
+
+# 1.5 Inspect and validate the dataset with Label Studio (optional)
+# Install Label Studio if not already installed:
+# pip install label-studio label-studio-converter
+
+# Set up environment variables for local file serving
+export LABEL_STUDIO_LOCAL_FILES_SERVING_ENABLED=true
+export LABEL_STUDIO_LOCAL_FILES_DOCUMENT_ROOT=$(pwd)/datasets/tibetan-yolo
+
+# Create classes.txt for Label Studio compatibility (printf is used because plain echo does not expand \n)
+printf "tibetan_no\ntext_body\nchinese_no\n" > datasets/tibetan-yolo/train/classes.txt
+printf "tibetan_no\ntext_body\nchinese_no\n" > datasets/tibetan-yolo/val/classes.txt
+
+# Convert YOLO annotations to Label Studio format
+label-studio-converter import yolo -i datasets/tibetan-yolo/train -o ls-tasks.json --image-ext ".png" --image-root-url "/data/local-files/?d=train/images"
+
+# Start the Label Studio web interface (opens at http://localhost:8080)
+label-studio
+
+# In Label Studio:
+# 1. Create a new project:
+# 1.1 Go to the project settings and select Cloud Storage.
+# 1.2 Click Add Source Storage and select Local files from the Storage Type options.
+# 1.3 Set the Absolute local path to `$(pwd)/datasets/tibetan-yolo` (you need to resolve `$(pwd)` to the actual absolute path).
+# 1.4 Click Add Storage.
+# 2. Import the generated ls-tasks.json file
+# 3. Review and validate the generated annotations
+# 4. Export corrections if needed
+
+# [1] https://github.com/HumanSignal/label-studio-sdk/tree/master/src/label_studio_sdk/converter#tutorial-importing-yolo-pre-annotated-images-to-label-studio-using-local-storage
 
 # 2. Train model
 python train_model.py --epochs 100 --export
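Each label file written in step 1 holds one `class cx cy w h` row per box, with all coordinates normalized to the image size (the same format as the sample annotation files further down in this diff). As a quick sanity check before importing into Label Studio, a sketch along these lines can convert the boxes back to pixels; the label file name is hypothetical, and the 1024x361 page size matches the sample annotations:

```python
from pathlib import Path

# Hypothetical label file inside the generated dataset; point this at an actual sample.
LABELS = Path("datasets/tibetan-yolo/train/labels/example.txt")
IMG_W, IMG_H = 1024, 361  # page format used by the sample annotations in this diff

for line in LABELS.read_text().splitlines():
    cls, cx, cy, w, h = line.split()
    cx, cy, w, h = (float(v) for v in (cx, cy, w, h))
    x = (cx - w / 2) * IMG_W  # top-left x in pixels
    y = (cy - h / 2) * IMG_H  # top-left y in pixels
    print(cls, round(x), round(y), round(w * IMG_W), round(h * IMG_H))
```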
diff --git a/data/tibetan numbers/annotations/complete_layout/bg_PPN337138764X_00000005.csv b/data/tibetan numbers/annotations/complete_layout/bg_PPN337138764X_00000005.csv
new file mode 100644
index 0000000..2c91139
--- /dev/null
+++ b/data/tibetan numbers/annotations/complete_layout/bg_PPN337138764X_00000005.csv
@@ -0,0 +1,6 @@
+yolo_label,label_name,bbox_x,bbox_y,bbox_width,bbox_height,image_name,image_width,image_height
+0,tibetan_no,18,40,54,275,bg_PPN337138764X_00000005.png,1024,361
+1,illustration_left,73,40,211,276,bg_PPN337138764X_00000005.png,1024,361
+2,text_body,286,40,442,278,bg_PPN337138764X_00000005.png,1024,361
+3,illustration_right,731,40,224,277,bg_PPN337138764X_00000005.png,1024,361
+4,chinese_no,956,41,52,279,bg_PPN337138764X_00000005.png,1024,361
\ No newline at end of file
diff --git a/data/tibetan numbers/annotations/complete_layout/bg_PPN337138764X_00000005.txt b/data/tibetan numbers/annotations/complete_layout/bg_PPN337138764X_00000005.txt
new file mode 100644
index 0000000..f0ea684
--- /dev/null
+++ b/data/tibetan numbers/annotations/complete_layout/bg_PPN337138764X_00000005.txt
@@ -0,0 +1,5 @@
+0 0.044237 0.491942 0.052760 0.762097
+1 0.174513 0.493093 0.206169 0.764399
+2 0.495130 0.496546 0.431818 0.771306
+3 0.823052 0.494244 0.219156 0.766701
+4 0.958604 0.500000 0.050325 0.773609
\ No newline at end of file
diff --git a/data/tibetan numbers/annotations/illustrations/bg_PPN337138764X_00000005.csv b/data/tibetan numbers/annotations/illustrations/bg_PPN337138764X_00000005.csv
new file mode 100644
index 0000000..51de394
--- /dev/null
+++ b/data/tibetan numbers/annotations/illustrations/bg_PPN337138764X_00000005.csv
@@ -0,0 +1,6 @@
+yolo_label,label_name,bbox_x,bbox_y,bbox_width,bbox_height,image_name,image_width,image_height
+0,tibetan_no,24,28,52,304,bg_PPN337138764X_00000005.png,1024,361
+1,illustration_left,80,28,263,303,bg_PPN337138764X_00000005.png,1024,361
+2,illustration_centered,348,28,313,304,bg_PPN337138764X_00000005.png,1024,361
+3,illustration_right,668,28,267,304,bg_PPN337138764X_00000005.png,1024,361
+4,chinese_no,940,28,57,305,bg_PPN337138764X_00000005.png,1024,361
\ No newline at end of file
diff --git a/data/tibetan numbers/annotations/illustrations/bg_PPN337138764X_00000005.txt b/data/tibetan numbers/annotations/illustrations/bg_PPN337138764X_00000005.txt
new file mode 100644
index 0000000..c26f063
--- /dev/null
+++ b/data/tibetan numbers/annotations/illustrations/bg_PPN337138764X_00000005.txt
@@ -0,0 +1,5 @@
+0 0.049107 0.497698 0.051136 0.842681
+1 0.206575 0.496546 0.257305 0.840378
+2 0.493101 0.500000 0.306006 0.842681
+3 0.782873 0.497698 0.260552 0.842681
+4 0.945617 0.498849 0.055195 0.844983
\ No newline at end of file
diff --git a/data/tibetan numbers/annotations/tibetan_chinese_no/bg_PPN337138764X_00000005.csv b/data/tibetan numbers/annotations/tibetan_chinese_no/bg_PPN337138764X_00000005.csv
new file mode 100644
index 0000000..df7d615
--- /dev/null
+++ b/data/tibetan numbers/annotations/tibetan_chinese_no/bg_PPN337138764X_00000005.csv
@@ -0,0 +1,4 @@
+yolo_label,label_name,bbox_x,bbox_y,bbox_width,bbox_height,image_name,image_width,image_height
+0,tibetan_no,14,46,113,283,bg_PPN337138764X_00000005.png,1024,361
+1,text_body,130,45,772,284,bg_PPN337138764X_00000005.png,1024,361
+2,chinese_no,906,45,107,284,bg_PPN337138764X_00000005.png,1024,361
\ No newline at end of file
diff --git a/data/tibetan numbers/annotations/tibetan_chinese_no/bg_PPN337138764X_00000005.txt b/data/tibetan
numbers/annotations/tibetan_chinese_no/bg_PPN337138764X_00000005.txt new file mode 100644 index 0000000..16bc6fc --- /dev/null +++ b/data/tibetan numbers/annotations/tibetan_chinese_no/bg_PPN337138764X_00000005.txt @@ -0,0 +1,3 @@ +0 0.068994 0.519570 0.110390 0.785121 +1 0.504464 0.518419 0.754058 0.787423 +2 0.937094 0.518419 0.104708 0.787423 \ No newline at end of file diff --git a/data/tibetan numbers/backgrounds/bg_IMG_5086.jpg b/data/tibetan numbers/backgrounds/bg_IMG_5086.jpg new file mode 100644 index 0000000..911d5c9 Binary files /dev/null and b/data/tibetan numbers/backgrounds/bg_IMG_5086.jpg differ diff --git a/data/tibetan numbers/backgrounds/bg_PPN3371387534_00000007.jpg b/data/tibetan numbers/backgrounds/bg_PPN3371387534_00000007.jpg new file mode 100644 index 0000000..e2c3501 Binary files /dev/null and b/data/tibetan numbers/backgrounds/bg_PPN3371387534_00000007.jpg differ diff --git a/data/tibetan numbers/backgrounds/bg_PPN337138764X_00000005.jpg b/data/tibetan numbers/backgrounds/bg_PPN337138764X_00000005.jpg new file mode 100644 index 0000000..6eb6675 Binary files /dev/null and b/data/tibetan numbers/backgrounds/bg_PPN337138764X_00000005.jpg differ diff --git a/data/tibetan numbers/backgrounds/bg_PPN3371388603_00000004.jpg b/data/tibetan numbers/backgrounds/bg_PPN3371388603_00000004.jpg new file mode 100644 index 0000000..d4bb148 Binary files /dev/null and b/data/tibetan numbers/backgrounds/bg_PPN3371388603_00000004.jpg differ diff --git a/data/tibetan numbers/backgrounds/bg_PPN3371389286_00000003.jpg b/data/tibetan numbers/backgrounds/bg_PPN3371389286_00000003.jpg new file mode 100644 index 0000000..d38e582 Binary files /dev/null and b/data/tibetan numbers/backgrounds/bg_PPN3371389286_00000003.jpg differ diff --git a/data/tibetan numbers/bg_train/Dalle_1.jpg b/data/tibetan numbers/bg_train/Dalle_1.jpg new file mode 100644 index 0000000..d23ab00 Binary files /dev/null and b/data/tibetan numbers/bg_train/Dalle_1.jpg differ diff --git a/data/tibetan numbers/bg_train/Dalle_2.jpg b/data/tibetan numbers/bg_train/Dalle_2.jpg new file mode 100644 index 0000000..831fcf6 Binary files /dev/null and b/data/tibetan numbers/bg_train/Dalle_2.jpg differ diff --git a/data/tibetan numbers/bg_val/Dalle2.jpg b/data/tibetan numbers/bg_val/Dalle2.jpg new file mode 100644 index 0000000..6ec8966 Binary files /dev/null and b/data/tibetan numbers/bg_val/Dalle2.jpg differ diff --git a/data/tibetan numbers/bg_val/Dalle3.jpg b/data/tibetan numbers/bg_val/Dalle3.jpg new file mode 100644 index 0000000..cce4abe Binary files /dev/null and b/data/tibetan numbers/bg_val/Dalle3.jpg differ diff --git a/data/tibetan numbers/bg_val/Dalle4.jpg b/data/tibetan numbers/bg_val/Dalle4.jpg new file mode 100644 index 0000000..e2c314d Binary files /dev/null and b/data/tibetan numbers/bg_val/Dalle4.jpg differ diff --git a/data/tibetan numbers/buddha_illustrations/buddha_01.png b/data/tibetan numbers/buddha_illustrations/buddha_01.png new file mode 100644 index 0000000..27d5c81 Binary files /dev/null and b/data/tibetan numbers/buddha_illustrations/buddha_01.png differ diff --git a/data/tibetan numbers/buddha_illustrations/buddha_02.png b/data/tibetan numbers/buddha_illustrations/buddha_02.png new file mode 100644 index 0000000..58041d9 Binary files /dev/null and b/data/tibetan numbers/buddha_illustrations/buddha_02.png differ diff --git a/data/tibetan numbers/buddha_illustrations/buddha_03.png b/data/tibetan numbers/buddha_illustrations/buddha_03.png new file mode 100644 index 0000000..57723f4 Binary files /dev/null 
and b/data/tibetan numbers/buddha_illustrations/buddha_03.png differ diff --git a/data/tibetan numbers/buddha_illustrations/buddha_05.png b/data/tibetan numbers/buddha_illustrations/buddha_05.png new file mode 100644 index 0000000..11d4e0d Binary files /dev/null and b/data/tibetan numbers/buddha_illustrations/buddha_05.png differ diff --git a/data/tibetan numbers/buddha_illustrations/buddha_06.png b/data/tibetan numbers/buddha_illustrations/buddha_06.png new file mode 100644 index 0000000..ea39d8e Binary files /dev/null and b/data/tibetan numbers/buddha_illustrations/buddha_06.png differ diff --git a/data/tibetan numbers/buddha_illustrations/buddha_07.png b/data/tibetan numbers/buddha_illustrations/buddha_07.png new file mode 100644 index 0000000..9c72c1b Binary files /dev/null and b/data/tibetan numbers/buddha_illustrations/buddha_07.png differ diff --git a/data/tibetan numbers/corpora/tib_no_0001.txt b/data/tibetan numbers/corpora/tib_no_0001.txt new file mode 100644 index 0000000..822f14d --- /dev/null +++ b/data/tibetan numbers/corpora/tib_no_0001.txt @@ -0,0 +1 @@ +གཅིག་ \ No newline at end of file diff --git a/data/tibetan numbers/corpora/tib_no_0002.txt b/data/tibetan numbers/corpora/tib_no_0002.txt new file mode 100644 index 0000000..c4e945d --- /dev/null +++ b/data/tibetan numbers/corpora/tib_no_0002.txt @@ -0,0 +1 @@ +གཉིས \ No newline at end of file diff --git a/data/tibetan numbers/corpora/tib_no_0003.txt b/data/tibetan numbers/corpora/tib_no_0003.txt new file mode 100644 index 0000000..71882c5 --- /dev/null +++ b/data/tibetan numbers/corpora/tib_no_0003.txt @@ -0,0 +1 @@ +གསུམ \ No newline at end of file diff --git a/data/tibetan numbers/corpora/tib_no_0004.txt b/data/tibetan numbers/corpora/tib_no_0004.txt new file mode 100644 index 0000000..f69087e --- /dev/null +++ b/data/tibetan numbers/corpora/tib_no_0004.txt @@ -0,0 +1 @@ +བཞི \ No newline at end of file diff --git a/data/tibetan numbers/corpora/tib_no_0005.txt b/data/tibetan numbers/corpora/tib_no_0005.txt new file mode 100644 index 0000000..aa5711f --- /dev/null +++ b/data/tibetan numbers/corpora/tib_no_0005.txt @@ -0,0 +1 @@ +ལྔ \ No newline at end of file diff --git a/data/tibetan numbers/corpora/tib_no_0006.txt b/data/tibetan numbers/corpora/tib_no_0006.txt new file mode 100644 index 0000000..f50b901 --- /dev/null +++ b/data/tibetan numbers/corpora/tib_no_0006.txt @@ -0,0 +1 @@ +དྲུག་ \ No newline at end of file diff --git a/data/tibetan numbers/corpora/tib_no_0007.txt b/data/tibetan numbers/corpora/tib_no_0007.txt new file mode 100644 index 0000000..11ee7bc --- /dev/null +++ b/data/tibetan numbers/corpora/tib_no_0007.txt @@ -0,0 +1 @@ +བདུན་ \ No newline at end of file diff --git a/data/tibetan numbers/corpora/tib_no_0008.txt b/data/tibetan numbers/corpora/tib_no_0008.txt new file mode 100644 index 0000000..32cceb1 --- /dev/null +++ b/data/tibetan numbers/corpora/tib_no_0008.txt @@ -0,0 +1 @@ +བརྒྱད \ No newline at end of file diff --git a/data/tibetan numbers/corpora/tib_no_0009.txt b/data/tibetan numbers/corpora/tib_no_0009.txt new file mode 100644 index 0000000..f958559 --- /dev/null +++ b/data/tibetan numbers/corpora/tib_no_0009.txt @@ -0,0 +1 @@ +དགུ \ No newline at end of file diff --git a/data/tibetan numbers/corpora/tib_no_0010.txt b/data/tibetan numbers/corpora/tib_no_0010.txt new file mode 100644 index 0000000..26946c7 --- /dev/null +++ b/data/tibetan numbers/corpora/tib_no_0010.txt @@ -0,0 +1 @@ +བཅུ \ No newline at end of file diff --git a/generate_training_data.py b/generate_training_data.py index 
40d9851..ea52593 100644
--- a/generate_training_data.py
+++ b/generate_training_data.py
@@ -1,12 +1,20 @@
 #!/usr/bin/env python3
 """
-Script for generating training data for Tibetan OCR.
-Creates synthetic images with Tibetan text for YOLO training.
+Script for generating multi-class training data for Tibetan OCR.
+Creates synthetic images with Tibetan text, Chinese numbers and general text for YOLO training.
+
+Supports 3 classes:
+- Class 0: tibetan_number_word (Tibetan numbers)
+- Class 1: tibetan_text (general Tibetan text)
+- Class 2: chinese_number_word (Chinese numbers)
 """
 from pathlib import Path
 from collections import OrderedDict
-from ultralytics.data.utils import DATASETS_DIR
+try:
+    from ultralytics.data.utils import DATASETS_DIR
+except ImportError:
+    DATASETS_DIR = "./datasets"  # Fallback if ultralytics not installed
 from tibetanDataGenerator.dataset_generator import generate_dataset
 
 # Import functions from the tibetan_utils library
@@ -15,38 +23,56 @@
 
 def main():
-    # Parse arguments
+    # Parse arguments (multi-class support)
     parser = create_generate_dataset_parser()
     args = parser.parse_args()
 
     # Set dataset path
-    datasets_dir = Path(DATASETS_DIR)
-    path = str(datasets_dir / args.dataset_name)
-    args.dataset_name = path
-    print(f"Generating YOLO dataset {args.dataset_name}...")
-
-    # Generate training dataset
-    train_dataset_dict = generate_dataset(args, validation=False)
-
-    # Generate validation dataset
-    val_dataset_dict = generate_dataset(args, validation=True)
-
-    # Combine train and val dataset information
-    dataset_dict = {
-        'path': args.dataset_name,
-        'train': 'train/images',
-        'val': 'val/images',
-        'nc': train_dataset_dict['nc'],
-        'names': train_dataset_dict['names']
-    }
-
-    # Save dataset configuration
-    yaml_path = f"{args.dataset_name}/data.yml"
-    save_yaml(dataset_dict, yaml_path)
-
-    print("Dataset generation finished.")
+    full_dataset_path = Path(args.output_dir) / args.dataset_name
+    original_dataset_name = args.dataset_name
+    args.dataset_name = str(full_dataset_path)
+
+    print(f"Generating multi-class YOLO dataset in {args.dataset_name}...")
+    print("The storage location can be changed via `yolo settings`.")
+    print("Supported classes:")
+    print("  - Class 0: tibetan_number_word (Tibetan numbers)")
+    print("  - Class 1: tibetan_text (general Tibetan text)")
+    print("  - Class 2: chinese_number_word (Chinese numbers)")
+
+    # Generate training dataset (multi-class)
+    train_dataset_info = generate_dataset(args, validation=False)
+
+    # Generate validation dataset (multi-class)
+    val_dataset_info = generate_dataset(args, validation=True)
+
+    # Multi-class YAML configuration
+    yaml_content = OrderedDict()
+    yaml_content['path'] = original_dataset_name
+    yaml_content['train'] = 'train/images'
+    yaml_content['val'] = 'val/images'
+    yaml_content['test'] = ''
+
+    if 'nc' not in train_dataset_info or 'names' not in train_dataset_info:
+        raise ValueError("generate_dataset did not return 'nc' or 'names' in its info dictionary.")
+    yaml_content['nc'] = train_dataset_info['nc']
+    yaml_content['names'] = train_dataset_info['names']
+
+    # Save the YAML configuration
+    yaml_file_path = Path(args.output_dir) / f"{original_dataset_name}.yaml"
+
+    # Register an OrderedDict representer so PyYAML preserves the key order
+    import yaml
+    def represent_ordereddict(dumper, data):
+        return dumper.represent_mapping('tag:yaml.org,2002:map', data.items())
+
+    yaml.add_representer(OrderedDict, represent_ordereddict)
+
+    with open(yaml_file_path, 'w', encoding='utf-8') as f:
+        yaml.dump(dict(yaml_content), f, sort_keys=False, allow_unicode=True)
+
+    print(f"\nMulti-class dataset generation finished. YAML configuration saved: {yaml_file_path}")
     print("Training can be started with the following command:\n")
-    print(f"yolo detect train data={yaml_path} epochs=100 imgsz=1024 model=yolov8n.pt")
+    print(f"yolo detect train data={yaml_file_path} epochs=100 imgsz=[{args.image_height},{args.image_width}] model=yolov8n.pt")
 
 
 if __name__ == "__main__":
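The script above serializes the dataset configuration as ordered YAML with the keys path, train, val, test, nc, and names. A minimal sketch for loading and checking the result, assuming the default `datasets` output directory and the `tibetan-yolo` dataset name from the Quick Start:

```python
import yaml

# Assumed output location: <output_dir>/<dataset_name>.yaml as written by generate_training_data.py
with open("datasets/tibetan-yolo.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# nc is the class count; names maps class ids to class names
assert cfg["nc"] == len(cfg["names"]), "nc must match the number of class names"
print(cfg["path"], cfg["train"], cfg["val"], cfg["nc"])
```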
diff --git a/res/results_val_1.jpg b/res/results_val_1.jpg
new file mode 100644
index 0000000..cd5463f
Binary files /dev/null and b/res/results_val_1.jpg differ
diff --git a/res/results_val_1.png b/res/results_val_1.png
deleted file mode 100644
index 9bd4798..0000000
Binary files a/res/results_val_1.png and /dev/null differ
diff --git a/res/results_val_2.png b/res/results_val_2.png
deleted file mode 100644
index 89b0ade..0000000
Binary files a/res/results_val_2.png and /dev/null differ
diff --git a/tibetanDataGenerator/README.md b/tibetanDataGenerator/README.md
new file mode 100644
index 0000000..9044cdb
--- /dev/null
+++ b/tibetanDataGenerator/README.md
@@ -0,0 +1,70 @@
+# Tibetan Text Detection Dataset Generator
+
+A tool for generating synthetic YOLO-formatted datasets for detecting Tibetan text, numbers, and their Chinese number counterparts in document images.
+
+## Features
+- Generates synthetic document images with Tibetan text, numbers, and Chinese numbers
+- Creates corresponding YOLO-format annotations
+- Maintains consistent numbering between Tibetan and Chinese number representations
+- Supports multiple text corpora with intelligent text placement
+- Includes data augmentation options (rotation, noise)
+
+## New Options
+python main.py \
+    --corpora_tibetan_numbers_path ./data/corpora/Tibetan\ Number\ Words/ \
+    --corpora_tibetan_text_path ./data/corpora/UVA\ Tibetan\ Spoken\ Corpus/ \
+    --corpora_chinese_numbers_path ./data/corpora/Chinese\ Number\ Words/ \
+    --font_path_tibetan ./fonts/Microsoft\ Himalaya.ttf \
+    --font_path_chinese ./fonts/simkai.ttf \
+    --image_width 1024 \
+    --image_height 361 \
+    --annotations_file_path ./data/annotations/tibetan_chinese_no.txt
+
+## Example Usage
+python path/to/main.py \
+    --corpora_tibetan_numbers_path "path/to/data/corpora/Tibetan Number Words" \
+    --corpora_tibetan_text_path "path/to/data/corpora/UVA Tibetan Spoken Corpus" \
+    --corpora_chinese_numbers_path "path/to/data/corpora/Chinese Number Words" \
+    --background_train "path/to/data/background_images_train" \
+    --background_val "path/to/data/background_images_val" \
+    --annotations_file_path "path/to/data/annotations/tibetan_chinese_no/bg_example_0001.txt" \
+    --font_path_tibetan "path/to/fonts/Microsoft Himalaya.ttf" \
+    --font_path_chinese "path/to/fonts/simkai.ttf" \
+    --train_samples 2 \
+    --val_samples 2
+
+## List of altered scripts
+- main.py (for correct use, move the script to the [initial project directory](https://github.com/CodexAITeam/TibetanOCR/tree/synthetic_generation_tib_chi_no))
+- dataset_generator.py => altered to dataset_generator_tib_chi_no.py
+- text_renderer.py => altered to text_renderer_img_size.py
+
+## Script Details
+The script maps the corpus path inputs from main.py to the bounding boxes of their corresponding ann_class_id (YOLO class ID) in order to render different texts with generate_dataset_tib_chi_no.py.
+The ann_class_id values are parsed from a preconfigured annotation template named bg_PPN337138764X_00000005.txt, which is located in the Tibetan Layout Analyzer project. See our [Tibetan Numbers Dataset Folder](https://github.com/CodexAITeam/TibetanLayoutAnalyzer/tree/main/data/tibetan%20numbers) for sample files. Furthermore, the script uses different background images from that project in the format 1024x361
+because this reflects the original historical data format. The argparse input font_path_tibetan is used to display generated Tibetan text, while font_path_chinese is used for Chinese text.
+
+Here is the table of the label mapping:
+
+| Class Name            | Corpus Path                     | Planned Label ID Range* | ann_class_id / YOLO Class ID |
+|-----------------------|---------------------------------|-------------------------|------------------------------|
+| Tibetan Number Words  | `corpora_tibetan_numbers_path`  | 000-009                 | 0                            |
+| Tibetan Text Body     | `corpora_tibetan_text_path`     | 101-110                 | 1                            |
+| Chinese Number Words  | `corpora_chinese_numbers_path`  | 201-210                 | 2                            |
+
+\* see Limitations
+
+The different text inputs are given by:
+- Tibetan Numbers: tib_no_0001.txt to tib_no_0010.txt: Randomly selected
+- Tibetan Text: uvrip*.txt: Randomly selected
+- Chinese Numbers: chi_no_0001.txt to chi_no_0010.txt: Selected simultaneously (for instance, chi_no_0001.txt is selected when tib_no_0001.txt is selected)
+
+See our [Corpora Folder](https://github.com/CodexAITeam/TibetanOCR/tree/synthetic_generation_tib_chi_no/data/corpora) for sample files.
+
+## Generated synthetic image sample
+- generated_sample.png
+
+## Limitations and Outline for future development
+- label_dict does not yet produce the correct Planned Label ID Ranges because it only uses the Tibetan number file labels so far.
+- Augmentations are still very limited and will be expanded.
+
+## License
+This project is licensed under the MIT License - see the [LICENSE](https://github.com/CodexAITeam/TibetanOCR/blob/synthetic_generation_tib_chi_no/LICENSE) file for details.
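The simultaneous selection described above (the Chinese number file follows the randomly chosen Tibetan number file) mirrors the pairing logic in dataset_generator.py, which extracts the numeric suffix with a regex and builds the matching Chinese file name. A minimal, self-contained sketch of the idea, with illustrative file names:

```python
import random
import re

# Illustrative sketch of the paired corpus selection; file names follow the corpora layout above.
tibetan_files = [f"tib_no_{i:04d}.txt" for i in range(1, 11)]

tib_choice = random.choice(tibetan_files)              # Tibetan number word: randomly selected
number = re.search(r"tib_no_(\d+)", tib_choice).group(1)
chi_choice = f"chi_no_{number}.txt"                    # Chinese counterpart: same index, selected simultaneously
print(tib_choice, "->", chi_choice)
```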
diff --git a/tibetanDataGenerator/data/text_renderer.py b/tibetanDataGenerator/data/text_renderer.py
index 4cdddc4..895f0de 100644
--- a/tibetanDataGenerator/data/text_renderer.py
+++ b/tibetanDataGenerator/data/text_renderer.py
@@ -31,9 +31,15 @@ def set_font(self, font_path, font_size=24):
             print("Warning: Default font used.")
         return self
 
-    def add_text(self, text, position, box_size):
+    def add_text(self, text, position, box_size, rotation=0):
         """
         Adds text to the image at a specific position with automatic bounding.
+
+        Args:
+            text: Text to render
+            position: (x, y) position
+            box_size: (width, height) of text box
+            rotation: Rotation angle in degrees (0, 90, 180, 270)
         """
         if not self.font:
             raise ValueError("Font not set. Use set_font() before adding text.")
@@ -42,26 +48,64 @@ def add_text(self, text, position, box_size):
         box_w, box_h = box_size
         max_y = box_y + box_h
 
-        wrapped_text = []
-        for line in text.split('\n'):
-            while line:
-                for i in range(1, len(line) + 1):
-                    if self.draw.textlength(line[:i], font=self.font) > box_w:
-                        break
-                else:
-                    i = len(line)
-
-                wrapped_text.append(line[:i])
-                line = line[i:]
-
-        y_offset = 0
-        for line in wrapped_text:
-            left, top, right, bottom = self.font.getbbox(line)
-            line_height = bottom - top
-            if box_y + y_offset + line_height > max_y:
-                break
-            self.draw.text((box_x, box_y + y_offset), line, font=self.font, fill=(0, 0, 0))
-            y_offset += line_height
+        if rotation == 0:
+            # Standard horizontal text rendering
+            wrapped_text = []
+            for line in text.split('\n'):
+                while line:
+                    for i in range(1, len(line) + 1):
+                        if self.draw.textlength(line[:i], font=self.font) > box_w:
+                            break
+                    else:
+                        i = len(line)
+
+                    wrapped_text.append(line[:i])
+                    line = line[i:]
+
+            y_offset = 0
+            for line in wrapped_text:
+                left, top, right, bottom = self.font.getbbox(line)
+                line_height = bottom - top
+                if box_y + y_offset + line_height > max_y:
+                    break
+                self.draw.text((box_x, box_y + y_offset), line, font=self.font, fill=(0, 0, 0))
+                y_offset += line_height
+
+        elif rotation == 90:
+            # Vertical text rendering (90 degrees clockwise)
+            # Create a temporary image for the rotated text
+            temp_img = Image.new('RGBA', (box_h, box_w), (255, 255, 255, 0))
+            temp_draw = ImageDraw.Draw(temp_img)
+
+            # Render text on the temporary image
+            wrapped_text = []
+            for line in text.split('\n'):
+                while line:
+                    for i in range(1, len(line) + 1):
+                        if temp_draw.textlength(line[:i], font=self.font) > box_h:
+                            break
+                    else:
+                        i = len(line)
+                    wrapped_text.append(line[:i])
+                    line = line[i:]
+
+            y_offset = 0
+            for line in wrapped_text:
+                left, top, right, bottom = self.font.getbbox(line)
+                line_height = bottom - top
+                if y_offset + line_height > box_w:
+                    break
+                temp_draw.text((0, y_offset), line, font=self.font, fill=(0, 0, 0))
+                y_offset += line_height
+
+            # Rotate the temporary image and paste it
+            rotated = temp_img.rotate(-90, expand=True)
+            self.image.paste(rotated, (box_x, box_y), rotated)
+
+        else:
+            # For other rotations, fall back to standard rendering
+            print(f"Warning: Rotation {rotation}° not fully supported, using 0°")
+            return self.add_text(text, position, box_size, rotation=0)
 
         return self
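The 90° branch above renders the text onto a transparent scratch image, rotates it, and pastes it back with the alpha channel as the mask. A minimal standalone sketch of that PIL technique (the default font stands in for the project's Tibetan/Chinese TTF fonts):

```python
from PIL import Image, ImageDraw, ImageFont

base = Image.new("RGB", (400, 200), "white")
font = ImageFont.load_default()  # stand-in for the project's TTF fonts

# Draw onto a fully transparent RGBA scratch image
temp = Image.new("RGBA", (120, 40), (255, 255, 255, 0))
ImageDraw.Draw(temp).text((0, 0), "sample", font=font, fill=(0, 0, 0))

rotated = temp.rotate(-90, expand=True)  # -90° rotates 90° clockwise, as in the diff
base.paste(rotated, (20, 20), rotated)   # third argument: use the alpha channel as mask
base.save("rotated_text_demo.png")
```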
+ """ + if not os.path.exists(background_path): + raise FileNotFoundError(f"Background image {background_path} not found.") + self.image = Image.open(background_path).resize(self.image.size, Image.Resampling.LANCZOS) + self.draw = ImageDraw.Draw(self.image) + return self + + def set_font(self, font_path, font_size=24): + """ + Lade eine Schriftart für das Rendern von Text. + """ + try: + self.font = ImageFont.truetype(font_path, font_size) + except IOError: + self.font = ImageFont.load_default() + print("Warning: Default font used.") + return self + + def add_text(self, text, position, box_size): + """ + Fügt Text auf dem Bild an einer bestimmten Position mit automatischer Begrenzung hinzu. + """ + if not self.font: + raise ValueError("Font not set. Use set_font() before adding text.") + + box_x, box_y = position + box_w, box_h = box_size + max_y = box_y + box_h + + wrapped_text = [] + for line in text.split('\n'): + while line: + for i in range(1, len(line) + 1): + if self.draw.textlength(line[:i], font=self.font) > box_w: + break + else: + i = len(line) + + wrapped_text.append(line[:i]) + line = line[i:] + + y_offset = 0 + for line in wrapped_text: + left, top, right, bottom = self.font.getbbox(line) + line_height = bottom - top + if box_y + y_offset + line_height > max_y: + break + self.draw.text((box_x, box_y + y_offset), line, font=self.font, fill=(0, 0, 0)) + y_offset += line_height + + return self + + def add_bounding_box(self, position, size, color=(255, 0, 0)): + """ + Zeichne eine Bounding Box auf dem Bild. + """ + x, y = position + w, h = size + self.draw.rectangle([x, y, x + w, y + h], outline=color, width=2) + return self + + def apply_augmentation(self, augmentation_strategy): + """ + Apply an augmentation strategy to the current image. + + :param augmentation_strategy: An instance of AugmentationStrategy + :return: self for method chaining + """ + if not isinstance(augmentation_strategy, AugmentationStrategy): + raise ValueError("augmentation_strategy must be an instance of AugmentationStrategy") + + self.image = augmentation_strategy.apply(self.image) + self.draw = ImageDraw.Draw(self.image) + return self + + + def save(self, output_path): + """ + Speichert das fertige Bild. + """ + os.makedirs(os.path.dirname(output_path), exist_ok=True) + self.image.save(output_path) + return self + + def show(self): + """ + Zeigt das Bild zur Vorschau an. 
+ """ + self.image.show() + return self diff --git a/tibetanDataGenerator/dataset_generator.py b/tibetanDataGenerator/dataset_generator.py index 0cda876..7af2a70 100644 --- a/tibetanDataGenerator/dataset_generator.py +++ b/tibetanDataGenerator/dataset_generator.py @@ -1,253 +1,798 @@ -import argparse -import multiprocessing -import random -import re -import os -from typing import Tuple, Dict, List - -import yaml -from pathlib import Path -from collections import OrderedDict -from ultralytics.data.utils import DATASETS_DIR -from tibetanDataGenerator.utils.data_loader import TextFactory -from tibetanDataGenerator.data.text_renderer import ImageBuilder -from tibetanDataGenerator.data.augmentation import RotateAugmentation, NoiseAugmentation -from tibetanDataGenerator.utils.bounding_box import BoundingBoxCalculator -from tibetanDataGenerator.utils.identifier import hash_current_time - -# Define a dictionary of augmentation strategies -augmentation_strategies = { - 'rotate': RotateAugmentation(), - 'noise': NoiseAugmentation() -} - - -def generate_dataset(args: argparse.Namespace, validation: bool = False) -> Dict: - """ - Generate a dataset for training or validation. - - Args: - args (argparse.Namespace): Command-line arguments. - validation (bool): Whether to generate validation dataset. Defaults to False. - - Returns: - Dict: A dictionary containing dataset information. - """ - dataset_info = _setup_dataset_info(args, validation) - label_dict = _create_label_dict(args) - background_images = _load_background_images(dataset_info['background_folder']) - - generation_args = _prepare_generation_args(args, dataset_info, label_dict, background_images) - - results = _generate_images_in_parallel(generation_args, dataset_info['no_samples']) - - return _create_dataset_dict(dataset_info['folder'], label_dict) - - -def generate_synthetic_image( - images: List[str], - label_dict: Dict[str, int], - folder_with_background: str, - folder_with_corpora: str, - folder_for_train_data: str, - debug: bool = True, - font_path: str = 'res/Microsoft Himalaya.ttf', - single_label: bool = False, - image_size: int = 1024, - augmentation: str = "noise" -) -> Tuple[str, str]: - # Constants - FONT_SIZE = 24 - BORDER_OFFSET_RATIO = 0.05 - - ctr = hash_current_time() - border_offset = int(BORDER_OFFSET_RATIO * image_size) - - # Image setup - image_path = _select_random_background(folder_with_background, images) - builder = _setup_image_builder(image_path, image_size, font_path, FONT_SIZE) - - # Text generation and positioning - text, file_name = _generate_text(folder_with_corpora) - text_position, box_position, fitted_box_size = _calculate_text_position( - text, image_size, border_offset, font_path, FONT_SIZE - ) - - # Add text and bounding box - builder.add_text(text, text_position, fitted_box_size) - if debug == True: - builder.add_bounding_box(box_position, fitted_box_size) - - # Apply augmentation - _apply_augmentation(builder, augmentation) - - # Prepare and save image and label - image_filename, label_filename = _save_image_and_label( - builder, text, ctr, folder_for_train_data, label_dict, - single_label, file_name, box_position, fitted_box_size, - image_size, debug - ) - - return image_filename, label_filename - - -def _select_random_background(folder: str, images: List[str]) -> str: - return os.path.join(folder, random.choice(images)) - - -def _setup_image_builder(image_path: str, image_size: int, font_path: str, font_size: int) -> ImageBuilder: - builder = ImageBuilder((image_size, image_size)) - 
builder.set_background(image_path) - builder.set_font(font_path, font_size=font_size) - return builder - - -def _generate_text(folder_with_corpora: str) -> Tuple[str, str]: - text_generator = TextFactory.create_text_source("corpus", folder_with_corpora) - return text_generator.generate_text() - - -def _calculate_text_position( - text: str, - image_size: int, - border_offset: int, - font_path: str, - font_size: int -) -> Tuple[Tuple[int, int], Tuple[int, int], Tuple[int, int]]: - max_box_size_w = random.randint(100, image_size) - max_box_size = (max_box_size_w, 400) - - fitted_box_size = BoundingBoxCalculator.fit(text, max_box_size, font_size=font_size, font_path=font_path) - - text_pos_x = random.randint(border_offset, image_size - (fitted_box_size[0] + border_offset)) - text_pos_y = random.randint(border_offset, image_size - (fitted_box_size[1] + border_offset)) - - text_position = (text_pos_x, text_pos_y) - box_position = (text_pos_x + int(fitted_box_size[0] / 2), text_pos_y - int(fitted_box_size[1] / 2)) - - return text_position, box_position, fitted_box_size - - -def _apply_augmentation(builder: ImageBuilder, augmentation: str): - augmentation_strategy = augmentation_strategies[augmentation.lower()] - builder.apply_augmentation(augmentation_strategy) - - -def _save_image_and_label( - builder: ImageBuilder, - text: str, - ctr: str, - folder_for_train_data: str, - label_dict: Dict[str, int], - single_label: bool, - file_name: str, - box_position: Tuple[int, int], - fitted_box_size: Tuple[int, int], - image_size: int, - debug: bool -) -> Tuple[str, str]: - label = next(iter(label_dict.keys())) if single_label else os.path.splitext(file_name)[0] - label_id = label_dict[label] - - image_filename = f"{label}_{ctr}.png" - image_path = os.path.join(folder_for_train_data, 'images', image_filename) - builder.save(image_path) - - bbox_str = _create_bbox_string(label_id, box_position, fitted_box_size, image_size) - - labels_dir = os.path.join(folder_for_train_data, 'labels') - os.makedirs(labels_dir, exist_ok=True) - - label_filename = f"{label}_{ctr}.txt" - label_path = os.path.join(labels_dir, label_filename) - with open(label_path, 'w') as f: - f.write(bbox_str) - - if debug == True: - print(f"Generated sample: {image_filename}") - print(f"Bounding boxes:\n{bbox_str}") - - return image_filename, label_filename - - -def _create_bbox_string(label_id: int, box_position: Tuple[int, int], box_size: Tuple[int, int], image_size: int) -> str: - x, y = box_position - w, h = box_size - return f"{label_id} {x / image_size} {y / image_size} {w / image_size} {h / image_size}\n" - - -def _fill_label_dict(folder_path): - label_dict = {} - label_id = 0 - - # Get all txt files - files = [f for f in os.listdir(folder_path) if f.endswith(".txt")] - - # Sort files based on the numeric part - sorted_files = sorted(files, key=lambda x: int(re.findall(r'\d+', x)[-1])) - - for filename in sorted_files: - label = os.path.splitext(filename)[0] - if label not in label_dict: - label_dict[label] = label_id - label_id += 1 - - return label_dict - - -def _setup_dataset_info(args: argparse.Namespace, validation: bool) -> Dict: - """Set up basic dataset information based on validation flag.""" - if validation: - return { - 'background_folder': args.background_val, - 'folder': f'{args.dataset_name}/val/', - 'no_samples': args.val_samples - } - else: - return { - 'background_folder': args.background_train, - 'folder': f'{args.dataset_name}/train/', - 'no_samples': args.train_samples - } - - -def _create_label_dict(args: 
argparse.Namespace) -> Dict[str, int]: - """Create a dictionary of labels based on single_label flag.""" - if args.single_label: - return {'tibetan': 0} - else: - return _fill_label_dict(args.corpora_folder) - - -def _load_background_images(folder: str) -> List[str]: - """Load background image filenames from the specified folder.""" - return [file for file in os.listdir(folder) if file.lower().endswith(('.jpg', '.png'))] - - -def _prepare_generation_args(args: argparse.Namespace, dataset_info: Dict, label_dict: Dict, - images: List[str]) -> Tuple: - """Prepare arguments for image generation.""" - return ( - images, label_dict, dataset_info['background_folder'], args.corpora_folder, - dataset_info['folder'], args.debug, args.font_path, args.single_label, - args.image_size, args.augmentation - ) - - -def _generate_images_in_parallel(generation_args: Tuple, no_samples: int) -> List: - """Generate images in parallel using multiprocessing.""" - max_parallel_calls = os.cpu_count() - with multiprocessing.Pool(max_parallel_calls) as pool: - return pool.starmap(generate_synthetic_image, [generation_args] * no_samples) - - -def _create_dataset_dict(folder: str, label_dict: Dict[str, int]) -> OrderedDict: - """Create a dictionary containing dataset information.""" - label_dict_swap = {v: k for k, v in label_dict.items()} - return OrderedDict([ - ('path', f"../{folder}"), - ('train', 'train/images'), - ('val', 'val/images'), - ('nc', len(label_dict_swap)), - ('names', label_dict_swap) - ]) \ No newline at end of file +import argparse +import multiprocessing +import random +import re +import os +import csv +import time +import traceback +from typing import Tuple, Dict, List, Optional # Added Optional + +import yaml +from pathlib import Path +from collections import OrderedDict +try: + from ultralytics.data.utils import DATASETS_DIR +except ImportError: + DATASETS_DIR = "./datasets" # Fallback if ultralytics not installed +from tibetanDataGenerator.utils.data_loader import TextFactory +from tibetanDataGenerator.data.text_renderer import ImageBuilder +from tibetanDataGenerator.data.augmentation import RotateAugmentation, NoiseAugmentation, \ + AugmentationStrategy +from tibetan_utils.image_utils import BoundingBoxCalculator +from tibetan_utils.io_utils import hash_current_time + +# Define a dictionary of augmentation strategies +augmentation_strategies: Dict[str, AugmentationStrategy] = { + 'rotate': RotateAugmentation(), + 'noise': NoiseAugmentation() +} + +def _parse_yolo_annotations(file_path: str) -> List[Tuple[int, float, float, float, float]]: + """ + Parses a YOLO annotation file. + Each line is expected to be: class_id center_x center_y width height + Returns a list of tuples (class_id, x_center, y_center, width, height). + """ + annotations = [] + if not file_path: # If file_path is None or empty string + return annotations + + if not os.path.exists(file_path): + print(f"Warning: Annotation file '{file_path}' not found. 
No annotations will be loaded from this file.") + return annotations + + try: + with open(file_path, 'r', encoding='utf-8') as f: + for i, line in enumerate(f): + line_strip = line.strip() + if not line_strip: # Skip empty lines + continue + parts = line_strip.split() + if len(parts) == 5: + try: + class_id = int(parts[0]) + x_center = float(parts[1]) + y_center = float(parts[2]) + width = float(parts[3]) + height = float(parts[4]) + + # Basic validation for YOLO coordinates (normalized) + if not (0.0 <= x_center <= 1.0 and \ + 0.0 <= y_center <= 1.0 and \ + 0.0 <= width <= 1.0 and \ + 0.0 <= height <= 1.0): + # This warning can be made conditional on debug flag if too verbose + # print(f"Debug: Annotation values out of [0,1] range in {file_path}, line {i+1}: {line_strip}") + pass + + # Ensure width and height are positive for valid bounding box + if width <= 0 or height <= 0: + print( + f"Warning: Non-positive width/height in annotation file {file_path}, line {i + 1}: {line_strip}. Skipping this annotation.") + continue + + annotations.append((class_id, x_center, y_center, width, height)) + except ValueError: + print( + f"Warning: Malformed line (numeric conversion) in annotation file {file_path}, line {i + 1}: {line_strip}") + else: # Incorrect number of parts + print( + f"Warning: Incorrect number of parts in line in annotation file {file_path}, line {i + 1}: {line_strip}") + except Exception as e: + print(f"Error reading or parsing annotation file {file_path}: {e}") + return annotations + + +def generate_dataset(args: argparse.Namespace, validation: bool = False) -> Dict: + """ + Generate a dataset for training or validation. + + Args: + args (argparse.Namespace): Command-line arguments. + validation (bool): Whether to generate validation dataset. Defaults to False. + + Returns: + Dict: A dictionary containing dataset information. + """ + print(f"Starting dataset generation (validation={validation})...") + start_time = time.time() + + dataset_info = _setup_dataset_info(args, validation) + print(f"Dataset info setup completed. Target samples: {dataset_info['no_samples']}") + + label_dict = _create_label_dict(args) + print(f"Label dictionary created with {len(label_dict)} labels: {list(label_dict.keys())}") + + background_images = _load_background_images(dataset_info['background_folder']) + print(f"Loaded {len(background_images)} background images from {dataset_info['background_folder']}") + + # _prepare_generation_args now gets annotations_file_path from args + generation_args_tuple = _prepare_generation_args(args, dataset_info, label_dict, background_images) + print("Generation arguments prepared") + + results = _generate_images_in_parallel(generation_args_tuple, dataset_info['no_samples']) + + elapsed = time.time() - start_time + successful_results = [r for r in results if r[0] and r[1]] # Filter out failed generations + print(f"Dataset generation completed in {elapsed:.1f}s. 
Success rate: {len(successful_results)}/{len(results)}") + + return _create_dataset_dict(str(dataset_info['folder']), label_dict) + + +def generate_synthetic_image( + images: List[str], + label_dict: Dict[str, int], + folder_with_background: str, + corpora_tibetan_numbers_path: str, + corpora_tibetan_text_path: str, + corpora_chinese_numbers_path: str, + folder_for_train_data: str, + debug: bool = True, + font_path_tibetan: str = 'res/Microsoft Himalaya.ttf', + font_path_chinese: str = 'res/simkai.ttf', + single_label: bool = False, + image_width: int = 1024, + image_height: int = 361, + augmentation: str = "noise", + annotations_file_path: Optional[str] = None +) -> Tuple[str, str]: + """ + Generate a synthetic image with improved error handling and resource management. + """ + try: + return _generate_synthetic_image_impl( + images, label_dict, folder_with_background, + corpora_tibetan_numbers_path, corpora_tibetan_text_path, corpora_chinese_numbers_path, + folder_for_train_data, debug, font_path_tibetan, font_path_chinese, + single_label, image_width, image_height, augmentation, annotations_file_path + ) + except Exception as e: + # Log the error and return empty paths to indicate failure + print(f"Error in generate_synthetic_image: {e}") + if debug: + import traceback + traceback.print_exc() + return "", "" + + +def _generate_synthetic_image_impl( + images: List[str], + label_dict: Dict[str, int], + folder_with_background: str, + corpora_tibetan_numbers_path: str, + corpora_tibetan_text_path: str, + corpora_chinese_numbers_path: str, + folder_for_train_data: str, + debug: bool = True, + font_path_tibetan: str = 'res/Microsoft Himalaya.ttf', + font_path_chinese: str = 'res/simkai.ttf', + single_label: bool = False, + image_width: int = 1024, + image_height: int = 361, + augmentation: str = "noise", + annotations_file_path: Optional[str] = None +) -> Tuple[str, str]: + # Font configuration + BORDER_OFFSET_RATIO = 0.05 + font_size_class1 = None + font_size_0_2 = None + + ctr = hash_current_time() + + border_offset_x = int(BORDER_OFFSET_RATIO * image_width) + border_offset_y = int(BORDER_OFFSET_RATIO * image_height) + + image_path_bg = _select_random_background(folder_with_background, images) + # Determine which font to use based on annotation class + current_font_path = font_path_tibetan # Default to Tibetan font + if annotations_file_path: + parsed_annotations = _parse_yolo_annotations(annotations_file_path) + if parsed_annotations and parsed_annotations[0][0] == 2: # Check first annotation's class_id + current_font_path = font_path_chinese + + builder = _setup_image_builder(image_path_bg, image_width, image_height, current_font_path, 24) # Default font size only used if no annotations + + bbox_str_list = [] # Collect bounding box strings for all text instances + tibetan_number_match = None # Store the matching number if we find a Tibetan number file + + # ---- Start: Draw bounding boxes from YOLO annotation file ---- + if annotations_file_path: + parsed_annotations = _parse_yolo_annotations(annotations_file_path) + for ann_class_id, norm_cx, norm_cy, norm_w, norm_h in parsed_annotations: + # Convert YOLO normalized coordinates to pixel coordinates for drawing + x_center_pixel = norm_cx * image_width + y_center_pixel = norm_cy * image_height + pixel_w = norm_w * image_width + pixel_h = norm_h * image_height + + # Calculate top-left corner for add_bounding_box + tl_x = x_center_pixel - (pixel_w / 2) + tl_y = y_center_pixel - (pixel_h / 2) + + draw_tl_pos = (int(round(tl_x)), 
int(round(tl_y))) + draw_box_size = (int(round(pixel_w)), int(round(pixel_h))) + + # Draw only if width and height are positive + if draw_box_size[0] > 0 and draw_box_size[1] > 0: + # Select the text corpus based on ann_class_id + if ann_class_id == 0: # Tibetan numbers + text, file_name_from_corpus = _generate_text(corpora_tibetan_numbers_path) + # Calculate font size for class 1 with bounding box constraints + text_for_sizing = text if text else "default" + max_font = BoundingBoxCalculator.find_max_font( + text_for_sizing, + (draw_box_size[0], draw_box_size[1]), + font_path_tibetan, + max_size=100, + debug=debug + ) + font_size_class1 = random.randint(24, max(24, min(100, max_font))) + + # Set sibling classes to be ±1-2 sizes different + delta = random.choice([-2, -1, 1, 2]) + font_size_0_2 = max(1, min(100, font_size_class1 + delta)) + + builder.set_font(font_path_tibetan, font_size_class1) + # Extract the number part from the Tibetan filename + try: + tibetan_number_match = re.search(r'tib_no_(\d+)', file_name_from_corpus) + if tibetan_number_match: + tibetan_number_match = tibetan_number_match.group(1) + except: + tibetan_number_match = None + elif ann_class_id == 1: # Tibetan text + text, file_name_from_corpus = _generate_text(corpora_tibetan_text_path) + # Calculate font size for class 1 with bounding box constraints + text_for_sizing = text if text else "default" + max_font = BoundingBoxCalculator.find_max_font( + text_for_sizing, + (draw_box_size[0], draw_box_size[1]), + font_path_tibetan, + max_size=100, + debug=debug + ) + font_size_class1 = random.randint(24, max(24, min(100, max_font))) + builder.set_font(font_path_tibetan, font_size_class1) + elif ann_class_id == 2: # Chinese numbers + # Use the same number as Tibetan if available + chinese_number = f"chi_no_{tibetan_number_match}" if tibetan_number_match else None + text, file_name_from_corpus = _generate_text(corpora_chinese_numbers_path, chinese_number) + builder.set_font(font_path_chinese, font_size_0_2) + else: + if debug: + print(f"Debug: Unknown ann_class_id {ann_class_id}. 
Skipping this annotation box.") + continue + + # Ensure the text fits within the bounding box + # Calculate actual text dimensions and centered position + actual_text_box_size = BoundingBoxCalculator.fit( + text, + draw_box_size, + font_size=font_size_class1 if ann_class_id == 1 else font_size_0_2, + font_path=current_font_path, + debug=debug + ) + # Calculate random offset based on class ID + def get_offset(box_dim, percentage): + max_offset = box_dim * percentage / 100 + return random.uniform(-max_offset, max_offset) + + # Apply different variation based on class ID + if ann_class_id in [0, 2]: # Tibetan and Chinese numbers + x_offset = get_offset(draw_box_size[0], 10) + y_offset = get_offset(draw_box_size[1], 10) + else: # Tibetan text (class 1) + x_offset = get_offset(draw_box_size[0], 10) + y_offset = get_offset(draw_box_size[1], 10) + + # Calculate centered position with random offset + base_x = draw_tl_pos[0] + (draw_box_size[0] - actual_text_box_size[0]) // 2 + base_y = draw_tl_pos[1] + (draw_box_size[1] - actual_text_box_size[1]) // 2 + + # Apply offsets and clamp to stay within bounding box + text_tl_x = int(base_x + x_offset) + text_tl_y = int(base_y + y_offset) + + # Ensure text stays within bounding box + text_tl_x = max(draw_tl_pos[0], min(text_tl_x, draw_tl_pos[0] + draw_box_size[0] - actual_text_box_size[0])) + text_tl_y = max(draw_tl_pos[1], min(text_tl_y, draw_tl_pos[1] + draw_box_size[1] - actual_text_box_size[1])) + text_render_top_left_pos = (text_tl_x, text_tl_y) + yolo_box_center_pos = (int(round(x_center_pixel)), int(round(y_center_pixel))) + + # Apply rotation for Tibetan numbers (class 0) + rotation_angle = 90 if ann_class_id == 0 else 0 + builder.add_text(text, text_render_top_left_pos, actual_text_box_size, rotation=rotation_angle) + # Get the base filename without extension + label_key = os.path.splitext(file_name_from_corpus)[0] + + # For Tibetan numbers (class 0), ensure we use the tib_no_ prefix + if ann_class_id == 0: + if not label_key.startswith('tib_no_'): + # Extract the number from the filename if it exists + try: + num_part = re.search(r'\d+', label_key).group() + label_key = f'tib_no_{num_part.zfill(4)}' # Format as tib_no_0001 + except AttributeError: + label_key = 'tib_no_0001' # Default fallback + + # For ann_class_id 0, always use 0 as the label_id + # For other classes, get label ID from dictionary or use class ID as fallback + if ann_class_id == 0: + label_id = 0 + else: + label_id = label_dict.get(label_key, ann_class_id) + if label_key not in label_dict and debug: + print(f"Debug: Label '{label_key}' not found in label_dict. Using class_id {ann_class_id}") + + bbox_str = _create_bbox_string( + label_id, + yolo_box_center_pos, + actual_text_box_size, + image_width, + image_height + ) + bbox_str_list.append(bbox_str) + + if debug: + builder.add_bounding_box(text_render_top_left_pos, actual_text_box_size, color=(0, 255, 0)) # Green + builder.add_bounding_box(draw_tl_pos, draw_box_size, color=(255, 0, 0)) # Red + + else: + if debug: + print( + f"Debug: Skipping drawing annotation box from file (class {ann_class_id}) due to non-positive dimensions: size {draw_box_size}") + + if augmentation.lower() != 'none' and augmentation.lower() in augmentation_strategies: + _apply_augmentation(builder, augmentation) + elif augmentation.lower() != 'none': + print(f"Warning: Augmentation strategy '{augmentation}' not found. 
Skipping augmentation.") + + image_filename_saved = f"{ctr}.png" + image_full_path = os.path.join(folder_for_train_data, 'images', image_filename_saved) + os.makedirs(os.path.dirname(image_full_path), exist_ok=True) + builder.save(image_full_path) + + labels_dir = os.path.join(folder_for_train_data, 'labels') + os.makedirs(labels_dir, exist_ok=True) + + label_filename_saved = f"{ctr}.txt" + label_full_path = os.path.join(labels_dir, label_filename_saved) + with open(label_full_path, 'w', encoding='utf-8') as f: + f.writelines(bbox_str_list) # Write all bounding box strings into the file + + if debug: + print(f"Generated sample: {image_full_path}") + print(f"Label file: {label_full_path}") + print(f"Bounding boxes (YOLO format for synthetic text):\n{''.join(bbox_str_list).strip()}") + + return image_full_path, label_full_path + + +def _select_random_background(folder: str, images: List[str]) -> str: + if not images: + raise ValueError(f"No images found in background folder: {folder}. Cannot select a random background.") + return os.path.join(folder, random.choice(images)) + + +def _setup_image_builder(image_path_bg: str, image_width: int, image_height: int, + font_path: str, font_size: int) -> ImageBuilder: + builder = ImageBuilder(image_size=(image_width, image_height)) + try: + if image_path_bg and os.path.exists(image_path_bg): + builder.set_background(image_path_bg) + else: + if image_path_bg: + print(f"Warning: Background image {image_path_bg} not found. Using default white background.") + except FileNotFoundError: + print(f"Warning: Background image {image_path_bg} not found during set_background. Using default white background.") + except Exception as e: + print(f"Error setting background {image_path_bg}: {e}. Using default white background.") + + # Font will be set separately during text rendering + return builder + + +def _generate_text(folder_with_corpora: str, matching_number: str = None) -> Tuple[str, str]: + text_generator = TextFactory.create_text_source("corpus", folder_with_corpora) + if matching_number: + # If a matching number is specified, try to find the exact file + matching_file = f"{matching_number}.txt" + file_path = os.path.join(folder_with_corpora, matching_file) + if os.path.exists(file_path): + with open(file_path, 'r', encoding='utf-8') as f: + text = f.read().strip() + return text, matching_file + # Fall back to random selection if no matching number or file not found + return text_generator.generate_text() + + +def _calculate_text_layout( + text: str, + image_width: int, + image_height: int, + border_offset_x: int, + border_offset_y: int, + font_path: str, + font_size: int +) -> Tuple[Tuple[int, int], Tuple[int, int], Tuple[int, int]]: + min_text_box_width = font_size * 2 + min_text_box_height = int(font_size * 1.2) + + max_width_for_text_area = image_width - 2 * border_offset_x + max_height_for_text_area = image_height - 2 * border_offset_y + + if max_width_for_text_area < min_text_box_width or max_height_for_text_area < min_text_box_height: + # Fallback: try to use at least minimal dimensions if text is very short. + # This might not be ideal if text is truly too large for the area. + # A more robust solution might involve text wrapping or scaling, but that's complex. + print( + f"Warning: Text area ({max_width_for_text_area}x{max_height_for_text_area}) might be too small for text. 
Attempting to fit.") + max_width_for_text_area = max(max_width_for_text_area, min_text_box_width) + max_height_for_text_area = max(max_height_for_text_area, min_text_box_height) + + conceptual_box_w = random.randint(min_text_box_width, max_width_for_text_area) + conceptual_box_h = random.randint(min_text_box_height, max_height_for_text_area) + max_placement_box = (conceptual_box_w, conceptual_box_h) + + actual_text_box_size = BoundingBoxCalculator.fit(text, max_placement_box, font_size=font_size, font_path=font_path, debug=False) + actual_w, actual_h = actual_text_box_size + + if actual_w <= 0 or actual_h <= 0: + print( + f"Warning: BoundingBoxCalculator.fit returned non-positive dimensions ({actual_w}x{actual_h}) for text: '{text[:50]}...'. Defaulting to minimal.") + actual_w = max(actual_w, font_size // 2 if text else 1) + actual_h = max(actual_h, font_size // 2 if text else 1) + actual_text_box_size = (actual_w, actual_h) + + pos_x_upper_bound = image_width - border_offset_x - actual_w + pos_y_upper_bound = image_height - border_offset_y - actual_h + + # Ensure random range is valid: lower_bound <= upper_bound + # If upper bound is less than lower, it means the box is too large. + # We should place it at the border_offset in such cases. + tl_pos_x = random.randint(border_offset_x, max(border_offset_x, + pos_x_upper_bound)) if pos_x_upper_bound >= border_offset_x else border_offset_x + tl_pos_y = random.randint(border_offset_y, max(border_offset_y, + pos_y_upper_bound)) if pos_y_upper_bound >= border_offset_y else border_offset_y + + text_render_top_left_pos = (tl_pos_x, tl_pos_y) + + center_x = tl_pos_x + actual_w // 2 + center_y = tl_pos_y + actual_h // 2 + yolo_box_center_pos = (center_x, center_y) + + return text_render_top_left_pos, yolo_box_center_pos, actual_text_box_size + + +def _apply_augmentation(builder: ImageBuilder, augmentation_name: str): + augmentation_strategy = augmentation_strategies[augmentation_name.lower()] + builder.apply_augmentation(augmentation_strategy) + + +def _save_image_and_label( + builder: ImageBuilder, + text_content: str, + ctr: str, + folder_for_train_data: str, + label_dict: Dict[str, int], + single_label: bool, + file_name_from_corpus: str, + yolo_box_center_pos: Tuple[int, int], + actual_text_box_size: Tuple[int, int], + image_width: int, + image_height: int, + debug: bool +) -> Tuple[str, str]: + label_str = next(iter(label_dict.keys())) if single_label else os.path.splitext(file_name_from_corpus)[0] + if label_str not in label_dict: + print( + f"Warning: Label '{label_str}' from corpus file '{file_name_from_corpus}' not found in label_dict. Defaulting to first available label.") + if not label_dict: + raise ValueError("Label dictionary is empty. 
Cannot determine a label.") + label_str = next(iter(label_dict.keys())) + label_id = label_dict[label_str] + + image_base_filename = f"{label_str}_{ctr}.png" + image_full_path = os.path.join(folder_for_train_data, 'images', image_base_filename) + builder.save(image_full_path) + + bbox_str = _create_bbox_string( + label_id, yolo_box_center_pos, actual_text_box_size, image_width, image_height + ) + + labels_dir = os.path.join(folder_for_train_data, 'labels') + os.makedirs(labels_dir, exist_ok=True) + + label_base_filename = f"{label_str}_{ctr}.txt" + label_full_path = os.path.join(labels_dir, label_base_filename) + with open(label_full_path, 'w', encoding='utf-8') as f: + f.write(bbox_str) + + if debug: + print(f"Generated sample: {image_full_path}") + print(f"Label file: {label_full_path}") + print(f"Bounding box (YOLO format for synthetic text):\n{bbox_str.strip()}") + + return image_full_path, label_full_path + + +def _create_bbox_string( + label_id: int, + box_center_xy: Tuple[int, int], + box_wh: Tuple[int, int], + image_width: int = 1024, + image_height: int = 361 +) -> str: + center_x, center_y = box_center_xy + box_w, box_h = box_wh + + if image_width == 0: raise ValueError("image_width cannot be zero.") + if image_height == 0: raise ValueError("image_height cannot be zero.") + + norm_center_x = max(0.0, min(1.0, center_x / image_width)) + norm_center_y = max(0.0, min(1.0, center_y / image_height)) + norm_w = max(0.0, min(1.0, box_w / image_width)) + norm_h = max(0.0, min(1.0, box_h / image_height)) + + return f"{label_id} {norm_center_x:.6f} {norm_center_y:.6f} {norm_w:.6f} {norm_h:.6f}\n" + + +def _fill_label_dict(folder_path: str) -> Dict[str, int]: + label_dict = OrderedDict() + label_id_counter = 0 + + if not os.path.isdir(folder_path): + print(f"Warning: Corpora folder '{folder_path}' not found. Returning empty label dict.") + return label_dict + + # Get all .txt files and sort them numerically by their suffix + files = [f for f in os.listdir(folder_path) if f.endswith(".txt") and f.startswith("tib_no_")] + + try: + # Sort files by their numeric suffix (tib_no_0001.txt -> 1) + sorted_files = sorted( + files, + key=lambda x: int(x.split("_")[-1].split(".")[0]) + ) + except (ValueError, IndexError): + print("Warning: Could not sort corpus files numerically. Using simple alphabetical sort.") + sorted_files = sorted(files) + + for filename in sorted_files: + label_name = os.path.splitext(filename)[0] # Gets 'tib_no_0001' from 'tib_no_0001.txt' + if label_name not in label_dict: + label_dict[label_name] = label_id_counter + label_id_counter += 1 + + if not label_dict: + print(f"Warning: No valid .txt files found in corpora folder '{folder_path}'. Label dictionary is empty.") + return label_dict + + +def _setup_dataset_info(args: argparse.Namespace, validation: bool) -> Dict: + base_output_folder = Path(args.dataset_name) + + if validation: + folder_path = base_output_folder / 'val' + num_samples = args.val_samples + bg_folder = args.background_val + else: + folder_path = base_output_folder / 'train' + num_samples = args.train_samples + bg_folder = args.background_train + + os.makedirs(folder_path / 'images', exist_ok=True) + os.makedirs(folder_path / 'labels', exist_ok=True) + + return { + 'background_folder': bg_folder, + 'folder': folder_path, + 'no_samples': num_samples + } + + +def _read_labels_from_csv(csv_path: str) -> Dict[str, int]: + """ + Read label names from a CSV file. + The CSV file should have columns 'yolo_label' and 'label_name'. 
+ Returns a dictionary mapping label names to their corresponding class IDs. + """ + label_dict = OrderedDict() + + try: + with open(csv_path, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + for row in reader: + if 'yolo_label' in row and 'label_name' in row: + class_id = int(row['yolo_label']) + label_name = row['label_name'] + label_dict[label_name] = class_id + except Exception as e: + print(f"Error reading CSV file {csv_path}: {e}") + + if not label_dict: + print(f"Warning: No valid labels found in CSV file '{csv_path}'. Label dictionary is empty.") + + return label_dict + +def _create_label_dict(args: argparse.Namespace) -> Dict[str, int]: + if args.single_label: + return {'tibetan': 0} + else: + # Check if annotations_file_path is provided and has a corresponding CSV file + if args.annotations_file_path and os.path.exists(args.annotations_file_path): + # Try to find the corresponding CSV file + csv_path = args.annotations_file_path.replace('.txt', '.csv') + if os.path.exists(csv_path): + return _read_labels_from_csv(csv_path) + + # Fallback to the original method if CSV doesn't exist + return _fill_label_dict(args.corpora_tibetan_numbers_path) + + +def _load_background_images(folder: str) -> List[str]: + if not os.path.isdir(folder): + print(f"Warning: Background folder '{folder}' not found. No background images will be loaded.") + return [] + return [file for file in os.listdir(folder) if file.lower().endswith(('.jpg', '.jpeg', '.png'))] + + +def _prepare_generation_args(args: argparse.Namespace, dataset_info: Dict, label_dict: Dict, + images_bg_list: List[str]) -> Tuple: + """Prepare arguments for each call to generate_synthetic_image.""" + return ( + images_bg_list, + label_dict, + dataset_info['background_folder'], + args.corpora_tibetan_numbers_path, + args.corpora_tibetan_text_path, + args.corpora_chinese_numbers_path, + dataset_info['folder'], + args.debug, + args.font_path_tibetan, + args.font_path_chinese, + args.single_label, + args.image_width, + args.image_height, + args.augmentation, + args.annotations_file_path + ) + + +def _generate_images_in_parallel(generation_args_tuple: Tuple, no_samples: int) -> List: + if no_samples <= 0: + return [] + + list_of_generation_args = [generation_args_tuple] * no_samples + # Ensure os.cpu_count() returns a valid number or default to 1 + num_cpus = os.cpu_count() + # Reduce parallel processes to avoid resource conflicts + max_parallel_calls = min((num_cpus // 2) if num_cpus and num_cpus > 2 else 1, no_samples, 4) + + if max_parallel_calls == 0: + max_parallel_calls = 1 # Ensure at least one process + + print(f"Generating {no_samples} images using {max_parallel_calls} parallel processes...") + + results = [] + pool = None + + try: + # Use spawn method to avoid potential issues with fork on some systems + ctx = multiprocessing.get_context('spawn') + pool = ctx.Pool(processes=max_parallel_calls) + + # Add timeout and progress tracking + import time + start_time = time.time() + timeout_seconds = 300 # 5 minutes timeout + + # Use starmap_async for better control + async_result = pool.starmap_async(generate_synthetic_image, list_of_generation_args) + + # Wait with timeout and progress updates + while not async_result.ready(): + elapsed = time.time() - start_time + if elapsed > timeout_seconds: + print(f"Timeout after {timeout_seconds} seconds. 
Terminating processes...") + pool.terminate() + pool.join() + raise TimeoutError(f"Image generation timed out after {timeout_seconds} seconds") + + # Show progress every 10 seconds + if int(elapsed) % 10 == 0 and elapsed > 0: + print(f"Still generating... ({elapsed:.0f}s elapsed)") + + time.sleep(1) + + results = async_result.get() + elapsed = time.time() - start_time + print(f"Successfully generated {len(results)} images in {elapsed:.1f} seconds") + + except Exception as e: + print(f"Error during parallel image generation: {e}") + if pool: + try: + pool.terminate() # Forcefully terminate worker processes + pool.join(timeout=10) # Wait max 10 seconds for cleanup + except Exception as cleanup_error: + print(f"Error during pool cleanup: {cleanup_error}") + + # Fallback to sequential processing + print("Falling back to sequential processing...") + results = _generate_images_sequentially(generation_args_tuple, no_samples) + + finally: + if pool: + try: + pool.close() + pool.join() + except Exception: + pass # Ignore cleanup errors + + return results + + +def _generate_images_sequentially(generation_args_tuple: Tuple, no_samples: int) -> List: + """Fallback sequential image generation when parallel processing fails.""" + print(f"Generating {no_samples} images sequentially...") + results = [] + start_time = time.time() + + for i in range(no_samples): + try: + if i % 10 == 0 and i > 0: + elapsed = time.time() - start_time + rate = i / elapsed if elapsed > 0 else 0 + eta = (no_samples - i) / rate if rate > 0 else 0 + print(f"Generated {i}/{no_samples} images... ({rate:.1f} img/s, ETA: {eta:.0f}s)") + + img_start = time.time() + result = generate_synthetic_image(*generation_args_tuple) + img_time = time.time() - img_start + + if result[0] and result[1]: # Check if generation was successful + results.append(result) + else: + print(f"Warning: Image {i+1} generation failed (took {img_time:.2f}s)") + + except Exception as e: + print(f"Error generating image {i+1}: {e}") + if generation_args_tuple[7]: # debug flag + traceback.print_exc() + continue + + elapsed = time.time() - start_time + success_rate = len(results) / no_samples * 100 if no_samples > 0 else 0 + print(f"Sequential generation completed: {len(results)}/{no_samples} images ({success_rate:.1f}% success) in {elapsed:.1f}s") + return results + + +def _create_dataset_dict(output_folder_str: str, label_dict: Dict[str, int]) -> OrderedDict: + # Create a mapping from class IDs to label names + # If the label_dict contains entries like {'tibetan_no': 0, 'text_body': 1, 'chinese_no': 2}, + # then class_names will be {0: 'tibetan_no', 1: 'text_body', 2: 'chinese_no'} + class_names = {} + + # First, create a reverse mapping from class IDs to label names + for label_name, class_id in label_dict.items(): + class_names[class_id] = label_name + + # Ensure we have entries for class IDs 0, 1, and 2 if they're not in the dictionary + if 0 not in class_names: + class_names[0] = 'tibetan_no' + if 1 not in class_names: + class_names[1] = 'text_body' + if 2 not in class_names: + class_names[2] = 'chinese_no' + + dataset_name_part = Path(output_folder_str).parent.name + split_name = Path(output_folder_str).name + + return OrderedDict([ + ('path', f"../{dataset_name_part}"), + (split_name, f'{split_name}/images'), + ('nc', len(class_names)), + ('names', class_names) + ]) diff --git a/tibetanDataGenerator/dataset_generator_tib_no.py b/tibetanDataGenerator/dataset_generator_tib_no.py new file mode 100644 index 0000000..4e927cd --- /dev/null +++ 
b/tibetanDataGenerator/dataset_generator_tib_no.py @@ -0,0 +1,517 @@ +import argparse +import multiprocessing +import random +import re +import os +from typing import Tuple, Dict, List, Optional # Added Optional + +import yaml +from pathlib import Path +from collections import OrderedDict +from ultralytics.data.utils import DATASETS_DIR +from tibetanDataGenerator.utils.data_loader import TextFactory +from tibetanDataGenerator.data.text_renderer_img_size import ImageBuilder +from tibetanDataGenerator.data.augmentation import RotateAugmentation, NoiseAugmentation, \ + AugmentationStrategy +from tibetanDataGenerator.utils.bounding_box import BoundingBoxCalculator +from tibetanDataGenerator.utils.identifier import hash_current_time + +# Define a dictionary of augmentation strategies +augmentation_strategies: Dict[str, AugmentationStrategy] = { + 'rotate': RotateAugmentation(), + 'noise': NoiseAugmentation() +} + +def _parse_yolo_annotations(file_path: str) -> List[Tuple[int, float, float, float, float]]: + """ + Parses a YOLO annotation file. + Each line is expected to be: class_id center_x center_y width height + Returns a list of tuples (class_id, x_center, y_center, width, height). + """ + annotations = [] + if not file_path: # If file_path is None or empty string + return annotations + + if not os.path.exists(file_path): + print(f"Warning: Annotation file '{file_path}' not found. No annotations will be loaded from this file.") + return annotations + + try: + with open(file_path, 'r', encoding='utf-8') as f: + for i, line in enumerate(f): + line_strip = line.strip() + if not line_strip: # Skip empty lines + continue + parts = line_strip.split() + if len(parts) == 5: + try: + class_id = int(parts[0]) + x_center = float(parts[1]) + y_center = float(parts[2]) + width = float(parts[3]) + height = float(parts[4]) + + # Basic validation for YOLO coordinates (normalized) + if not (0.0 <= x_center <= 1.0 and \ + 0.0 <= y_center <= 1.0 and \ + 0.0 <= width <= 1.0 and \ + 0.0 <= height <= 1.0): + # This warning can be made conditional on debug flag if too verbose + # print(f"Debug: Annotation values out of [0,1] range in {file_path}, line {i+1}: {line_strip}") + pass + + # Ensure width and height are positive for valid bounding box + if width <= 0 or height <= 0: + print( + f"Warning: Non-positive width/height in annotation file {file_path}, line {i + 1}: {line_strip}. Skipping this annotation.") + continue + + annotations.append((class_id, x_center, y_center, width, height)) + except ValueError: + print( + f"Warning: Malformed line (numeric conversion) in annotation file {file_path}, line {i + 1}: {line_strip}") + else: # Incorrect number of parts + print( + f"Warning: Incorrect number of parts in line in annotation file {file_path}, line {i + 1}: {line_strip}") + except Exception as e: + print(f"Error reading or parsing annotation file {file_path}: {e}") + return annotations + + +def generate_dataset(args: argparse.Namespace, validation: bool = False) -> Dict: + """ + Generate a dataset for training or validation. + + Args: + args (argparse.Namespace): Command-line arguments. + validation (bool): Whether to generate validation dataset. Defaults to False. + + Returns: + Dict: A dictionary containing dataset information. 
+ """ + dataset_info = _setup_dataset_info(args, validation) + label_dict = _create_label_dict(args) + background_images = _load_background_images(dataset_info['background_folder']) + + # _prepare_generation_args now gets annotations_file_path from args + generation_args_tuple = _prepare_generation_args(args, dataset_info, label_dict, background_images) + + results = _generate_images_in_parallel(generation_args_tuple, dataset_info['no_samples']) + + return _create_dataset_dict(str(dataset_info['folder']), label_dict) + + +def generate_synthetic_image( + images: List[str], + label_dict: Dict[str, int], + folder_with_background: str, + corpora_tibetan_numbers_path: str, + corpora_tibetan_text_path: str, + corpora_chinese_numbers_path: str, + folder_for_train_data: str, + debug: bool = True, + font_path: str = 'res/Microsoft Himalaya.ttf', + single_label: bool = False, + image_width: int = 1024, + image_height: int = 361, + augmentation: str = "noise", + annotations_file_path: Optional[str] = None # <<< NEW ARGUMENT +) -> Tuple[str, str]: + # Constants + FONT_SIZE = 24 + BORDER_OFFSET_RATIO = 0.05 + + ctr = hash_current_time() + + border_offset_x = int(BORDER_OFFSET_RATIO * image_width) + border_offset_y = int(BORDER_OFFSET_RATIO * image_height) + + image_path_bg = _select_random_background(folder_with_background, images) + builder = _setup_image_builder(image_path_bg, image_width, image_height, font_path, FONT_SIZE) + + bbox_str_list = [] # Collect bounding box strings for all text instances + + # ---- Start: Draw bounding boxes from YOLO annotation file ---- + if annotations_file_path: + parsed_annotations = _parse_yolo_annotations(annotations_file_path) + for ann_class_id, norm_cx, norm_cy, norm_w, norm_h in parsed_annotations: + # Convert YOLO normalized coordinates to pixel coordinates for drawing + x_center_pixel = norm_cx * image_width + y_center_pixel = norm_cy * image_height + pixel_w = norm_w * image_width + pixel_h = norm_h * image_height + + # Calculate top-left corner for add_bounding_box + tl_x = x_center_pixel - (pixel_w / 2) + tl_y = y_center_pixel - (pixel_h / 2) + + draw_tl_pos = (int(round(tl_x)), int(round(tl_y))) + draw_box_size = (int(round(pixel_w)), int(round(pixel_h))) + + # Draw only if width and height are positive + if draw_box_size[0] > 0 and draw_box_size[1] > 0: + # Select the text corpus based on ann_class_id + if ann_class_id == 0: + text, file_name_from_corpus = _generate_text(corpora_tibetan_numbers_path) + elif ann_class_id == 1: + text, file_name_from_corpus = _generate_text(corpora_tibetan_text_path) + elif ann_class_id == 2: + text, file_name_from_corpus = _generate_text(corpora_chinese_numbers_path) + else: + if debug: + print(f"Debug: Unknown ann_class_id {ann_class_id}. 
Skipping this annotation box.") + continue + + # Ensure the text fits within the bounding box + text_render_top_left_pos = (draw_tl_pos[0], draw_tl_pos[1]) + yolo_box_center_pos = (int(round(x_center_pixel)), int(round(y_center_pixel))) + actual_text_box_size = (draw_box_size[0], draw_box_size[1]) + + builder.add_text(text, text_render_top_left_pos, actual_text_box_size) + # Get the base filename without extension + label_key = os.path.splitext(file_name_from_corpus)[0] + + # For Tibetan numbers (class 0), ensure we use the tib_no_ prefix + if ann_class_id == 0: + if not label_key.startswith('tib_no_'): + # Extract the number from the filename if it exists + try: + num_part = re.search(r'\d+', label_key).group() + label_key = f'tib_no_{num_part.zfill(4)}' # Format as tib_no_0001 + except AttributeError: + label_key = 'tib_no_0001' # Default fallback + + # Get label ID from dictionary or use class ID as fallback + label_id = label_dict.get(label_key, ann_class_id) + if label_key not in label_dict and debug: + print(f"Debug: Label '{label_key}' not found in label_dict. Using class_id {ann_class_id}") + + bbox_str = _create_bbox_string( + label_id, + yolo_box_center_pos, + actual_text_box_size, + image_width, + image_height + ) + bbox_str_list.append(bbox_str) + + if debug: + builder.add_bounding_box(text_render_top_left_pos, actual_text_box_size, color=(0, 255, 0)) # Green + builder.add_bounding_box(draw_tl_pos, draw_box_size, color=(255, 0, 0)) # Red + + else: + if debug: + print( + f"Debug: Skipping drawing annotation box from file (class {ann_class_id}) due to non-positive dimensions: size {draw_box_size}") + + if augmentation.lower() != 'none' and augmentation.lower() in augmentation_strategies: + _apply_augmentation(builder, augmentation) + elif augmentation.lower() != 'none': + print(f"Warning: Augmentation strategy '{augmentation}' not found. Skipping augmentation.") + + image_filename_saved = f"{ctr}.png" + image_full_path = os.path.join(folder_for_train_data, 'images', image_filename_saved) + os.makedirs(os.path.dirname(image_full_path), exist_ok=True) + builder.save(image_full_path) + + labels_dir = os.path.join(folder_for_train_data, 'labels') + os.makedirs(labels_dir, exist_ok=True) + + label_filename_saved = f"{ctr}.txt" + label_full_path = os.path.join(labels_dir, label_filename_saved) + with open(label_full_path, 'w', encoding='utf-8') as f: + f.writelines(bbox_str_list) # Write all bounding box strings into the file + + if debug: + print(f"Generated sample: {image_full_path}") + print(f"Label file: {label_full_path}") + print(f"Bounding boxes (YOLO format for synthetic text):\n{''.join(bbox_str_list).strip()}") + + return image_full_path, label_full_path + + +def _select_random_background(folder: str, images: List[str]) -> str: + if not images: + raise ValueError(f"No images found in background folder: {folder}. Cannot select a random background.") + return os.path.join(folder, random.choice(images)) + + +def _setup_image_builder(image_path_bg: str, image_width: int, image_height: int, font_path: str, + font_size: int) -> ImageBuilder: + builder = ImageBuilder(image_size=(image_width, image_height)) + try: + if image_path_bg and os.path.exists(image_path_bg): + builder.set_background(image_path_bg) + else: + if image_path_bg: + print(f"Warning: Background image {image_path_bg} not found. Using default white background.") + except FileNotFoundError: + print( + f"Warning: Background image {image_path_bg} not found during set_background. 
Using default white background.") + except Exception as e: + print(f"Error setting background {image_path_bg}: {e}. Using default white background.") + + builder.set_font(font_path, font_size=font_size) + return builder + + +def _generate_text(folder_with_corpora: str) -> Tuple[str, str]: + text_generator = TextFactory.create_text_source("corpus", folder_with_corpora) + return text_generator.generate_text() + + +def _calculate_text_layout( + text: str, + image_width: int, + image_height: int, + border_offset_x: int, + border_offset_y: int, + font_path: str, + font_size: int +) -> Tuple[Tuple[int, int], Tuple[int, int], Tuple[int, int]]: + min_text_box_width = font_size * 2 + min_text_box_height = int(font_size * 1.2) + + max_width_for_text_area = image_width - 2 * border_offset_x + max_height_for_text_area = image_height - 2 * border_offset_y + + if max_width_for_text_area < min_text_box_width or max_height_for_text_area < min_text_box_height: + # Fallback: try to use at least minimal dimensions if text is very short. + # This might not be ideal if text is truly too large for the area. + # A more robust solution might involve text wrapping or scaling, but that's complex. + print( + f"Warning: Text area ({max_width_for_text_area}x{max_height_for_text_area}) might be too small for text. Attempting to fit.") + max_width_for_text_area = max(max_width_for_text_area, min_text_box_width) + max_height_for_text_area = max(max_height_for_text_area, min_text_box_height) + + conceptual_box_w = random.randint(min_text_box_width, max_width_for_text_area) + conceptual_box_h = random.randint(min_text_box_height, max_height_for_text_area) + max_placement_box = (conceptual_box_w, conceptual_box_h) + + actual_text_box_size = BoundingBoxCalculator.fit(text, max_placement_box, font_size=font_size, font_path=font_path) + actual_w, actual_h = actual_text_box_size + + if actual_w <= 0 or actual_h <= 0: + print( + f"Warning: BoundingBoxCalculator.fit returned non-positive dimensions ({actual_w}x{actual_h}) for text: '{text[:50]}...'. Defaulting to minimal.") + actual_w = max(actual_w, font_size // 2 if text else 1) + actual_h = max(actual_h, font_size // 2 if text else 1) + actual_text_box_size = (actual_w, actual_h) + + pos_x_upper_bound = image_width - border_offset_x - actual_w + pos_y_upper_bound = image_height - border_offset_y - actual_h + + # Ensure random range is valid: lower_bound <= upper_bound + # If upper bound is less than lower, it means the box is too large. + # We should place it at the border_offset in such cases. 
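+ # Example with the defaults: the caller passes border_offset_x = int(0.05 * 1024) = 51; + # a fitted text width of 980 px then gives pos_x_upper_bound = 1024 - 51 - 980 = -7, + # so randint would see an empty range and the box is pinned to the border offset instead.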
+ tl_pos_x = random.randint(border_offset_x, max(border_offset_x, + pos_x_upper_bound)) if pos_x_upper_bound >= border_offset_x else border_offset_x + tl_pos_y = random.randint(border_offset_y, max(border_offset_y, + pos_y_upper_bound)) if pos_y_upper_bound >= border_offset_y else border_offset_y + + text_render_top_left_pos = (tl_pos_x, tl_pos_y) + + center_x = tl_pos_x + actual_w // 2 + center_y = tl_pos_y + actual_h // 2 + yolo_box_center_pos = (center_x, center_y) + + return text_render_top_left_pos, yolo_box_center_pos, actual_text_box_size + + +def _apply_augmentation(builder: ImageBuilder, augmentation_name: str): + augmentation_strategy = augmentation_strategies[augmentation_name.lower()] + builder.apply_augmentation(augmentation_strategy) + + +def _save_image_and_label( + builder: ImageBuilder, + text_content: str, + ctr: str, + folder_for_train_data: str, + label_dict: Dict[str, int], + single_label: bool, + file_name_from_corpus: str, + yolo_box_center_pos: Tuple[int, int], + actual_text_box_size: Tuple[int, int], + image_width: int, + image_height: int, + debug: bool +) -> Tuple[str, str]: + label_str = next(iter(label_dict.keys())) if single_label else os.path.splitext(file_name_from_corpus)[0] + if label_str not in label_dict: + print( + f"Warning: Label '{label_str}' from corpus file '{file_name_from_corpus}' not found in label_dict. Defaulting to first available label.") + if not label_dict: + raise ValueError("Label dictionary is empty. Cannot determine a label.") + label_str = next(iter(label_dict.keys())) + label_id = label_dict[label_str] + + image_base_filename = f"{label_str}_{ctr}.png" + image_full_path = os.path.join(folder_for_train_data, 'images', image_base_filename) + builder.save(image_full_path) + + bbox_str = _create_bbox_string( + label_id, yolo_box_center_pos, actual_text_box_size, image_width, image_height + ) + + labels_dir = os.path.join(folder_for_train_data, 'labels') + os.makedirs(labels_dir, exist_ok=True) + + label_base_filename = f"{label_str}_{ctr}.txt" + label_full_path = os.path.join(labels_dir, label_base_filename) + with open(label_full_path, 'w', encoding='utf-8') as f: + f.write(bbox_str) + + if debug: + print(f"Generated sample: {image_full_path}") + print(f"Label file: {label_full_path}") + print(f"Bounding box (YOLO format for synthetic text):\n{bbox_str.strip()}") + + return image_full_path, label_full_path + + +def _create_bbox_string( + label_id: int, + box_center_xy: Tuple[int, int], + box_wh: Tuple[int, int], + image_width: int = 1024, + image_height: int = 361 +) -> str: + center_x, center_y = box_center_xy + box_w, box_h = box_wh + + if image_width == 0: raise ValueError("image_width cannot be zero.") + if image_height == 0: raise ValueError("image_height cannot be zero.") + + norm_center_x = max(0.0, min(1.0, center_x / image_width)) + norm_center_y = max(0.0, min(1.0, center_y / image_height)) + norm_w = max(0.0, min(1.0, box_w / image_width)) + norm_h = max(0.0, min(1.0, box_h / image_height)) + + return f"{label_id} {norm_center_x:.6f} {norm_center_y:.6f} {norm_w:.6f} {norm_h:.6f}\n" + + +def _fill_label_dict(folder_path: str) -> Dict[str, int]: + label_dict = OrderedDict() + label_id_counter = 0 + + if not os.path.isdir(folder_path): + print(f"Warning: Corpora folder '{folder_path}' not found. 
Returning empty label dict.") + return label_dict + + # Get all .txt files and sort them numerically by their suffix + files = [f for f in os.listdir(folder_path) if f.endswith(".txt") and f.startswith("tib_no_")] + + try: + # Sort files by their numeric suffix (tib_no_0001.txt -> 1) + sorted_files = sorted( + files, + key=lambda x: int(x.split("_")[-1].split(".")[0]) + ) + except (ValueError, IndexError): + print("Warning: Could not sort corpus files numerically. Using simple alphabetical sort.") + sorted_files = sorted(files) + + for filename in sorted_files: + label_name = os.path.splitext(filename)[0] # Gets 'tib_no_0001' from 'tib_no_0001.txt' + if label_name not in label_dict: + label_dict[label_name] = label_id_counter + label_id_counter += 1 + + if not label_dict: + print(f"Warning: No valid .txt files found in corpora folder '{folder_path}'. Label dictionary is empty.") + return label_dict + + +def _setup_dataset_info(args: argparse.Namespace, validation: bool) -> Dict: + base_output_folder = Path(args.dataset_name) + + if validation: + folder_path = base_output_folder / 'val' + num_samples = args.val_samples + bg_folder = args.background_val + else: + folder_path = base_output_folder / 'train' + num_samples = args.train_samples + bg_folder = args.background_train + + os.makedirs(folder_path / 'images', exist_ok=True) + os.makedirs(folder_path / 'labels', exist_ok=True) + + return { + 'background_folder': bg_folder, + 'folder': folder_path, + 'no_samples': num_samples + } + + +def _create_label_dict(args: argparse.Namespace) -> Dict[str, int]: + if args.single_label: + return {'tibetan': 0} + else: + return _fill_label_dict(args.corpora_tibetan_numbers_path) + + +def _load_background_images(folder: str) -> List[str]: + if not os.path.isdir(folder): + print(f"Warning: Background folder '{folder}' not found. No background images will be loaded.") + return [] + return [file for file in os.listdir(folder) if file.lower().endswith(('.jpg', '.jpeg', '.png'))] + + +def _prepare_generation_args(args: argparse.Namespace, dataset_info: Dict, label_dict: Dict, + images_bg_list: List[str]) -> Tuple: + """Prepare arguments for each call to generate_synthetic_image.""" + return ( + images_bg_list, + label_dict, + dataset_info['background_folder'], + args.corpora_tibetan_numbers_path, + args.corpora_tibetan_text_path, + args.corpora_chinese_numbers_path, + dataset_info['folder'], + args.debug, + args.font_path, + args.single_label, + args.image_width, + args.image_height, + args.augmentation, + args.annotations_file_path # <<< NEW ARGUMENT ADDED HERE + ) + + +def _generate_images_in_parallel(generation_args_tuple: Tuple, no_samples: int) -> List: + if no_samples <= 0: + return [] + list_of_generation_args = [generation_args_tuple] * no_samples + # Ensure os.cpu_count() returns a valid number or default to 1 + num_cpus = os.cpu_count() + max_parallel_calls = min(num_cpus if num_cpus else 1, no_samples) + + results = [] + # Use try-finally for pool shutdown if issues arise, but starmap should handle clean exit. + # Consider reducing max_parallel_calls if memory is an issue for large images/many processes. 
+ if max_parallel_calls == 0: max_parallel_calls = 1 # Ensure at least one process + + with multiprocessing.Pool(processes=max_parallel_calls) as pool: + try: + results = pool.starmap(generate_synthetic_image, list_of_generation_args) + except Exception as e: + print(f"Error during parallel image generation: {e}") + pool.terminate() # Forcefully terminate worker processes + pool.join() # Wait for worker processes to exit + raise # Re-raise the exception to make the error visible + return results + + +def _create_dataset_dict(output_folder_str: str, label_dict: Dict[str, int]) -> OrderedDict: + class_names = {int(v): str(k) for k, v in label_dict.items()} + dataset_name_part = Path(output_folder_str).parent.name + split_name = Path(output_folder_str).name + + return OrderedDict([ + ('path', f"../{dataset_name_part}"), + (split_name, f'{split_name}/images'), + ('nc', len(class_names)), + ('names', class_names) + ]) diff --git a/tibetanDataGenerator/generated_sample.png b/tibetanDataGenerator/generated_sample.png new file mode 100644 index 0000000..60e3dde Binary files /dev/null and b/tibetanDataGenerator/generated_sample.png differ diff --git a/tibetanDataGenerator/main.py b/tibetanDataGenerator/main.py new file mode 100644 index 0000000..4ff8945 --- /dev/null +++ b/tibetanDataGenerator/main.py @@ -0,0 +1,92 @@ +import argparse +from pathlib import Path +import yaml +from collections import OrderedDict +from ultralytics.data.utils import DATASETS_DIR +from tibetanDataGenerator.dataset_generator_tib_no import generate_dataset + + +def main(): + parser = argparse.ArgumentParser(description="Generate YOLO dataset for Tibetan text detection") + + parser.add_argument('--background_train', type=str, default='./data/background_images_train/', + help='Folder with background images for training') + parser.add_argument('--background_val', type=str, default='./data/background_images_val/', + help='Folder with background images for validation') + parser.add_argument('--output_dir', type=str, default=str(Path(DATASETS_DIR)), + help='Base directory to save the generated dataset. 
(Default: Ultralytics DATASETS_DIR)') + parser.add_argument('--dataset_name', type=str, default='yolo_tibetan_dataset', + help='Name for the generated dataset folder.') + parser.add_argument('--corpora_tibetan_numbers_path', type=str, default='./data/corpora/Tibetan Number Words/', + help='Folder with Tibetan number words (maps to class_id 0: "tibetan_number_word").') + parser.add_argument('--corpora_tibetan_text_path', type=str, default='./data/corpora/UVA Tibetan Spoken Corpus/', + help='Folder with general Tibetan text (maps to class_id 1: "tibetan_text").') + parser.add_argument('--corpora_chinese_numbers_path', type=str, default='./data/corpora/Chinese Number Words/', + help='Folder with Chinese number words (maps to class_id 2: "chinese_number_word").') + parser.add_argument('--train_samples', type=int, default=100, + help='Number of training samples to generate') + parser.add_argument('--val_samples', type=int, default=20, + help='Number of validation samples to generate') + parser.add_argument('--font_path_tibetan', type=str, default='ext/Microsoft Himalaya.ttf', + help='Path to a font file that supports Tibetan characters') + parser.add_argument('--font_path_chinese', type=str, default='ext/simkai.ttf', + help='Path to a font file that supports Chinese characters') + parser.add_argument('--single_label', action='store_true', + help='Use a single label "tibetan" for all files instead of using filenames as labels') + parser.add_argument('--debug', action='store_true', + help='More verbose output with debug information about the image generation process.') + parser.add_argument('--image_width', type=int, default=1024, + help='Width (pixels) of each generated image.') + parser.add_argument('--image_height', type=int, default=361, + help='Height (pixels) of each generated image.') + parser.add_argument("--augmentation", choices=['rotate', 'noise', 'none'], default='noise', + help="Type of augmentation to apply") + parser.add_argument('--annotations_file_path', type=str, + default='./data/tibetan numbers/annotations/tibetan_chinese_no/bg_PPN337138764X_00000005.txt', + help='Path to a YOLO annotation file to load and draw bounding boxes from.') + + + args = parser.parse_args() + + full_dataset_path = Path(args.output_dir) / args.dataset_name + original_dataset_name = args.dataset_name + args.dataset_name = str(full_dataset_path) + + print(f"Generating YOLO dataset in {args.dataset_name}...") + + # Generate training dataset + # args object (containing args.annotations_file_path) is passed to generate_dataset + train_dataset_info = generate_dataset(args, validation=False) + + # Generate validation dataset + val_dataset_info = generate_dataset(args, validation=True) + + yaml_content = OrderedDict() + yaml_content['path'] = original_dataset_name + yaml_content['train'] = 'train/images' + yaml_content['val'] = 'val/images' + yaml_content['test'] = '' + + if 'nc' not in train_dataset_info or 'names' not in train_dataset_info: + raise ValueError("generate_dataset did not return 'nc' or 'names' in its info dictionary.") + yaml_content['nc'] = train_dataset_info['nc'] + yaml_content['names'] = train_dataset_info['names'] + + def represent_ordereddict(dumper, data): + return dumper.represent_mapping('tag:yaml.org,2002:map', data.items()) + + yaml.add_representer(OrderedDict, represent_ordereddict) + + yaml_file_path = Path(args.output_dir) / f"{original_dataset_name}.yaml" + + with open(yaml_file_path, 'w', encoding='utf-8') as f: + yaml.dump(dict(yaml_content), f, 
sort_keys=False, allow_unicode=True) + + print(f"\nDataset generation completed. YAML configuration saved to: {yaml_file_path}") + print("Training can be started with a command like:\n") + print( + f"yolo detect train data={yaml_file_path} epochs=100 imgsz=[{args.image_height},{args.image_width}] model=yolov8n.pt") + + +if __name__ == "__main__": + main() diff --git a/tibetan_utils/arg_utils.py b/tibetan_utils/arg_utils.py index 14d8116..dbb5baa 100644 --- a/tibetan_utils/arg_utils.py +++ b/tibetan_utils/arg_utils.py @@ -1,251 +1,207 @@ """ -Command-line argument utilities for the TibetanOCR project. +Argument parsing utilities for the TibetanOCR project. +Multi-class support with Tibetan numbers, Tibetan text, and Chinese numbers. """ import argparse +from pathlib import Path +try: + from ultralytics.data.utils import DATASETS_DIR +except ImportError: + DATASETS_DIR = "./datasets" # Fallback if ultralytics not installed + from .config import ( - DEFAULT_MODEL_PATH, DEFAULT_IMAGE_SIZE, DEFAULT_CONFIDENCE, - DEFAULT_OUTPUT_DIR, DEFAULT_DATASET_DIR, DEFAULT_OCR_LANG, - DEFAULT_TRAIN_SAMPLES, DEFAULT_VAL_SAMPLES, DEFAULT_AUGMENTATION, - DEFAULT_FONT_PATH, DEFAULT_SBB_OUTPUT, DEFAULT_OCR_OUTPUT + DEFAULT_BACKGROUND_TRAIN_PATH, + DEFAULT_BACKGROUND_VAL_PATH, + DEFAULT_CORPORA_PATH, + DEFAULT_FONT_PATH, + DEFAULT_IMAGE_SIZE, + DEFAULT_BATCH_SIZE, + DEFAULT_EPOCHS, + DEFAULT_WORKERS, + DEFAULT_TRAIN_SAMPLES, + DEFAULT_VAL_SAMPLES, + DEFAULT_AUGMENTATION, + DEFAULT_ANNOTATION_FILE_PATH ) def add_model_arguments(parser): - """ - Add model-related arguments to an ArgumentParser. - - Args: - parser: ArgumentParser instance - - Returns: - ArgumentParser: Updated parser - """ - group = parser.add_argument_group('Model Options') - group.add_argument('--model', type=str, default=DEFAULT_MODEL_PATH, - help='Path to the model (e.g., yolov8n.pt, best.pt)') - group.add_argument('--imgsz', type=int, default=DEFAULT_IMAGE_SIZE, - help='Image size for inference/training') - group.add_argument('--conf', type=float, default=DEFAULT_CONFIDENCE, - help='Confidence threshold for detections') - group.add_argument('--device', type=str, default='', - help='Device for inference/training (e.g., cpu, 0, 0,1,2,3)') - return parser + """Add model-related arguments.""" + parser.add_argument('--model', type=str, default='yolov8n.pt', + help='Path to the model file') + parser.add_argument('--imgsz', type=int, default=DEFAULT_IMAGE_SIZE, + help='Image size for inference') + parser.add_argument('--conf', type=float, default=0.25, + help='Confidence threshold for detections') def add_output_arguments(parser): - """ - Add output-related arguments to an ArgumentParser. 
- - Args: - parser: ArgumentParser instance - - Returns: - ArgumentParser: Updated parser - """ - group = parser.add_argument_group('Output Options') - group.add_argument('--project', '--output', type=str, default=DEFAULT_OUTPUT_DIR, - help='Directory for output') - group.add_argument('--name', type=str, default='exp', - help='Experiment name') - group.add_argument('--save', action='store_true', default=True, - help='Save results') - group.add_argument('--save-txt', action='store_true', - help='Save results as .txt files') - group.add_argument('--save-conf', action='store_true', - help='Save confidence values in .txt files') - return parser + """Add output-related arguments.""" + parser.add_argument('--output', type=str, default='output', + help='Output directory') + parser.add_argument('--save-crops', action='store_true', + help='Save cropped text regions') + parser.add_argument('--debug', action='store_true', + help='Enable debug mode with verbose output') def add_dataset_generation_arguments(parser): - """ - Add dataset generation arguments to an ArgumentParser. + """Add dataset generation arguments for multi-class support.""" + parser.add_argument('--background_train', type=str, default=DEFAULT_BACKGROUND_TRAIN_PATH, + help='Folder with background images for training') + parser.add_argument('--background_val', type=str, default=DEFAULT_BACKGROUND_VAL_PATH, + help='Folder with background images for validation') + parser.add_argument('--output_dir', type=str, default=str(Path(DATASETS_DIR)), + help='Base directory to save the generated dataset. (Default: Ultralytics DATASETS_DIR)') + parser.add_argument('--dataset_name', type=str, default='yolo_tibetan_dataset', + help='Name for the generated dataset folder.') - Args: - parser: ArgumentParser instance - - Returns: - ArgumentParser: Updated parser - """ - group = parser.add_argument_group('Dataset Generation Options') - group.add_argument('--background_train', type=str, default='./data/background_images_train/', - help='Folder with background images for training') - group.add_argument('--background_val', type=str, default='./data/background_images_val/', - help='Folder with background images for validation') - group.add_argument('--dataset_name', type=str, default=DEFAULT_DATASET_DIR, - help='Folder for the generated YOLO dataset') - group.add_argument('--corpora_folder', type=str, default='./data/corpora/Tibetan Number Words/', - help='Folder with Tibetan corpora') - group.add_argument('--train_samples', type=int, default=DEFAULT_TRAIN_SAMPLES, - help='Number of training samples to generate') - group.add_argument('--val_samples', type=int, default=DEFAULT_VAL_SAMPLES, - help='Number of validation samples to generate') - group.add_argument('--no_cols', type=int, default=1, - help='Number of text columns to generate [1-5]') - group.add_argument('--font_path', type=str, default=DEFAULT_FONT_PATH, - help='Path to a Tibetan font file') - group.add_argument('--single_label', action='store_true', - help='Use a single label "tibetan" for all files') - group.add_argument('--debug', action='store_true', - help='Enable debug mode for verbose output') - group.add_argument('--image_size', type=int, default=DEFAULT_IMAGE_SIZE, - help='Size of generated images in pixels') - group.add_argument('--augmentation', choices=['rotate', 'noise'], default=DEFAULT_AUGMENTATION, - help='Type of augmentation to apply') - return parser + # Multi-class corpora paths + parser.add_argument('--corpora_tibetan_numbers_path', type=str, + default='./data/corpora/Tibetan 
Number Words/', + help='Folder with Tibetan number words (maps to class_id 0: "tibetan_number_word").') + parser.add_argument('--corpora_tibetan_text_path', type=str, + default='./data/corpora/UVA Tibetan Spoken Corpus/', + help='Folder with general Tibetan text (maps to class_id 1: "tibetan_text").') + parser.add_argument('--corpora_chinese_numbers_path', type=str, + default='./data/corpora/Chinese Number Words/', + help='Folder with Chinese number words (maps to class_id 2: "chinese_number_word").') + + # Sample counts + parser.add_argument('--train_samples', type=int, default=DEFAULT_TRAIN_SAMPLES, + help='Number of training samples to generate') + parser.add_argument('--val_samples', type=int, default=DEFAULT_VAL_SAMPLES, + help='Number of validation samples to generate') + + # Multi-font support + parser.add_argument('--font_path_tibetan', type=str, + default='ext/Microsoft Himalaya.ttf', + help='Path to a font file that supports Tibetan characters') + parser.add_argument('--font_path_chinese', type=str, + default='ext/simkai.ttf', + help='Path to a font file that supports Chinese characters') + + # Image dimensions + parser.add_argument('--image_width', type=int, default=1024, + help='Width (pixels) of each generated image.') + parser.add_argument('--image_height', type=int, default=361, + help='Height (pixels) of each generated image.') + + # Labels and augmentation + parser.add_argument('--single_label', action='store_true', + help='Use a single label "tibetan" for all files instead of using filenames as labels') + parser.add_argument("--augmentation", choices=['rotate', 'noise', 'none'], default=DEFAULT_AUGMENTATION, + help="Type of augmentation to apply") + + # YOLO annotations support + parser.add_argument('--annotations_file_path', type=str, + default=DEFAULT_ANNOTATION_FILE_PATH, + help='Path to a YOLO annotation file to load and draw bounding boxes from.') def add_training_arguments(parser): - """ - Add training-related arguments to an ArgumentParser. 
- - Args: - parser: ArgumentParser instance - - Returns: - ArgumentParser: Updated parser - """ - group = parser.add_argument_group('Training Options') - group.add_argument('--dataset', type=str, default=DEFAULT_DATASET_DIR, - help='Name of the dataset folder') - group.add_argument('--epochs', type=int, default=100, - help='Number of training epochs') - group.add_argument('--batch', type=int, default=16, - help='Batch size for training') - group.add_argument('--workers', type=int, default=8, - help='Number of workers for data loading') - group.add_argument('--patience', type=int, default=50, - help='EarlyStopping patience in epochs') - group.add_argument('--export', action='store_true', - help='Export the model after training as TorchScript') - return parser + """Add training-related arguments.""" + parser.add_argument('--dataset', type=str, default='yolo_tibetan/', + help='Path to dataset YAML file') + parser.add_argument('--epochs', type=int, default=DEFAULT_EPOCHS, + help='Number of training epochs') + parser.add_argument('--batch', type=int, default=DEFAULT_BATCH_SIZE, + help='Batch size') + parser.add_argument('--workers', type=int, default=DEFAULT_WORKERS, + help='Number of worker threads') + parser.add_argument('--device', type=str, default='', + help='Device to use for training') + parser.add_argument('--project', type=str, default='runs/detect', + help='Project directory') + parser.add_argument('--name', type=str, default='train', + help='Experiment name') + parser.add_argument('--export', action='store_true', + help='Export model after training') + parser.add_argument('--patience', type=int, default=50, + help='EarlyStopping patience') def add_wandb_arguments(parser): - """ - Add Weights & Biases related arguments to an ArgumentParser. - - Args: - parser: ArgumentParser instance - - Returns: - ArgumentParser: Updated parser - """ - group = parser.add_argument_group('Weights & Biases Options') - group.add_argument('--wandb', action='store_true', - help='Enable Weights & Biases logging') - group.add_argument('--wandb-project', type=str, default='TibetanOCR', - help='Weights & Biases project name') - group.add_argument('--wandb-entity', type=str, default=None, - help='Weights & Biases entity (team or username)') - group.add_argument('--wandb-tags', type=str, default=None, - help='Comma-separated tags for the experiment (e.g., "yolov8,tibetan")') - group.add_argument('--wandb-name', type=str, default=None, - help='Name of the experiment in wandb (default: same as --name)') - return parser + """Add Weights & Biases arguments.""" + parser.add_argument('--wandb', action='store_true', + help='Enable Weights & Biases logging') + parser.add_argument('--wandb-project', type=str, default='PechaBridge', + help='W&B project name') + parser.add_argument('--wandb-entity', type=str, + help='W&B entity (team or username)') + parser.add_argument('--wandb-tags', type=str, + help='Comma-separated tags for the experiment') + parser.add_argument('--wandb-name', type=str, + help='Name of the experiment in wandb') def add_sbb_arguments(parser): - """ - Add Staatsbibliothek zu Berlin related arguments to an ArgumentParser. 
- - Args: - parser: ArgumentParser instance - - Returns: - ArgumentParser: Updated parser - """ - group = parser.add_argument_group('SBB Options') - group.add_argument('--ppn', type=str, - help='PPN (Pica Production Number) of the document in the Staatsbibliothek zu Berlin') - group.add_argument('--download', action='store_true', - help='Download images instead of processing them directly') - group.add_argument('--no-ssl-verify', action='store_true', - help='Disable SSL certificate verification (not recommended for production environments)') - group.add_argument('--max-images', type=int, default=0, - help='Maximum number of images for inference (0 = all)') - group.add_argument('--output', type=str, default=DEFAULT_SBB_OUTPUT, - help='Directory for saving downloaded images') - return parser + """Add Staatsbibliothek zu Berlin arguments.""" + parser.add_argument('--ppn', type=str, required=True, + help='PPN (Pica Production Number) of the document') + parser.add_argument('--download', action='store_true', + help='Download images instead of processing them directly') + parser.add_argument('--max-images', type=int, default=0, + help='Maximum number of images to process (0 = all)') + parser.add_argument('--no-ssl-verify', action='store_true', + help='Disable SSL certificate verification') def add_ocr_arguments(parser): - """ - Add OCR-related arguments to an ArgumentParser. - - Args: - parser: ArgumentParser instance - - Returns: - ArgumentParser: Updated parser - """ - group = parser.add_argument_group('OCR Options') - group.add_argument('--lang', type=str, default=DEFAULT_OCR_LANG, - help='Language for Tesseract OCR (e.g., eng, deu, eng+deu, bod for Tibetan)') - group.add_argument('--tesseract-config', type=str, default='', - help='Additional Tesseract configuration') - group.add_argument('--save-crops', action='store_true', - help='Save cropped text blocks as images') - group.add_argument('--output', type=str, default=DEFAULT_OCR_OUTPUT, - help='Directory for saving OCR results') - return parser + """Add OCR-related arguments.""" + parser.add_argument('--lang', type=str, default='eng+deu', + help='Language for Tesseract OCR') + parser.add_argument('--tesseract-config', type=str, default='', + help='Additional Tesseract configuration') def add_source_argument(parser): - """ - Add source argument to an ArgumentParser. 
- - Args: - parser: ArgumentParser instance - - Returns: - ArgumentParser: Updated parser - """ + """Add source argument for input files.""" parser.add_argument('--source', type=str, - help='Path to image or directory for inference') - return parser + help='Path to image file or directory') def create_generate_dataset_parser(): - """Create an ArgumentParser for dataset generation.""" + """Create parser for multi-class dataset generation.""" parser = argparse.ArgumentParser(description="Generate YOLO dataset for Tibetan text detection") - parser = add_dataset_generation_arguments(parser) + add_dataset_generation_arguments(parser) + add_output_arguments(parser) return parser def create_train_parser(): - """Create an ArgumentParser for model training.""" - parser = argparse.ArgumentParser(description="Train a YOLO model with Tibetan OCR data") - parser = add_model_arguments(parser) - parser = add_training_arguments(parser) - parser = add_output_arguments(parser) - parser = add_wandb_arguments(parser) + """Create parser for model training.""" + parser = argparse.ArgumentParser(description="Train YOLO model for Tibetan text detection") + add_training_arguments(parser) + add_wandb_arguments(parser) return parser def create_inference_parser(): - """Create an ArgumentParser for inference.""" - parser = argparse.ArgumentParser(description="Run inference with a trained YOLO model") - parser = add_model_arguments(parser) - parser = add_output_arguments(parser) - parser = add_source_argument(parser) + """Create parser for inference.""" + parser = argparse.ArgumentParser(description="Run inference on images") + add_model_arguments(parser) + add_source_argument(parser) + add_output_arguments(parser) return parser def create_sbb_inference_parser(): - """Create an ArgumentParser for SBB inference.""" - parser = argparse.ArgumentParser(description="Run inference on Staatsbibliothek zu Berlin data") - parser = add_model_arguments(parser) - parser = add_output_arguments(parser) - parser = add_sbb_arguments(parser) + """Create parser for SBB inference.""" + parser = argparse.ArgumentParser(description="Run inference on SBB data") + add_model_arguments(parser) + add_sbb_arguments(parser) + add_output_arguments(parser) return parser def create_ocr_parser(): - """Create an ArgumentParser for OCR on detected text blocks.""" + """Create parser for OCR on detections.""" parser = argparse.ArgumentParser(description="Apply OCR to detected text blocks") - parser = add_model_arguments(parser) - parser = add_source_argument(parser) - parser = add_sbb_arguments(parser) - parser = add_ocr_arguments(parser) + add_model_arguments(parser) + add_source_argument(parser) + add_sbb_arguments(parser) + add_ocr_arguments(parser) + add_output_arguments(parser) return parser diff --git a/tibetan_utils/config.py b/tibetan_utils/config.py index ad08967..2015237 100644 --- a/tibetan_utils/config.py +++ b/tibetan_utils/config.py @@ -11,6 +11,15 @@ DEFAULT_OUTPUT_DIR = 'runs/detect' DEFAULT_DATASET_DIR = 'yolo_tibetan/' +# Additional default constants for multi-class support +DEFAULT_ANNOTATION_FILE_PATH = './data/tibetan numbers/annotations/tibetan_chinese_no/bg_PPN337138764X_00000005.txt' +DEFAULT_BACKGROUND_TRAIN_PATH = './data/tibetan numbers/backgrounds/' +DEFAULT_BACKGROUND_VAL_PATH = './data/tibetan numbers/backgrounds/' +DEFAULT_CORPORA_PATH = './data/corpora/' +DEFAULT_BATCH_SIZE = 16 +DEFAULT_EPOCHS = 100 +DEFAULT_WORKERS = 8 + # Default model settings DEFAULT_MODEL_PATH = 'yolov8n.pt' DEFAULT_IMAGE_SIZE = 1024 diff --git 
a/tibetan_utils/image_utils.py b/tibetan_utils/image_utils.py index ef670c9..713303f 100644 --- a/tibetan_utils/image_utils.py +++ b/tibetan_utils/image_utils.py @@ -4,7 +4,7 @@ import cv2 import numpy as np -from PIL import Image +from PIL import Image, ImageDraw, ImageFont import io from typing import Tuple, List, Union, Dict, Any @@ -217,3 +217,202 @@ def denormalize_box(box: List[float], image_size: Tuple[int, int]) -> Tuple[int, y_max = int((y + h/2) * height) return (x_min, y_min, x_max, y_max) + + +class BoundingBoxCalculator: + """ + Utility class for calculating bounding boxes and font sizes for text rendering. + """ + + @staticmethod + def fit(text: str, box_size: Tuple[int, int], font_size: int = 24, font_path: str = 'ext/Microsoft Himalaya.ttf', debug: bool = False) -> Tuple[int, int]: + """ + Calculate the true bounding box size for the specified text when it is wrapped and terminated to fit a given box size. + Enhanced with timeout protection and iteration limits. + + Args: + text: Text to be measured + box_size: Tuple (width, height) specifying the size of the box to fit the text + font_size: Size of the font + font_path: Path to the font file + debug: Enable debug output + + Returns: + Tuple (width, height) representing the actual bounding box size of the wrapped and terminated text + """ + import time + start_time = time.time() + timeout_seconds = 5 # 5 second timeout for fit operation + max_lines = 100 # Maximum lines to process + max_chars_per_line = 1000 # Maximum characters per line to prevent infinite loops + + # Validate inputs + if not text or not text.strip(): + return (0, 0) + + if box_size[0] <= 0 or box_size[1] <= 0: + if debug: + print(f"Warning: Invalid box size {box_size}") + return (0, 0) + + # Create a dummy image to get a drawing context + dummy_image = Image.new('RGB', (1, 1)) + draw = ImageDraw.Draw(dummy_image) + + # Define the font + try: + font = ImageFont.truetype(font_path, font_size) + except IOError: + font = ImageFont.load_default() + if debug: + print("Warning: Default font used, may not accurately measure text.") + + box_w, box_h = box_size + actual_text_width, actual_text_height = 0, 0 + y_offset = 0 + lines_processed = 0 + + # Process each line with safety limits + for line in text.split('\n'): + if lines_processed >= max_lines: + if debug: + print(f"Warning: Reached maximum line limit ({max_lines})") + break + + # Check timeout + if time.time() - start_time > timeout_seconds: + if debug: + print(f"Warning: fit() timed out after {timeout_seconds}s") + break + + char_iterations = 0 + while line and char_iterations < max_chars_per_line: + char_iterations += 1 + + # Find the breakpoint for wrapping with safety limit + i = 0 + try: + for i in range(min(len(line), max_chars_per_line)): + if draw.textlength(line[:i + 1], font=font) > box_w: + break + else: + i = len(line) + except Exception as e: + if debug: + print(f"Error in textlength calculation: {e}") + i = min(10, len(line)) # Fallback to small chunk + + # Ensure we make progress + if i == 0: + i = 1 # Take at least one character to avoid infinite loop + + # Add the line to wrapped text + wrapped_line = line[:i] + + try: + left, top, right, bottom = font.getbbox(wrapped_line) + line_width, line_height = right - left, bottom - top + except Exception as e: + if debug: + print(f"Error in getbbox calculation: {e}") + # Fallback estimation + line_width = len(wrapped_line) * font_size // 2 + line_height = font_size + + actual_text_width = max(actual_text_width, line_width) + y_offset += line_height 
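+ # Running totals: actual_text_width tracks the widest wrapped line measured so far, + # while y_offset accumulates line heights top-to-bottom until box_h is exhausted.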
+ + # Check if the next line exceeds the box height + if y_offset > box_h: + y_offset -= line_height # Remove the last line's height if it exceeds + break + + line = line[i:] + + lines_processed += 1 + if y_offset > box_h: + break + + elapsed = time.time() - start_time + if debug and elapsed > 1.0: + print(f"fit() took {elapsed:.2f}s for text length {len(text)}, font size {font_size}") + + return actual_text_width, y_offset + 10 + + @staticmethod + def find_max_font(text: str, box_size: Tuple[int, int], font_path: str, max_size: int = 100, debug: bool = False) -> int: + """ + Find maximum font size where text fits in box using binary search with timeout protection. + + Args: + text: Text to fit + box_size: Target box size (width, height) + font_path: Path to font file + max_size: Maximum font size to try + debug: Enable debug output + + Returns: + int: Maximum font size that fits + """ + import time + start_time = time.time() + timeout_seconds = 10 # 10 second timeout + max_iterations = 50 # Maximum iterations to prevent infinite loops + + # Validate inputs + if not text or not text.strip(): + if debug: + print("Warning: Empty text provided to find_max_font, returning minimum font size") + return 1 + + if box_size[0] <= 0 or box_size[1] <= 0: + if debug: + print(f"Warning: Invalid box size {box_size}, returning minimum font size") + return 1 + + low, high = 1, min(max_size, 200) # Cap maximum size to prevent extreme values + best = 1 + iterations = 0 + + if debug: + print(f"Starting font size search for text: '{text[:50]}...' in box {box_size}") + + while low <= high and iterations < max_iterations: + # Check timeout + if time.time() - start_time > timeout_seconds: + if debug: + print(f"find_max_font timed out after {timeout_seconds}s, returning best so far: {best}") + break + + iterations += 1 + mid = (low + high) // 2 + + try: + fit_start = time.time() + w, h = BoundingBoxCalculator.fit(text, box_size, mid, font_path) + fit_time = time.time() - fit_start + + if debug and fit_time > 1.0: # Log slow fit operations + print(f"Slow fit operation: {fit_time:.2f}s for font size {mid}") + + if w <= box_size[0] and h <= box_size[1]: + best = mid + low = mid + 1 + if debug: + print(f"Font size {mid} fits ({w}x{h} <= {box_size})") + else: + high = mid - 1 + if debug: + print(f"Font size {mid} too large ({w}x{h} > {box_size})") + + except Exception as e: + if debug: + print(f"Error in fit calculation for font size {mid}: {e}") + # If fit fails, assume font is too large + high = mid - 1 + + elapsed = time.time() - start_time + if debug: + print(f"find_max_font completed in {elapsed:.2f}s after {iterations} iterations, best font size: {best}") + + return best diff --git a/tibetan_utils/io_utils.py b/tibetan_utils/io_utils.py index 67b72ac..7935e09 100644 --- a/tibetan_utils/io_utils.py +++ b/tibetan_utils/io_utils.py @@ -6,6 +6,8 @@ import re import json import yaml +import hashlib +import time from pathlib import Path from typing import Dict, List, Union, Any @@ -155,3 +157,28 @@ def get_output_path(base_dir: str, name: str, filename: str, create_dir: bool = ensure_dir(output_dir) return os.path.join(output_dir, filename) + + +def hash_current_time() -> str: + """ + Generate a hash based on current time for unique identifiers. 
+ + Returns: + str: SHA256 hex digest of the current time in nanoseconds + """ + # Hashing the nanosecond timestamp yields effectively unique identifiers, + # since concurrent calls are very unlikely to share the same time_ns() value. + return hashlib.sha256(str(time.time_ns()).encode()).hexdigest()
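
As a sanity check on the label format used throughout this diff, here is a minimal, self-contained sketch of the round-trip that `_create_bbox_string` and `_parse_yolo_annotations` implement: a pixel box is normalized to a `class cx cy w h` line on write, and de-normalized back to a top-left corner and size before drawing. The helper names `to_yolo_line` and `to_pixel_box` are illustrative only; the 1024×361 canvas and the `text_body` pixel box are taken from the defaults above and the shipped `bg_PPN337138764X_00000005.csv`.

```python
IMG_W, IMG_H = 1024, 361  # default canvas size used throughout this PR


def to_yolo_line(class_id, center, size):
    """Mirror of _create_bbox_string: normalize a pixel box to one YOLO label line."""
    def norm(value, dim):
        # Clamp to [0, 1] like _create_bbox_string, so boxes never leave the image
        return max(0.0, min(1.0, value / dim))
    (cx, cy), (w, h) = center, size
    return (f"{class_id} {norm(cx, IMG_W):.6f} {norm(cy, IMG_H):.6f} "
            f"{norm(w, IMG_W):.6f} {norm(h, IMG_H):.6f}")


def to_pixel_box(line):
    """Mirror of the drawing path in generate_synthetic_image: YOLO line -> (tl_x, tl_y, w, h)."""
    _cls, cx, cy, w, h = line.split()
    pw, ph = float(w) * IMG_W, float(h) * IMG_H
    tl_x = float(cx) * IMG_W - pw / 2
    tl_y = float(cy) * IMG_H - ph / 2
    return round(tl_x), round(tl_y), round(pw), round(ph)


# text_body row of the complete_layout CSV: top-left (286, 40), size 442x278,
# i.e. center (507, 179)
line = to_yolo_line(2, (507, 179), (442, 278))
print(line)                # 2 0.495117 0.495845 0.431641 0.770083
print(to_pixel_box(line))  # (286, 40, 442, 278)
```

The shipped `.txt` annotations differ from these values around the third or fourth decimal place, which is consistent with the boxes having been normalized under a slightly different center convention; the round-trip above only demonstrates that writing and re-reading a box in this format is lossless at pixel resolution.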