TomeWhisper/examples/test_tome_core.py at main · AdaBit-AI/TomeWhisper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#!/usr/bin/env python3
"""
Example script to test tome_core functionality.
"""

import asyncio
import sys
from pathlib import Path

# Add the parent directory to the path so we can import tome_core
sys.path.insert(0, str(Path(__file__).parent.parent))

from tome_core.models import TransformersOCRModel
from tome_core.processors import ImageProcessor, PDFProcessor
from tome_core.utils.prompt_utils import list_available_prompt_modes


def test_transformers_ocr():
    """Smoke-test construction of the Transformers OCR model.

    Instantiates ``TransformersOCRModel`` with its default arguments and
    prints every entry yielded by ``list_available_prompt_modes()``.

    Returns:
        bool: ``True`` when both steps finish without raising; ``False``
        when any exception is caught (the error is printed, not re-raised).
    """
    print("Testing Transformers OCR Model...")

    try:
        # Initialize model (this will download if not cached)
        print("Initializing model...")
        # Only successful construction is being checked here, so the
        # instance is deliberately not bound to a name.
        TransformersOCRModel()
        print("Model initialized successfully!")

        # List available prompt modes
        print("\nAvailable prompt modes:")
        for mode in list_available_prompt_modes():
            print(f"  - {mode}")

        print("\nTransformers OCR test completed successfully!")

    except Exception as e:
        # Top-level boundary of a smoke test: report and signal failure
        # to the caller instead of aborting the whole run.
        print(f"Transformers OCR test failed: {e}")
        return False

    return True


def test_image_processor():
    """Run ImageProcessor over a synthetic in-memory image.

    A 100x100 white RGB image is built with Pillow and passed through
    ``validate_and_get_info``, ``process_image`` and
    ``process_image_to_base64`` in that order, printing each result.

    Returns:
        bool: ``True`` if every call completed without raising; ``False``
        if an exception was caught and reported.
    """
    print("\nTesting Image Processor...")

    try:
        resizer = ImageProcessor(max_dimension=1024)
        print("Image processor created successfully!")

        # Pillow is imported lazily, after the processor has been created,
        # so an import failure is reported by the handler below.
        from PIL import Image

        # A blank canvas is enough; no fixture file is required.
        blank = Image.new('RGB', (100, 100), color='white')

        print(f"Image validation info: {resizer.validate_and_get_info(blank)}")
        print(f"Processed image size: {resizer.process_image(blank).size}")

        encoded = resizer.process_image_to_base64(blank)
        print(f"Base64 string length: {len(encoded)}")

        print("\nImage processor test completed successfully!")
    except Exception as err:
        print(f"Image processor test failed: {err}")
        return False
    else:
        return True


def test_pdf_processor():
    """Exercise the PDFProcessor rendering helpers on the sample PDF.

    The test is treated as passed (and skipped) when
    ``PDFProcessor.is_olmocr_available()`` is falsy. When ``paper.pdf``
    exists two directories above this file, page 1 is rendered to base64,
    the page count is read, and page 1 is rendered to a PIL image.

    Returns:
        bool: ``True`` on success or skip; ``False`` if any exception was
        caught and reported.
    """
    print("\nTesting PDF Processor...")

    try:
        from tome_core.processors import PDFProcessor

        # Without olmocr there is nothing to exercise; count it as a pass.
        if not PDFProcessor.is_olmocr_available():
            print("olmocr is not available, skipping PDF processor test")
            return True

        print("olmocr is available!")

        # The sample document is looked up next to the examples directory.
        sample = Path(__file__).parents[1] / "paper.pdf"
        if not sample.exists():
            print("No test PDF found, skipping PDF rendering test")
        else:
            sample_str = str(sample)
            print(f"Testing with PDF: {sample}")

            # Page 1 as a base64 string.
            encoded_page = PDFProcessor.render_pdf_page_to_base64(sample_str, 1)
            print(f"First page rendered to base64, length: {len(encoded_page)}")

            # Total number of pages.
            n_pages = PDFProcessor.get_pdf_page_count(sample_str)
            print(f"PDF has {n_pages} pages")

            # Page 1 as a PIL image.
            page_image = PDFProcessor.render_pdf_page_to_image(sample_str, 1)
            print(f"First page rendered to PIL Image: {page_image.size}")

        print("\nPDF processor test completed successfully!")
    except Exception as err:
        print(f"PDF processor test failed: {err}")
        return False
    else:
        return True


async def test_full_ocr_workflow():
    """Run the end-to-end OCR pipeline on page 1 of the sample PDF.

    Steps: render page 1 of ``paper.pdf`` (two directories above this file)
    to a PIL image, pass it through ``ImageProcessor.process_image``, fetch
    the prompt for the configured mode, and await
    ``TransformersOCRModel.generate_async`` on the processed image.

    The test is skipped (and counted as passed) when olmocr is unavailable
    or the sample PDF does not exist.

    Returns:
        bool: ``True`` on success or skip; ``False`` if any exception was
        caught (the error and its traceback are printed).
    """
    print("\nTesting Complete OCR Workflow...")

    # Single source of truth for the prompt mode, so the value that is
    # looked up and the value that is logged cannot drift apart.
    prompt_mode = "prompt_no_anchoring_v4_yaml"
    # Number of leading characters of the OCR output shown as a preview.
    preview_chars = 200

    try:
        from tome_core.models import TransformersOCRModel
        from tome_core.processors import PDFProcessor, ImageProcessor
        from tome_core.utils.prompt_utils import get_prompt_by_mode

        # Check if we have all required components
        if not PDFProcessor.is_olmocr_available():
            print("olmocr is not available, skipping full OCR workflow test")
            return True

        # Initialize model (this will download if not cached)
        print("Initializing Transformers OCR Model...")
        model = TransformersOCRModel()
        print("Model initialized successfully!")

        # Initialize processors
        image_processor = ImageProcessor()

        # Test with the sample PDF if it exists
        pdf_path = Path(__file__).parent.parent / "paper.pdf"
        if not pdf_path.exists():
            print("No test PDF found, skipping full OCR workflow test")
            return True

        print(f"Testing complete OCR workflow with PDF: {pdf_path}")

        # Render first page to PIL Image
        print("Rendering PDF page to image...")
        pil_image = PDFProcessor.render_pdf_page_to_image(str(pdf_path), 1)
        print(f"Image rendered: {pil_image.size}, mode: {pil_image.mode}")

        # Process the image
        print("Processing image...")
        processed_image = image_processor.process_image(pil_image)
        print(f"Processed image: {processed_image.size}, mode: {processed_image.mode}")

        # Get prompt for OCR
        prompt = get_prompt_by_mode(prompt_mode)
        print(f"Using prompt mode: {prompt_mode}")
        print(f"Prompt length: {len(prompt)} characters")

        # Perform OCR
        print("Performing OCR...")
        result = await model.generate_async(processed_image, prompt)

        # Show a truncated preview; the ellipsis is appended only when the
        # output was actually cut.
        if len(result) > preview_chars:
            preview = result[:preview_chars] + "..."
        else:
            preview = result

        print(f"OCR Result (first {preview_chars} characters):")
        print("-" * 50)
        print(preview)
        print("-" * 50)

        print(f"\nFull OCR result length: {len(result)} characters")
        print("Complete OCR workflow test successful!")

        return True

    except Exception as e:
        print(f"Full OCR workflow test failed: {e}")
        # This is the most failure-prone test, so include the full traceback.
        import traceback
        traceback.print_exc()
        return False


async def main():
    """Run every test in order and print a per-test and overall summary.

    Each entry in the test table is called; if the call returns an
    awaitable (as ``async def`` tests do) it is awaited. A test counts as
    passed when its result is truthy. An exception escaping a test is
    reported and counted as a failure without stopping the remaining tests.

    Returns:
        bool: ``True`` when every test passed, ``False`` otherwise.
    """
    # Local import so this function does not depend on a file-level import.
    import inspect

    print("=== Testing TomeWhisper Core Functionality ===\n")

    tests = [
        ("Transformers OCR", test_transformers_ocr),
        ("Image Processor", test_image_processor),
        ("PDF Processor", test_pdf_processor),
        ("Full OCR Workflow", test_full_ocr_workflow),
    ]

    results = {}

    for test_name, test_func in tests:
        print(f"Running {test_name} test...")
        try:
            # Call first, then await if needed: this handles sync and async
            # tests alike without the deprecated
            # asyncio.iscoroutinefunction() check.
            result = test_func()
            if inspect.isawaitable(result):
                result = await result
        except Exception as e:
            results[test_name] = False
            print(f"✗ {test_name} test FAILED with exception: {e}\n")
        else:
            # Normalise to bool so the summary count below is well-defined
            # even if a test returns None or another non-bool value.
            ok = bool(result)
            results[test_name] = ok
            # The mark must agree with the verdict; a failed test
            # previously still printed the check mark.
            mark, verdict = ("✓", "PASSED") if ok else ("✗", "FAILED")
            print(f"{mark} {test_name} test {verdict}\n")

    # Summary
    print("=== Test Summary ===")
    passed = sum(results.values())
    total = len(results)

    for test_name, result in results.items():
        status = "✓ PASSED" if result else "✗ FAILED"
        print(f"{test_name}: {status}")

    print(f"\nOverall: {passed}/{total} tests passed")

    return passed == total


if __name__ == "__main__":
    # Exit status 0 when main() reports that every test passed, 1 otherwise.
    raise SystemExit(0 if asyncio.run(main()) else 1)