diff --git a/README.md b/README.md index 652afc057..ab669699b 100644 --- a/README.md +++ b/README.md @@ -176,6 +176,69 @@ result = md.convert("example.jpg") print(result.text_content) ``` +To extract text from images embedded in documents using OCR with LLM Vision: + +```python +from markitdown import StreamInfo +from markitdown.converters._ocr_service import MultiBackendOCRService, OCRBackend +from markitdown.converters._pdf_converter_with_ocr import PdfConverterWithOCR +from openai import OpenAI + +# Create OCR service with LLM Vision backend +client = OpenAI() +ocr_service = MultiBackendOCRService( + backends=[OCRBackend.LLM_VISION], + llm_client=client, + llm_model="gpt-4o" +) + +# Convert PDF with LLM-based OCR +converter = PdfConverterWithOCR() +with open("document.pdf", "rb") as f: + result = converter.convert(f, StreamInfo(extension=".pdf"), ocr_service=ocr_service) + print(result.text_content) +``` + +OCR converters are available for PDF, DOCX, XLSX (multi-sheet), and PPTX formats. Images are extracted with context preservation (page numbers, cell references, relationship IDs). + +#### Scanned PDF Support + +MarkItDown automatically detects scanned PDFs (documents with no extractable text) and falls back to full-page OCR. When text extraction from a PDF returns empty or whitespace-only results, the converter: + +1. Renders each page as a high-resolution image (300 DPI) +2. Performs OCR on the full page image using LLM Vision +3. Preserves page structure with page markers +4. 
Indicates which OCR backend was used + +```python +from markitdown import StreamInfo +from markitdown.converters._ocr_service import MultiBackendOCRService, OCRBackend +from markitdown.converters._pdf_converter_with_ocr import PdfConverterWithOCR +from openai import OpenAI + +# Create OCR service with LLM Vision +client = OpenAI() +ocr_service = MultiBackendOCRService( + backends=[OCRBackend.LLM_VISION], + llm_client=client, + llm_model="gpt-4o" +) + +# Convert scanned PDF - fallback is automatic +converter = PdfConverterWithOCR() +with open("scanned_invoice.pdf", "rb") as f: + result = converter.convert(f, StreamInfo(extension=".pdf"), ocr_service=ocr_service) + print(result.text_content) +``` + +The fallback triggers automatically when: + +- PDF has no extractable text (truly scanned documents) +- Text extraction returns only whitespace +- No embedded text is found via pdfminer or pdfplumber + +No additional configuration is needed - just provide an OCR service and the converter handles the rest. + ### Docker ```sh diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py index e49b8c4d6..fda22666e 100644 --- a/packages/markitdown/src/markitdown/__about__.py +++ b/packages/markitdown/src/markitdown/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT -__version__ = "0.1.5b2" +__version__ = "0.1.5b3" diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index f342a614b..d898ec64a 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -39,6 +39,12 @@ EpubConverter, DocumentIntelligenceConverter, CsvConverter, + PdfConverterWithOCR, + DocxConverterWithOCR, + XlsxConverterWithOCR, + PptxConverterWithOCR, + MultiBackendOCRService, + OCRBackend, ) from ._base_converter import DocumentConverter, DocumentConverterResult @@ -49,7 +55,6 @@ FailedConversionAttempt, ) - # Lower priority values are 
tried first. PRIORITY_SPECIFIC_FILE_FORMAT = ( 0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia @@ -191,14 +196,25 @@ def enable_builtins(self, **kwargs) -> None: self.register_converter(WikipediaConverter()) self.register_converter(YouTubeConverter()) self.register_converter(BingSerpConverter()) - self.register_converter(DocxConverter()) - self.register_converter(XlsxConverter()) + + # Register OCR-enabled converters if LLM client is available, otherwise use standard converters + if self._llm_client is not None and self._llm_model is not None: + # Use OCR-enabled converters for documents with embedded images + self.register_converter(DocxConverterWithOCR()) + self.register_converter(XlsxConverterWithOCR()) + self.register_converter(PptxConverterWithOCR()) + self.register_converter(PdfConverterWithOCR()) + else: + # Use standard converters without OCR + self.register_converter(DocxConverter()) + self.register_converter(XlsxConverter()) + self.register_converter(PptxConverter()) + self.register_converter(PdfConverter()) + self.register_converter(XlsConverter()) - self.register_converter(PptxConverter()) self.register_converter(AudioConverter()) self.register_converter(ImageConverter()) self.register_converter(IpynbConverter()) - self.register_converter(PdfConverter()) self.register_converter(OutlookMsgConverter()) self.register_converter(EpubConverter()) self.register_converter(CsvConverter()) @@ -571,6 +587,19 @@ def _convert( if "llm_prompt" not in _kwargs and self._llm_prompt is not None: _kwargs["llm_prompt"] = self._llm_prompt + # Auto-create OCR service if llm_client is available and not already provided + if "ocr_service" not in _kwargs: + llm_client = _kwargs.get("llm_client", self._llm_client) + llm_model = _kwargs.get("llm_model", self._llm_model) + llm_prompt = _kwargs.get("llm_prompt", self._llm_prompt) + if llm_client is not None and llm_model is not None: + _kwargs["ocr_service"] = MultiBackendOCRService( + 
backends=[OCRBackend.LLM_VISION], + llm_client=llm_client, + llm_model=llm_model, + llm_prompt=llm_prompt, + ) + if "style_map" not in _kwargs and self._style_map is not None: _kwargs["style_map"] = self._style_map diff --git a/packages/markitdown/src/markitdown/_uri_utils.py b/packages/markitdown/src/markitdown/_uri_utils.py index 603da63e9..a17644eee 100644 --- a/packages/markitdown/src/markitdown/_uri_utils.py +++ b/packages/markitdown/src/markitdown/_uri_utils.py @@ -1,5 +1,4 @@ import base64 -import os from typing import Tuple, Dict from urllib.request import url2pathname from urllib.parse import urlparse, unquote_to_bytes @@ -12,7 +11,7 @@ def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]: raise ValueError(f"Not a file URL: {file_uri}") netloc = parsed.netloc if parsed.netloc else None - path = os.path.abspath(url2pathname(parsed.path)) + path = url2pathname(parsed.path) return netloc, path diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index e4437a582..e86dbc2ce 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -23,6 +23,11 @@ ) from ._epub_converter import EpubConverter from ._csv_converter import CsvConverter +from ._pdf_converter_with_ocr import PdfConverterWithOCR +from ._docx_converter_with_ocr import DocxConverterWithOCR +from ._xlsx_converter_with_ocr import XlsxConverterWithOCR +from ._pptx_converter_with_ocr import PptxConverterWithOCR +from ._ocr_service import MultiBackendOCRService, OCRBackend, OCRResult __all__ = [ "PlainTextConverter", @@ -45,4 +50,11 @@ "DocumentIntelligenceFileType", "EpubConverter", "CsvConverter", + "PdfConverterWithOCR", + "DocxConverterWithOCR", + "XlsxConverterWithOCR", + "PptxConverterWithOCR", + "MultiBackendOCRService", + "OCRBackend", + "OCRResult", ] diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter_with_ocr.py 
b/packages/markitdown/src/markitdown/converters/_docx_converter_with_ocr.py new file mode 100644 index 000000000..0978b4468 --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_docx_converter_with_ocr.py @@ -0,0 +1,183 @@ +""" +Enhanced DOCX Converter with OCR support for embedded images. +Extracts images from Word documents and performs OCR while maintaining context. +""" + +import io +import re +import sys +from typing import Any, BinaryIO, Optional + +from .._base_converter import DocumentConverterResult +from .._exceptions import MISSING_DEPENDENCY_MESSAGE, MissingDependencyException +from .._stream_info import StreamInfo +from ..converter_utils.docx.pre_process import pre_process_docx +from ._html_converter import HtmlConverter +from ._ocr_service import MultiBackendOCRService + +# Try loading dependencies +_dependency_exc_info = None +try: + import mammoth + from docx import Document +except ImportError: + _dependency_exc_info = sys.exc_info() + + +class DocxConverterWithOCR(HtmlConverter): + """ + Enhanced DOCX Converter with OCR support for embedded images. + Maintains document flow while extracting text from images inline. 
+ """ + + def __init__(self): + super().__init__() + self._html_converter = HtmlConverter() + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension == ".docx": + return True + + if mimetype.startswith( + "application/vnd.openxmlformats-officedocument.wordprocessingml" + ): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + if _dependency_exc_info is not None: + raise MissingDependencyException( + MISSING_DEPENDENCY_MESSAGE.format( + converter=type(self).__name__, + extension=".docx", + feature="docx", + ) + ) from _dependency_exc_info[1].with_traceback( + _dependency_exc_info[2] + ) # type: ignore[union-attr] + + # Get OCR service if available + ocr_service: Optional[MultiBackendOCRService] = kwargs.get("ocr_service") + + if ocr_service: + # Extract and OCR images before mammoth processing + file_stream.seek(0) + image_ocr_map = self._extract_and_ocr_images(file_stream, ocr_service) + + # Process with mammoth + file_stream.seek(0) + pre_process_stream = pre_process_docx(file_stream) + html_result = mammoth.convert_to_html( + pre_process_stream, style_map=kwargs.get("style_map") + ).value + + # Inject OCR results into HTML + html_with_ocr = self._inject_ocr_into_html(html_result, image_ocr_map) + + return self._html_converter.convert_string(html_with_ocr, **kwargs) + else: + # Standard conversion without OCR + style_map = kwargs.get("style_map", None) + pre_process_stream = pre_process_docx(file_stream) + return self._html_converter.convert_string( + mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, + **kwargs, + ) + + def _extract_and_ocr_images( + self, file_stream: BinaryIO, ocr_service: MultiBackendOCRService + ) -> dict[str, str]: + """ + Extract images from DOCX and 
OCR them. + + Args: + file_stream: DOCX file stream + ocr_service: OCR service to use + + Returns: + Dict mapping image relationship IDs to OCR text + """ + ocr_map = {} + + try: + file_stream.seek(0) + doc = Document(file_stream) + + # Extract images from document relationships + for rel in doc.part.rels.values(): + if "image" in rel.target_ref.lower(): + try: + image_part = rel.target_part + image_bytes = image_part.blob + + # Create stream for OCR + image_stream = io.BytesIO(image_bytes) + + # Perform OCR + ocr_result = ocr_service.extract_text(image_stream) + + if ocr_result.text.strip(): + # Store with relationship ID using consistent format + ocr_text = f"\n[Image OCR: {rel.rId}]\n{ocr_result.text}\n[End Image OCR]\n" + ocr_map[rel.rId] = ocr_text + + except Exception: + continue + + except Exception: + pass + + return ocr_map + + def _inject_ocr_into_html(self, html: str, ocr_map: dict[str, str]) -> str: + """ + Replace image tags with OCR text inline (no base64 images). + + Args: + html: HTML content from mammoth + ocr_map: Map of image IDs to OCR text + + Returns: + HTML with images replaced by OCR text + """ + if not ocr_map: + return html + + # Create a list of OCR texts and track which ones we've used + ocr_texts = list(ocr_map.values()) + used_indices = [] + + def replace_img(match): + # Replace the entire image tag with OCR text (no base64!) + for i, ocr_text in enumerate(ocr_texts): + if i not in used_indices: + used_indices.append(i) + # Return just the OCR text as a paragraph, no image + return f"

{ocr_text}

" + return "" # Remove image if no OCR text available + + # Replace ALL img tags (including base64) with OCR text + result = re.sub(r"]*>", replace_img, html) + + # If there are remaining OCR texts (images that weren't in HTML), append them + remaining_ocr = [ + ocr_texts[i] for i in range(len(ocr_texts)) if i not in used_indices + ] + if remaining_ocr: + result += f"

{''.join(remaining_ocr)}

" + + return result diff --git a/packages/markitdown/src/markitdown/converters/_ocr_service.py b/packages/markitdown/src/markitdown/converters/_ocr_service.py new file mode 100644 index 000000000..bf8d7b2c0 --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_ocr_service.py @@ -0,0 +1,248 @@ +""" +OCR Service Layer for MarkItDown +Provides unified interface for multiple OCR backends with graceful fallback. +""" + +import base64 +from dataclasses import dataclass +from enum import Enum +from typing import Any, BinaryIO, Optional, Protocol + +from .._stream_info import StreamInfo + + +class OCRBackend(str, Enum): + """Supported OCR backends.""" + + LLM_VISION = "llm_vision" + AZURE_DOC_INTEL = "azure_doc_intel" + + +@dataclass +class OCRResult: + """Result from OCR extraction.""" + + text: str + confidence: Optional[float] = None + language: Optional[str] = None + backend_used: Optional[str] = None + error: Optional[str] = None + + +class OCRService(Protocol): + """Protocol for OCR services.""" + + def extract_text(self, image_stream: BinaryIO, **kwargs: Any) -> OCRResult: + """Extract text from an image stream.""" + ... + + +class LLMVisionOCRService: + """OCR service using LLM vision models (OpenAI-compatible).""" + + def __init__(self, client: Any, model: str, default_prompt: Optional[str] = None): + """ + Initialize LLM Vision OCR service. + + Args: + client: OpenAI-compatible client + model: Model name (e.g., 'gpt-4o', 'gemini-2.0-flash') + default_prompt: Default prompt for OCR extraction + """ + self.client = client + self.model = model + self.default_prompt = default_prompt or ( + "Extract all text from this image. " + "Return ONLY the extracted text, maintaining the original layout and order. " + "Do not add any commentary or description." 
+ ) + + def extract_text( + self, + image_stream: BinaryIO, + prompt: Optional[str] = None, + stream_info: Optional[StreamInfo] = None, + **kwargs: Any, + ) -> OCRResult: + """Extract text using LLM vision.""" + if self.client is None: + return OCRResult( + text="", backend_used="llm_vision", error="LLM client not configured" + ) + + try: + # Reset stream position + image_stream.seek(0) + + # Get content type + content_type = None + if stream_info: + content_type = stream_info.mimetype + + if not content_type: + # Guess from stream + try: + from PIL import Image + + image_stream.seek(0) + img = Image.open(image_stream) + fmt = img.format.lower() if img.format else "png" + content_type = f"image/{fmt}" + except Exception: + content_type = "image/png" + + # Convert to base64 + image_stream.seek(0) + base64_image = base64.b64encode(image_stream.read()).decode("utf-8") + data_uri = f"data:{content_type};base64,{base64_image}" + + # Prepare message + actual_prompt = prompt or self.default_prompt + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": actual_prompt}, + { + "type": "image_url", + "image_url": {"url": data_uri}, + }, + ], + } + ] + + # Call LLM (handle both sync and async clients) + import asyncio + import inspect + + result = self.client.chat.completions.create( + model=self.model, messages=messages + ) + + # If result is a coroutine, we need to run it in an event loop + if inspect.iscoroutine(result): + # Try to get the running event loop, or create a new one + try: + asyncio.get_running_loop() + # We're already in an async context, but this is a sync function + # This shouldn't happen in normal usage + raise RuntimeError( + "Cannot use async LLM client in sync OCR context" + ) + except RuntimeError: + # No running loop, create a new one (this is the normal case) + response = asyncio.run(result) + else: + response = result + + text = response.choices[0].message.content + + return OCRResult( + text=text.strip() if text else "", + 
backend_used="llm_vision", + confidence=None, # LLMs don't provide confidence scores + ) + except Exception as e: + return OCRResult(text="", backend_used="llm_vision", error=str(e)) + finally: + # Reset stream position + image_stream.seek(0) + + +class MultiBackendOCRService: + """ + OCR service with multiple backends and fallback strategy. + Tries backends in order until one succeeds. + """ + + def __init__( + self, + backends: Optional[list[OCRBackend]] = None, + llm_client: Any = None, + llm_model: Optional[str] = None, + llm_prompt: Optional[str] = None, + ): + """ + Initialize multi-backend OCR service. + + Args: + backends: List of backends to try in order + llm_client: OpenAI-compatible client for LLM vision + llm_model: Model name for LLM vision + llm_prompt: Default prompt for LLM vision + """ + # Default backend: LLM Vision + self.backends = backends or [OCRBackend.LLM_VISION] + + # Initialize backend services + self.services: dict[OCRBackend, OCRService] = {} + + # LLM Vision + if OCRBackend.LLM_VISION in self.backends: + if llm_client and llm_model: + self.services[OCRBackend.LLM_VISION] = LLMVisionOCRService( + client=llm_client, model=llm_model, default_prompt=llm_prompt + ) + + def extract_text( + self, + image_stream: BinaryIO, + prompt: Optional[str] = None, + stream_info: Optional[StreamInfo] = None, + min_text_length: int = 3, + **kwargs: Any, + ) -> OCRResult: + """ + Extract text using multiple backends with fallback. 
+ + Args: + image_stream: Image stream to extract text from + prompt: Optional prompt for LLM-based OCR + stream_info: Stream information for the image + min_text_length: Minimum text length to consider successful + **kwargs: Additional arguments + + Returns: + OCRResult with extracted text and metadata + """ + last_error = None + + for backend in self.backends: + service = self.services.get(backend) + if service is None: + continue + + try: + # Reset stream position before each attempt + image_stream.seek(0) + + # Extract text + if backend == OCRBackend.LLM_VISION: + result = service.extract_text( + image_stream, prompt=prompt, stream_info=stream_info + ) + else: + result = service.extract_text(image_stream) + + # Check if extraction was successful + if ( + result.text + and len(result.text) >= min_text_length + and not result.error + ): + return result + + # Store error for potential reporting + if result.error: + last_error = result.error + + except Exception as e: + last_error = str(e) + continue + + # All backends failed + return OCRResult( + text="", + backend_used="none", + error=f"All OCR backends failed. Last error: {last_error}", + ) diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter_with_ocr.py b/packages/markitdown/src/markitdown/converters/_pdf_converter_with_ocr.py new file mode 100644 index 000000000..cbc80e39d --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter_with_ocr.py @@ -0,0 +1,388 @@ +""" +Enhanced PDF Converter with OCR support for embedded images. +Extracts images from PDFs and performs OCR while maintaining document context. 
+""" + +import io +import sys +from typing import Any, BinaryIO + +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._exceptions import MISSING_DEPENDENCY_MESSAGE, MissingDependencyException +from .._stream_info import StreamInfo +from ._ocr_service import MultiBackendOCRService + +# Import dependencies +_dependency_exc_info = None +try: + import pdfminer + import pdfminer.high_level + import pdfplumber + from PIL import Image +except ImportError: + _dependency_exc_info = sys.exc_info() + + +def _extract_images_from_page(page: Any) -> list[dict]: + """ + Extract images from a PDF page by rendering page regions. + + Returns: + List of dicts with 'stream', 'bbox', 'name', 'y_pos' keys + """ + images_info = [] + + try: + # Try multiple methods to detect images + images = [] + + # Method 1: Use page.images (standard approach) + if hasattr(page, "images") and page.images: + images = page.images + + # Method 2: If no images found, try underlying PDF objects + if not images and hasattr(page, "objects") and "image" in page.objects: + images = page.objects.get("image", []) + + # Method 3: Try filtering all objects for image types + if not images and hasattr(page, "objects"): + all_objs = page.objects + for obj_type in all_objs.keys(): + if "image" in obj_type.lower() or "xobject" in obj_type.lower(): + potential_imgs = all_objs.get(obj_type, []) + if potential_imgs: + images = potential_imgs + break + + for i, img_dict in enumerate(images): + try: + # Try to get the actual image stream from the PDF + img_stream = None + y_pos = 0 + + # Method A: If img_dict has 'stream' key, use it directly + if "stream" in img_dict and hasattr(img_dict["stream"], "get_data"): + try: + img_bytes = img_dict["stream"].get_data() + + # Try to open as PIL Image to validate/decode + pil_img = Image.open(io.BytesIO(img_bytes)) + + # Convert to RGB if needed (handle CMYK, etc.) 
+ if pil_img.mode not in ("RGB", "L"): + pil_img = pil_img.convert("RGB") + + # Save to stream as PNG + img_stream = io.BytesIO() + pil_img.save(img_stream, format="PNG") + img_stream.seek(0) + + y_pos = img_dict.get("top", 0) + except Exception: + pass + + # Method B: Fallback to rendering page region + if img_stream is None: + x0 = img_dict.get("x0", 0) + y0 = img_dict.get("top", 0) + x1 = img_dict.get("x1", 0) + y1 = img_dict.get("bottom", 0) + y_pos = y0 + + # Check if dimensions are valid + if x1 <= x0 or y1 <= y0: + continue + + # Use pdfplumber's within_bbox to crop, then render + # This preserves coordinate system correctly + bbox = (x0, y0, x1, y1) + cropped_page = page.within_bbox(bbox) + + # Render at 150 DPI (balance between quality and size) + page_img = cropped_page.to_image(resolution=150) + + # Save to stream + img_stream = io.BytesIO() + page_img.original.save(img_stream, format="PNG") + img_stream.seek(0) + + if img_stream: + images_info.append( + { + "stream": img_stream, + "name": f"page_{page.page_number}_img_{i}", + "y_pos": y_pos, + } + ) + + except Exception: + continue + + except Exception: + pass + + return images_info + + +class PdfConverterWithOCR(DocumentConverter): + """ + Enhanced PDF Converter with OCR support for embedded images. + Maintains document structure while extracting text from images inline. 
+ """ + + def __init__(self): + super().__init__() + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension == ".pdf": + return True + + if mimetype.startswith("application/pdf") or mimetype.startswith( + "application/x-pdf" + ): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + if _dependency_exc_info is not None: + raise MissingDependencyException( + MISSING_DEPENDENCY_MESSAGE.format( + converter=type(self).__name__, + extension=".pdf", + feature="pdf", + ) + ) from _dependency_exc_info[1].with_traceback( + _dependency_exc_info[2] + ) # type: ignore[union-attr] + + # Get OCR service if available + ocr_service: MultiBackendOCRService | None = kwargs.get("ocr_service") + + # Read PDF into BytesIO + file_stream.seek(0) + pdf_bytes = io.BytesIO(file_stream.read()) + + markdown_content = [] + + try: + with pdfplumber.open(pdf_bytes) as pdf: + for page_num, page in enumerate(pdf.pages, 1): + markdown_content.append(f"\n## Page {page_num}\n") + + # If OCR is enabled, interleave text and images by position + if ocr_service: + images_on_page = self._extract_page_images(pdf_bytes, page_num) + + if images_on_page: + # Extract text lines with Y positions + chars = page.chars + if chars: + # Group chars into lines based on Y position + lines_with_y = [] + current_line = [] + current_y = None + + for char in sorted( + chars, key=lambda c: (c["top"], c["x0"]) + ): + y = char["top"] + if current_y is None: + current_y = y + elif abs(y - current_y) > 2: # New line threshold + if current_line: + text = "".join( + [c["text"] for c in current_line] + ) + lines_with_y.append( + {"y": current_y, "text": text.strip()} + ) + current_line = [] + current_y = y + current_line.append(char) + + # Add last line + if 
current_line: + text = "".join([c["text"] for c in current_line]) + lines_with_y.append( + {"y": current_y, "text": text.strip()} + ) + else: + # Fallback: use simple text extraction + text_content = page.extract_text() or "" + lines_with_y = [ + {"y": i * 10, "text": line} + for i, line in enumerate(text_content.split("\n")) + ] + + # OCR all images + image_data = [] + for img_info in images_on_page: + ocr_result = ocr_service.extract_text( + img_info["stream"] + ) + if ocr_result.text.strip(): + image_data.append( + { + "y_pos": img_info["y_pos"], + "name": img_info["name"], + "ocr_text": ocr_result.text, + "backend": ocr_result.backend_used, + "type": "image", + } + ) + + # Add text items + content_items = [ + { + "y_pos": item["y"], + "text": item["text"], + "type": "text", + } + for item in lines_with_y + if item["text"] + ] + content_items.extend(image_data) + + # Sort all items by Y position (top to bottom) + content_items.sort(key=lambda x: x["y_pos"]) + + # Build markdown by interleaving text and images + for item in content_items: + if item["type"] == "text": + markdown_content.append(item["text"]) + else: # image + # Use consistent OCR format + img_marker = f"\n\n[Image OCR: {item['name']}]\n" + img_marker += f"{item['ocr_text']}\n" + img_marker += "[End Image OCR]\n" + markdown_content.append(img_marker) + else: + # No images detected - just extract regular text + text_content = page.extract_text() or "" + if text_content.strip(): + markdown_content.append(text_content.strip()) + else: + # No OCR, just extract text + text_content = page.extract_text() or "" + if text_content.strip(): + markdown_content.append(text_content.strip()) + + # Build final markdown + markdown = "\n\n".join(markdown_content).strip() + + # Fallback to pdfminer if empty + if not markdown: + pdf_bytes.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_bytes) + + except Exception: + # Fallback to pdfminer + try: + pdf_bytes.seek(0) + markdown = 
pdfminer.high_level.extract_text(pdf_bytes) + except Exception: + markdown = "" + + # Final fallback: If still empty/whitespace and OCR is available, + # treat as scanned PDF and OCR full pages + if ocr_service and (not markdown or not markdown.strip()): + pdf_bytes.seek(0) + markdown = self._ocr_full_pages(pdf_bytes, ocr_service) + + return DocumentConverterResult(markdown=markdown) + + def _extract_page_images(self, pdf_bytes: io.BytesIO, page_num: int) -> list[dict]: + """ + Extract images from a PDF page using pdfplumber. + + Args: + pdf_bytes: PDF file as BytesIO + page_num: Page number (1-indexed) + + Returns: + List of image info dicts with 'stream', 'bbox', 'name', 'y_pos' + """ + images = [] + + try: + pdf_bytes.seek(0) + with pdfplumber.open(pdf_bytes) as pdf: + if page_num <= len(pdf.pages): + page = pdf.pages[page_num - 1] # 0-indexed + images = _extract_images_from_page(page) + except Exception: + pass + + # Sort by vertical position (top to bottom) + images.sort(key=lambda x: x["y_pos"]) + + return images + + def _ocr_full_pages( + self, pdf_bytes: io.BytesIO, ocr_service: MultiBackendOCRService + ) -> str: + """ + Fallback for scanned PDFs: Convert entire pages to images and OCR them. + Used when text extraction returns empty/whitespace results. 
+ + Args: + pdf_bytes: PDF file as BytesIO + ocr_service: OCR service to use + + Returns: + Markdown text extracted from OCR of full pages + """ + markdown_parts = [] + + try: + pdf_bytes.seek(0) + with pdfplumber.open(pdf_bytes) as pdf: + for page_num, page in enumerate(pdf.pages, 1): + try: + markdown_parts.append(f"\n## Page {page_num}\n") + + # Render page to image at high resolution for better OCR + page_img = page.to_image(resolution=300) + img_stream = io.BytesIO() + page_img.original.save(img_stream, format="PNG") + img_stream.seek(0) + + # Run OCR on the full page image + ocr_result = ocr_service.extract_text(img_stream) + + if ocr_result.text.strip(): + # Use consistent OCR format for scanned pages + markdown_parts.append( + f"[Image OCR: page_{page_num}_fullpage]\n" + ) + markdown_parts.append(ocr_result.text.strip()) + markdown_parts.append("\n[End Image OCR]\n") + else: + markdown_parts.append( + "*[No text could be extracted from this page]*" + ) + + except Exception as e: + markdown_parts.append( + f"*[Error processing page {page_num}: {str(e)}]*" + ) + continue + + except Exception: + return "*[Error: Could not process scanned PDF]*" + + return "\n\n".join(markdown_parts).strip() diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter_with_ocr.py b/packages/markitdown/src/markitdown/converters/_pptx_converter_with_ocr.py new file mode 100644 index 000000000..210d6469c --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter_with_ocr.py @@ -0,0 +1,263 @@ +""" +Enhanced PPTX Converter with improved OCR support. +Already has LLM-based image description, this enhances it with traditional OCR fallback. 
+""" + +import io +import sys +from typing import Any, BinaryIO, Optional + +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._exceptions import MISSING_DEPENDENCY_MESSAGE, MissingDependencyException +from .._stream_info import StreamInfo +from ._html_converter import HtmlConverter +from ._ocr_service import MultiBackendOCRService + +_dependency_exc_info = None +try: + import pptx +except ImportError: + _dependency_exc_info = sys.exc_info() + + +class PptxConverterWithOCR(DocumentConverter): + """Enhanced PPTX Converter with OCR fallback.""" + + def __init__(self): + super().__init__() + self._html_converter = HtmlConverter() + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension == ".pptx": + return True + + if mimetype.startswith( + "application/vnd.openxmlformats-officedocument.presentationml" + ): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + if _dependency_exc_info is not None: + raise MissingDependencyException( + MISSING_DEPENDENCY_MESSAGE.format( + converter=type(self).__name__, + extension=".pptx", + feature="pptx", + ) + ) from _dependency_exc_info[1].with_traceback( + _dependency_exc_info[2] + ) # type: ignore[union-attr] + + # Get OCR service + ocr_service: Optional[MultiBackendOCRService] = kwargs.get("ocr_service") + llm_client = kwargs.get("llm_client") + + presentation = pptx.Presentation(file_stream) + md_content = "" + slide_num = 0 + + for slide in presentation.slides: + slide_num += 1 + md_content += f"\\n\\n\\n" + + title = slide.shapes.title + + def get_shape_content(shape, **kwargs): + nonlocal md_content + + # Pictures + if self._is_picture(shape): + # Get image data + image_stream = io.BytesIO(shape.image.blob) + + # Try LLM 
description first if available + llm_description = "" + if llm_client and kwargs.get("llm_model"): + try: + from ._llm_caption import llm_caption + + image_filename = shape.image.filename + image_extension = None + if image_filename: + import os + + image_extension = os.path.splitext(image_filename)[1] + + image_stream_info = StreamInfo( + mimetype=shape.image.content_type, + extension=image_extension, + filename=image_filename, + ) + + llm_description = llm_caption( + image_stream, + image_stream_info, + client=llm_client, + model=kwargs.get("llm_model"), + prompt=kwargs.get("llm_prompt"), + ) + except Exception: + pass + + # Try OCR if LLM failed or not available + ocr_text = "" + if not llm_description and ocr_service: + try: + image_stream.seek(0) + ocr_result = ocr_service.extract_text(image_stream) + if ocr_result.text.strip(): + ocr_text = ocr_result.text.strip() + except Exception: + pass + + # Get alt text from slide + alt_text = "" + try: + alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") + except Exception: + pass + + # Use consistent OCR format + shape_identifier = f"slide_{slide_num}_img_{shape.name}" + + if ocr_text: + # Use consistent OCR format + md_content += f"\\n[Image OCR: {shape_identifier}]\\n" + md_content += f"{ocr_text}\\n" + md_content += "[End Image OCR]\\n" + elif llm_description: + # LLM description available + md_content += f"\\n[Image OCR: {shape_identifier}]\\n" + md_content += f"{llm_description}\\n" + md_content += "[End Image OCR]\\n" + elif alt_text: + # Only alt text available + md_content += f"\\n[Image: {shape_identifier}]\\n" + md_content += f"{alt_text}\\n" + md_content += "[End Image]\\n" + + # Tables + if self._is_table(shape): + md_content += self._convert_table_to_markdown(shape.table, **kwargs) + + # Charts + if shape.has_chart: + md_content += self._convert_chart_to_markdown(shape.chart) + + # Text areas + elif shape.has_text_frame: + if shape == title: + md_content += "# " + shape.text.lstrip() + "\\n" 
+ else: + md_content += shape.text + "\\n" + + # Group Shapes + if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP: + sorted_shapes = sorted( + shape.shapes, + key=lambda x: ( + float("-inf") if not x.top else x.top, + float("-inf") if not x.left else x.left, + ), + ) + for subshape in sorted_shapes: + get_shape_content(subshape, **kwargs) + + sorted_shapes = sorted( + slide.shapes, + key=lambda x: ( + float("-inf") if not x.top else x.top, + float("-inf") if not x.left else x.left, + ), + ) + for shape in sorted_shapes: + get_shape_content(shape, **kwargs) + + md_content = md_content.strip() + + if slide.has_notes_slide: + md_content += "\\n\\n### Notes:\\n" + notes_frame = slide.notes_slide.notes_text_frame + if notes_frame is not None: + md_content += notes_frame.text + md_content = md_content.strip() + + return DocumentConverterResult(markdown=md_content.strip()) + + def _is_picture(self, shape): + if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: + return True + if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER: + if hasattr(shape, "image"): + return True + return False + + def _is_table(self, shape): + if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE: + return True + return False + + def _convert_table_to_markdown(self, table, **kwargs): + import html + + html_table = "" + first_row = True + for row in table.rows: + html_table += "" + for cell in row.cells: + if first_row: + html_table += "" + else: + html_table += "" + html_table += "" + first_row = False + html_table += "
" + html.escape(cell.text) + "" + html.escape(cell.text) + "
" + + return ( + self._html_converter.convert_string(html_table, **kwargs).markdown.strip() + + "\\n" + ) + + def _convert_chart_to_markdown(self, chart): + try: + md = "\\n\\n### Chart" + if chart.has_title: + md += f": {chart.chart_title.text_frame.text}" + md += "\\n\\n" + data = [] + category_names = [c.label for c in chart.plots[0].categories] + series_names = [s.name for s in chart.series] + data.append(["Category"] + series_names) + + for idx, category in enumerate(category_names): + row = [category] + for series in chart.series: + row.append(series.values[idx]) + data.append(row) + + markdown_table = [] + for row in data: + markdown_table.append("| " + " | ".join(map(str, row)) + " |") + header = markdown_table[0] + separator = "|" + "|".join(["---"] * len(data[0])) + "|" + return md + "\\n".join([header, separator] + markdown_table[1:]) + except ValueError as e: + if "unsupported plot type" in str(e): + return "\\n\\n[unsupported chart]\\n\\n" + except Exception: + return "\\n\\n[unsupported chart]\\n\\n" diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter_with_ocr.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter_with_ocr.py new file mode 100644 index 000000000..f4fcae4c7 --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter_with_ocr.py @@ -0,0 +1,222 @@ +""" +Enhanced XLSX Converter with OCR support for embedded images. +Extracts images from Excel spreadsheets and performs OCR while maintaining cell context. 
+""" + +import io +import sys +from typing import Any, BinaryIO, Optional + +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._exceptions import MISSING_DEPENDENCY_MESSAGE, MissingDependencyException +from .._stream_info import StreamInfo +from ._html_converter import HtmlConverter +from ._ocr_service import MultiBackendOCRService + +# Try loading dependencies +_xlsx_dependency_exc_info = None +try: + import pandas as pd + from openpyxl import load_workbook +except ImportError: + _xlsx_dependency_exc_info = sys.exc_info() + + +class XlsxConverterWithOCR(DocumentConverter): + """ + Enhanced XLSX Converter with OCR support for embedded images. + Extracts images with their cell positions and performs OCR. + """ + + def __init__(self): + super().__init__() + self._html_converter = HtmlConverter() + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension == ".xlsx": + return True + + if mimetype.startswith( + "application/vnd.openxmlformats-officedocument.spreadsheetml" + ): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + if _xlsx_dependency_exc_info is not None: + raise MissingDependencyException( + MISSING_DEPENDENCY_MESSAGE.format( + converter=type(self).__name__, + extension=".xlsx", + feature="xlsx", + ) + ) from _xlsx_dependency_exc_info[1].with_traceback( + _xlsx_dependency_exc_info[2] + ) # type: ignore[union-attr] + + # Get OCR service if available + ocr_service: Optional[MultiBackendOCRService] = kwargs.get("ocr_service") + + if ocr_service: + # Remove ocr_service from kwargs to avoid duplicate argument error + kwargs_without_ocr = {k: v for k, v in kwargs.items() if k != "ocr_service"} + return self._convert_with_ocr( + file_stream, ocr_service, 
**kwargs_without_ocr + ) + else: + return self._convert_standard(file_stream, **kwargs) + + def _convert_standard( + self, file_stream: BinaryIO, **kwargs: Any + ) -> DocumentConverterResult: + """Standard conversion without OCR.""" + file_stream.seek(0) + sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") + md_content = "" + + for sheet_name in sheets: + md_content += f"## {sheet_name}\n" + html_content = sheets[sheet_name].to_html(index=False) + md_content += ( + self._html_converter.convert_string( + html_content, **kwargs + ).markdown.strip() + + "\n\n" + ) + + return DocumentConverterResult(markdown=md_content.strip()) + + def _convert_with_ocr( + self, file_stream: BinaryIO, ocr_service: MultiBackendOCRService, **kwargs: Any + ) -> DocumentConverterResult: + """Convert XLSX with image OCR.""" + file_stream.seek(0) + wb = load_workbook(file_stream) + + md_content = "" + + for sheet_name in wb.sheetnames: + sheet = wb[sheet_name] + md_content += f"## {sheet_name}\n\n" + + # Convert sheet data to markdown table + file_stream.seek(0) + try: + df = pd.read_excel( + file_stream, sheet_name=sheet_name, engine="openpyxl" + ) + html_content = df.to_html(index=False) + md_content += ( + self._html_converter.convert_string( + html_content, **kwargs + ).markdown.strip() + + "\n\n" + ) + except Exception: + # If pandas fails, just skip the table + pass + + # Extract and OCR images in this sheet + images_with_ocr = self._extract_and_ocr_sheet_images(sheet, ocr_service) + + if images_with_ocr: + md_content += "### Images in this sheet:\n\n" + for img_info in images_with_ocr: + cell_ref = img_info["cell_ref"] + ocr_text = img_info["ocr_text"] + md_content += f"**Image near cell {cell_ref}:**\n" + md_content += f"{ocr_text}\n\n" + + return DocumentConverterResult(markdown=md_content.strip()) + + def _extract_and_ocr_sheet_images( + self, sheet: Any, ocr_service: MultiBackendOCRService + ) -> list[dict]: + """ + Extract and OCR images from an Excel sheet. 
+ + Args: + sheet: openpyxl worksheet + ocr_service: OCR service + + Returns: + List of dicts with 'cell_ref' and 'ocr_text' + """ + results = [] + + try: + # Check if sheet has images + if hasattr(sheet, "_images"): + for img in sheet._images: + try: + # Get image data + if hasattr(img, "_data"): + image_data = img._data() + elif hasattr(img, "image"): + # Some versions store it differently + image_data = img.image + else: + continue + + # Create image stream + image_stream = io.BytesIO(image_data) + + # Get cell reference + cell_ref = "unknown" + if hasattr(img, "anchor"): + anchor = img.anchor + if hasattr(anchor, "_from"): + from_cell = anchor._from + if hasattr(from_cell, "col") and hasattr( + from_cell, "row" + ): + # Convert column number to letter + col_letter = self._column_number_to_letter( + from_cell.col + ) + cell_ref = f"{col_letter}{from_cell.row + 1}" + + # Perform OCR + ocr_result = ocr_service.extract_text(image_stream) + + if ocr_result.text.strip(): + results.append( + { + "cell_ref": cell_ref, + "ocr_text": ocr_result.text.strip(), + "backend": ocr_result.backend_used, + } + ) + + except Exception: + continue + + except Exception: + pass + + return results + + @staticmethod + def _column_number_to_letter(n: int) -> str: + """Convert column number to Excel column letter (0-indexed).""" + result = "" + n = n + 1 # Make 1-indexed + while n > 0: + n -= 1 + result = chr(65 + (n % 26)) + result + n //= 26 + return result diff --git a/packages/markitdown/tests/ocr_test_data/docx_complex_layout.docx b/packages/markitdown/tests/ocr_test_data/docx_complex_layout.docx new file mode 100644 index 000000000..4ddd69746 Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/docx_complex_layout.docx differ diff --git a/packages/markitdown/tests/ocr_test_data/docx_image_end.docx b/packages/markitdown/tests/ocr_test_data/docx_image_end.docx new file mode 100644 index 000000000..f2a9a8694 Binary files /dev/null and 
b/packages/markitdown/tests/ocr_test_data/docx_image_end.docx differ diff --git a/packages/markitdown/tests/ocr_test_data/docx_image_middle.docx b/packages/markitdown/tests/ocr_test_data/docx_image_middle.docx new file mode 100644 index 000000000..200f3c6c7 Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/docx_image_middle.docx differ diff --git a/packages/markitdown/tests/ocr_test_data/docx_image_start.docx b/packages/markitdown/tests/ocr_test_data/docx_image_start.docx new file mode 100644 index 000000000..7855bd166 Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/docx_image_start.docx differ diff --git a/packages/markitdown/tests/ocr_test_data/docx_multipage.docx b/packages/markitdown/tests/ocr_test_data/docx_multipage.docx new file mode 100644 index 000000000..c698b0fa2 Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/docx_multipage.docx differ diff --git a/packages/markitdown/tests/ocr_test_data/docx_multiple_images.docx b/packages/markitdown/tests/ocr_test_data/docx_multiple_images.docx new file mode 100644 index 000000000..790ce0bcb Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/docx_multiple_images.docx differ diff --git a/packages/markitdown/tests/ocr_test_data/html_complex_layout.html b/packages/markitdown/tests/ocr_test_data/html_complex_layout.html new file mode 100644 index 000000000..660fe1b0b --- /dev/null +++ b/packages/markitdown/tests/ocr_test_data/html_complex_layout.html @@ -0,0 +1,14 @@ + + +Complex HTML Document + +

Complex Layout

+ + + + +
ItemStatus
Task 1Complete
Task 2Pending
+Warning notice +

Additional information below the warning.

+ + \ No newline at end of file diff --git a/packages/markitdown/tests/ocr_test_data/html_image_end.html b/packages/markitdown/tests/ocr_test_data/html_image_end.html new file mode 100644 index 000000000..3a5e640b4 --- /dev/null +++ b/packages/markitdown/tests/ocr_test_data/html_image_end.html @@ -0,0 +1,11 @@ + + +HTML with Image at End + +

Content Page

+

Main content goes here.

+

More details and information.

+

Final paragraph.

+Footer image + + \ No newline at end of file diff --git a/packages/markitdown/tests/ocr_test_data/html_image_middle.html b/packages/markitdown/tests/ocr_test_data/html_image_middle.html new file mode 100644 index 000000000..2ab98cda9 --- /dev/null +++ b/packages/markitdown/tests/ocr_test_data/html_image_middle.html @@ -0,0 +1,12 @@ + + +HTML with Image in Middle + +

Article

+

This is the introduction paragraph.

+

We will see an infographic below.

+Stats infographic +

Analysis

+

This section comes after the image.

+ + \ No newline at end of file diff --git a/packages/markitdown/tests/ocr_test_data/html_image_start.html b/packages/markitdown/tests/ocr_test_data/html_image_start.html new file mode 100644 index 000000000..0e0b40d31 --- /dev/null +++ b/packages/markitdown/tests/ocr_test_data/html_image_start.html @@ -0,0 +1,10 @@ + + +HTML with Image at Start + +

Welcome

+Banner image +

This is the main content after the header image.

+

More content here.

+ + \ No newline at end of file diff --git a/packages/markitdown/tests/ocr_test_data/html_multiple_images.html b/packages/markitdown/tests/ocr_test_data/html_multiple_images.html new file mode 100644 index 000000000..091efc3c5 --- /dev/null +++ b/packages/markitdown/tests/ocr_test_data/html_multiple_images.html @@ -0,0 +1,12 @@ + + +HTML with Multiple Images + +

Financial Report

+

First section

+Revenue chart +

Second section

+Growth chart +

Conclusion

+ + \ No newline at end of file diff --git a/packages/markitdown/tests/ocr_test_data/pdf_complex_layout.pdf b/packages/markitdown/tests/ocr_test_data/pdf_complex_layout.pdf new file mode 100644 index 000000000..f843ab891 Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/pdf_complex_layout.pdf differ diff --git a/packages/markitdown/tests/ocr_test_data/pdf_image_end.pdf b/packages/markitdown/tests/ocr_test_data/pdf_image_end.pdf new file mode 100644 index 000000000..8b020edf6 Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/pdf_image_end.pdf differ diff --git a/packages/markitdown/tests/ocr_test_data/pdf_image_middle.pdf b/packages/markitdown/tests/ocr_test_data/pdf_image_middle.pdf new file mode 100644 index 000000000..d90bc9d3e Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/pdf_image_middle.pdf differ diff --git a/packages/markitdown/tests/ocr_test_data/pdf_image_start.pdf b/packages/markitdown/tests/ocr_test_data/pdf_image_start.pdf new file mode 100644 index 000000000..0b57b7f96 Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/pdf_image_start.pdf differ diff --git a/packages/markitdown/tests/ocr_test_data/pdf_multipage.pdf b/packages/markitdown/tests/ocr_test_data/pdf_multipage.pdf new file mode 100644 index 000000000..71ffe8d83 Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/pdf_multipage.pdf differ diff --git a/packages/markitdown/tests/ocr_test_data/pdf_multiple_images.pdf b/packages/markitdown/tests/ocr_test_data/pdf_multiple_images.pdf new file mode 100644 index 000000000..8a5e47416 Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/pdf_multiple_images.pdf differ diff --git a/packages/markitdown/tests/ocr_test_data/pdf_scanned_invoice.pdf b/packages/markitdown/tests/ocr_test_data/pdf_scanned_invoice.pdf new file mode 100644 index 000000000..5e1caacc5 Binary files /dev/null and 
b/packages/markitdown/tests/ocr_test_data/pdf_scanned_invoice.pdf differ diff --git a/packages/markitdown/tests/ocr_test_data/pdf_scanned_meeting_minutes.pdf b/packages/markitdown/tests/ocr_test_data/pdf_scanned_meeting_minutes.pdf new file mode 100644 index 000000000..33c717bed Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/pdf_scanned_meeting_minutes.pdf differ diff --git a/packages/markitdown/tests/ocr_test_data/pdf_scanned_minimal.pdf b/packages/markitdown/tests/ocr_test_data/pdf_scanned_minimal.pdf new file mode 100644 index 000000000..9410339e3 Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/pdf_scanned_minimal.pdf differ diff --git a/packages/markitdown/tests/ocr_test_data/pdf_scanned_report.pdf b/packages/markitdown/tests/ocr_test_data/pdf_scanned_report.pdf new file mode 100644 index 000000000..4c2112ff7 Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/pdf_scanned_report.pdf differ diff --git a/packages/markitdown/tests/ocr_test_data/pdf_scanned_sales_report.pdf b/packages/markitdown/tests/ocr_test_data/pdf_scanned_sales_report.pdf new file mode 100644 index 000000000..178c63826 Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/pdf_scanned_sales_report.pdf differ diff --git a/packages/markitdown/tests/ocr_test_data/pptx_complex_layout.pptx b/packages/markitdown/tests/ocr_test_data/pptx_complex_layout.pptx new file mode 100644 index 000000000..10467ea0e Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/pptx_complex_layout.pptx differ diff --git a/packages/markitdown/tests/ocr_test_data/pptx_image_end.pptx b/packages/markitdown/tests/ocr_test_data/pptx_image_end.pptx new file mode 100644 index 000000000..1ed9804cd Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/pptx_image_end.pptx differ diff --git a/packages/markitdown/tests/ocr_test_data/pptx_image_middle.pptx b/packages/markitdown/tests/ocr_test_data/pptx_image_middle.pptx new file 
mode 100644 index 000000000..315586a23 Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/pptx_image_middle.pptx differ diff --git a/packages/markitdown/tests/ocr_test_data/pptx_image_start.pptx b/packages/markitdown/tests/ocr_test_data/pptx_image_start.pptx new file mode 100644 index 000000000..32a50aa8c Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/pptx_image_start.pptx differ diff --git a/packages/markitdown/tests/ocr_test_data/pptx_multiple_images.pptx b/packages/markitdown/tests/ocr_test_data/pptx_multiple_images.pptx new file mode 100644 index 000000000..a8eaa4dee Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/pptx_multiple_images.pptx differ diff --git a/packages/markitdown/tests/ocr_test_data/xlsx_complex_layout.xlsx b/packages/markitdown/tests/ocr_test_data/xlsx_complex_layout.xlsx new file mode 100644 index 000000000..6052c1e30 Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/xlsx_complex_layout.xlsx differ diff --git a/packages/markitdown/tests/ocr_test_data/xlsx_image_end.xlsx b/packages/markitdown/tests/ocr_test_data/xlsx_image_end.xlsx new file mode 100644 index 000000000..3e26b33fd Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/xlsx_image_end.xlsx differ diff --git a/packages/markitdown/tests/ocr_test_data/xlsx_image_middle.xlsx b/packages/markitdown/tests/ocr_test_data/xlsx_image_middle.xlsx new file mode 100644 index 000000000..2a6c91b77 Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/xlsx_image_middle.xlsx differ diff --git a/packages/markitdown/tests/ocr_test_data/xlsx_image_start.xlsx b/packages/markitdown/tests/ocr_test_data/xlsx_image_start.xlsx new file mode 100644 index 000000000..9e461821a Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/xlsx_image_start.xlsx differ diff --git a/packages/markitdown/tests/ocr_test_data/xlsx_multiple_images.xlsx 
b/packages/markitdown/tests/ocr_test_data/xlsx_multiple_images.xlsx new file mode 100644 index 000000000..eb8d0cfe6 Binary files /dev/null and b/packages/markitdown/tests/ocr_test_data/xlsx_multiple_images.xlsx differ diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 8e3acc23d..717763dcc 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -221,35 +221,39 @@ def test_data_uris() -> None: def test_file_uris() -> None: + from urllib.request import url2pathname + + expected_path = url2pathname("/path/to/file.txt") + # Test file URI with an empty host file_uri = "file:///path/to/file.txt" netloc, path = file_uri_to_path(file_uri) assert netloc is None - assert path == "/path/to/file.txt" + assert path == expected_path # Test file URI with no host file_uri = "file:/path/to/file.txt" netloc, path = file_uri_to_path(file_uri) assert netloc is None - assert path == "/path/to/file.txt" + assert path == expected_path # Test file URI with localhost file_uri = "file://localhost/path/to/file.txt" netloc, path = file_uri_to_path(file_uri) assert netloc == "localhost" - assert path == "/path/to/file.txt" + assert path == expected_path # Test file URI with query parameters file_uri = "file:///path/to/file.txt?param=value" netloc, path = file_uri_to_path(file_uri) assert netloc is None - assert path == "/path/to/file.txt" + assert path == expected_path # Test file URI with fragment file_uri = "file:///path/to/file.txt#fragment" netloc, path = file_uri_to_path(file_uri) assert netloc is None - assert path == "/path/to/file.txt" + assert path == expected_path def test_docx_comments() -> None: diff --git a/packages/markitdown/tests/test_ocr.py b/packages/markitdown/tests/test_ocr.py new file mode 100644 index 000000000..42bbab96a --- /dev/null +++ b/packages/markitdown/tests/test_ocr.py @@ -0,0 +1,1011 @@ +""" +Test OCR functionality for markitdown converters. 
+ +Tests OCR text extraction from images embedded in PDF, DOCX, XLSX, and PPTX files. +Validates context preservation, multi-sheet processing, positioning accuracy, and text matching. +""" + +import sys +from pathlib import Path +from dataclasses import dataclass +from typing import Any + +import pytest + +# Mark all tests in this module as unittests +pytestmark = pytest.mark.unittests + +# Add src to path for direct imports during testing +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from markitdown.converters._ocr_service import MultiBackendOCRService, OCRBackend +from markitdown.converters._pdf_converter_with_ocr import PdfConverterWithOCR +from markitdown.converters._docx_converter_with_ocr import DocxConverterWithOCR +from markitdown.converters._xlsx_converter_with_ocr import XlsxConverterWithOCR +from markitdown.converters._pptx_converter_with_ocr import PptxConverterWithOCR +from markitdown._stream_info import StreamInfo + +# Check for optional dependencies +_skip_docx = False +try: + import mammoth # noqa: F401 + from docx import Document # noqa: F401 +except ImportError: + _skip_docx = True + +_skip_xlsx = False +try: + import pandas # noqa: F401 + from openpyxl import load_workbook # noqa: F401 +except ImportError: + _skip_xlsx = True + +_skip_pptx = False +try: + import pptx # noqa: F401 +except ImportError: + _skip_pptx = True + +# Test data directory +TEST_DATA_DIR = Path(__file__).parent / "ocr_test_data" + + +# ============================================================================== +# EXPECTED OCR RESULTS - Ground Truth for Validation +# ============================================================================== + + +@dataclass +class ImagePosition: + """Track expected position of an image in document.""" + + position: str # "start", "middle", "end" + page_or_sheet: int # Page number (PDF/DOCX) or sheet index (XLSX) + expected_text: str # Expected OCR text (partial match) + before_marker: str | None = None # Text that 
should appear before image + after_marker: str | None = None # Text that should appear after image + + +# Expected OCR results for test files +EXPECTED_OCR_RESULTS: dict[str, list[ImagePosition]] = { + # PDF Tests + "pdf_complex_layout.pdf": [ + ImagePosition( + position="middle", + page_or_sheet=1, + expected_text="WARNING", + before_marker="ItemQuantity", + after_marker="Widget A", + ) + ], + "pdf_image_end.pdf": [ + ImagePosition( + position="end", + page_or_sheet=1, + expected_text="WARNING", + before_marker="Keep reading", + after_marker=None, + ) + ], + "pdf_image_start.pdf": [ + ImagePosition( + position="start", + page_or_sheet=1, + expected_text="WARNING", + before_marker=None, + after_marker="This is text", + ) + ], + "pdf_image_middle.pdf": [ + ImagePosition( + position="middle", + page_or_sheet=1, + expected_text="WARNING", + before_marker="introductory text", + after_marker="Section 2", + ) + ], + "pdf_multiple_images.pdf": [ + ImagePosition( + position="middle", + page_or_sheet=1, + expected_text="WARNING", + before_marker="Multiple Images", + after_marker="Text between", + ), + ImagePosition( + position="middle", + page_or_sheet=1, + expected_text="NOTICE", + before_marker="Text between", + after_marker="Final text", + ), + ], + "pdf_multipage.pdf": [ + ImagePosition( + position="middle", + page_or_sheet=1, + expected_text="WARNING", + before_marker="BEFORE the image", + after_marker="AFTER the image", + ), + ImagePosition( + position="end", + page_or_sheet=2, + expected_text="NOTICE", + before_marker="Final paragraph", + after_marker=None, + ), + ImagePosition( + position="start", + page_or_sheet=3, + expected_text="Contact", + before_marker=None, + after_marker="Content that follows", + ), + ], + # DOCX Tests + "docx_complex_layout.docx": [ + ImagePosition( + position="middle", + page_or_sheet=1, + expected_text="WARNING", + before_marker="Security notice", + after_marker=None, + ) + ], + "docx_image_end.docx": [ + ImagePosition( + position="end", 
+ page_or_sheet=1, + expected_text="WARNING", + before_marker="Recommendations", + after_marker=None, + ) + ], + "docx_image_start.docx": [ + ImagePosition( + position="start", + page_or_sheet=1, + expected_text="WARNING", + before_marker=None, + after_marker="main content", + ) + ], + "docx_image_middle.docx": [ + ImagePosition( + position="middle", + page_or_sheet=1, + expected_text="WARNING", + before_marker="see an image below", + after_marker="Analysis", + ) + ], + "docx_multiple_images.docx": [ + ImagePosition( + position="middle", + page_or_sheet=1, + expected_text="WARNING", + before_marker="First section", + after_marker="Second section", + ), + ImagePosition( + position="middle", + page_or_sheet=1, + expected_text="NOTICE", + before_marker="Second section", + after_marker="Conclusion", + ), + ], + "docx_multipage.docx": [ + ImagePosition( + position="middle", + page_or_sheet=1, + expected_text="WARNING", + before_marker="BEFORE IMAGE", + after_marker="AFTER IMAGE", + ), + ImagePosition( + position="end", + page_or_sheet=2, + expected_text="NOTICE", + before_marker="Final paragraph", + after_marker=None, + ), + ImagePosition( + position="start", + page_or_sheet=3, + expected_text="Contact", + before_marker=None, + after_marker="Content that follows", + ), + ], +} + + +def validate_image_position( + markdown: str, image_pos: ImagePosition, verbose: bool = False +) -> tuple[bool, str]: + """ + Validate that an image appears at the expected position with expected text. 
+ + Returns: + Tuple of (success: bool, message: str) + """ + # Check expected text is present + if image_pos.expected_text not in markdown: + return ( + False, + f"Expected OCR text '{image_pos.expected_text}' not found in output", + ) + + # Get position of expected text + text_idx = markdown.index(image_pos.expected_text) + + # Validate position relative to markers + if image_pos.before_marker: + if image_pos.before_marker not in markdown: + return False, f"Before marker '{image_pos.before_marker}' not found" + before_idx = markdown.index(image_pos.before_marker) + if before_idx >= text_idx: + return ( + False, + f"Image text (pos {text_idx}) should appear AFTER before marker (pos {before_idx})", + ) + + if image_pos.after_marker: + if image_pos.after_marker not in markdown: + return False, f"After marker '{image_pos.after_marker}' not found" + after_idx = markdown.index(image_pos.after_marker) + if text_idx >= after_idx: + return ( + False, + f"Image text (pos {text_idx}) should appear BEFORE after marker (pos {after_idx})", + ) + + # Build success message + msg_parts = [f"Image at {image_pos.position} position validated"] + if image_pos.before_marker: + before_idx = markdown.index(image_pos.before_marker) + msg_parts.append(f"before_marker(pos:{before_idx})") + msg_parts.append(f"image(pos:{text_idx})") + if image_pos.after_marker: + after_idx = markdown.index(image_pos.after_marker) + msg_parts.append(f"after_marker(pos:{after_idx})") + + return True, " -> ".join(msg_parts) + + +def validate_no_base64_images(markdown: str) -> tuple[bool, str]: + """Validate that no base64 encoded images are present in output.""" + if "data:image" in markdown or "base64" in markdown: + return False, "Base64 images found in output (should be replaced with OCR text)" + return True, "No base64 images found" + + +class MockOCRService: + """Mock OCR service for testing without external dependencies.""" + + def __init__(self): + # Predefined OCR results that cycle through + 
self.results_queue = [ + "WARNING: Security Alert", + "NOTICE: SSL Certificate Expiring", + "Contact Information: support@example.com", + "START OF DOCUMENT", + "MIDDLE SECTION CONTENT", + "FOOTER: End of Document", + "Image 1: First Image Content", + "Image 2: Second Image Content", + "Sales Chart Q4 2024", + "System Architecture Diagram", + "Invoice #12345\nDate: 2024-01-15\nTotal: $1,234.56", + "Annual Report 2024\nRevenue Growth: 25%", + "Meeting Minutes\nDate: 2024-02-01\nAttendees: Team A", + "Sales Performance Report\nQ4 Results", + "Minimal Test Document", + ] + self.call_count = 0 + + def extract_text(self, image_stream, **kwargs): + """Mock text extraction that cycles through predefined results.""" + from markitdown.converters._ocr_service import OCRResult + + # Cycle through results based on call count + text = self.results_queue[self.call_count % len(self.results_queue)] + self.call_count += 1 + + return OCRResult( + text=text, + confidence=0.95, + backend_used="mock_ocr", + ) + + +@pytest.fixture(scope="function") +def ocr_service() -> Any: + """Create mock OCR service for testing.""" + return MockOCRService() + + +def test_pdf_ocr_basic(ocr_service: Any) -> None: + """Test PDF OCR extraction with context preservation and position validation.""" + converter = PdfConverterWithOCR() + pdf_path = TEST_DATA_DIR / "pdf_complex_layout.pdf" + + if not pdf_path.exists(): + pytest.skip(f"Test file not found: {pdf_path}") + + with open(pdf_path, "rb") as f: + result = converter.convert( + f, StreamInfo(extension=".pdf"), ocr_service=ocr_service + ) + markdown = result.text_content + + # Validate structure and content + assert "## Page" in markdown, "Should have page marker" + assert "[Image OCR:" in markdown, "Should have image marker" + + # Validate expected OCR results with position tracking + filename = pdf_path.name + if filename in EXPECTED_OCR_RESULTS: + for img_pos in EXPECTED_OCR_RESULTS[filename]: + success, message = validate_image_position(markdown, 
@pytest.mark.skipif(_skip_docx, reason="docx dependencies not installed")
def test_docx_ocr_basic(ocr_service: Any) -> None:
    """Check the OCR output produced for the complex-layout DOCX fixture.

    Converts ``docx_complex_layout.docx`` with the supplied OCR service, then
    requires that ``validate_no_base64_images`` succeeds, that an
    ``[Image OCR:`` marker is present, that the first mock OCR string occurs
    at most once, and that every entry listed for this file in
    ``EXPECTED_OCR_RESULTS`` passes ``validate_image_position``. The test is
    skipped when the fixture file does not exist.
    """
    docx_converter = DocxConverterWithOCR()
    source_path = TEST_DATA_DIR / "docx_complex_layout.docx"
    if not source_path.exists():
        pytest.skip(f"Test file not found: {source_path}")

    with open(source_path, "rb") as handle:
        conversion = docx_converter.convert(
            handle, StreamInfo(extension=".docx"), ocr_service=ocr_service
        )
        markdown = conversion.text_content

    # Base64 check comes first so its message is printed before the others.
    no_base64, detail = validate_no_base64_images(markdown)
    assert no_base64, detail
    print(f" ✓ {detail}")

    # At least one OCR block must have been emitted.
    assert "[Image OCR:" in markdown, "Should have OCR markers"

    # The same OCR text must not be emitted more than once.
    if "WARNING" in markdown:
        occurrences = markdown.count("WARNING: Security Alert")
        assert not (
            occurrences > 1
        ), f"OCR text should not be duplicated (found {occurrences} times)"

    # Position checks only apply when expectations exist for this fixture.
    fixture_name = source_path.name
    if fixture_name not in EXPECTED_OCR_RESULTS:
        return
    for expected_position in EXPECTED_OCR_RESULTS[fixture_name]:
        placed, report = validate_image_position(
            markdown, expected_position, verbose=True
        )
        assert placed, f"Position validation failed: {report}"
        print(f" [PASS] {report}")
@pytest.mark.skipif(_skip_xlsx, reason="xlsx dependencies not installed")
def test_xlsx_ocr_cell_references(ocr_service: Any) -> None:
    """Check that XLSX OCR output carries cell references and sheet names.

    Converts ``xlsx_image_start.xlsx`` and requires the text
    ``Image near cell`` plus at least one of the sheet names ``Sales Q1`` or
    ``Forecast`` in the result. Skipped when the fixture file is missing.
    """
    xlsx_converter = XlsxConverterWithOCR()
    workbook_path = TEST_DATA_DIR / "xlsx_image_start.xlsx"
    if not workbook_path.exists():
        pytest.skip(f"Test file not found: {workbook_path}")

    with open(workbook_path, "rb") as handle:
        conversion = xlsx_converter.convert(
            handle, StreamInfo(extension=".xlsx"), ocr_service=ocr_service
        )
        markdown = conversion.text_content

    # Each OCR'd image should be tagged with the cell it sits near.
    assert "Image near cell" in markdown, "Should have cell reference tracking"

    # Either named sheet is enough to show that named sheets were processed.
    named_sheet_seen = any(
        sheet_name in markdown for sheet_name in ("Sales Q1", "Forecast")
    )
    assert named_sheet_seen, "Should process named sheets"
def test_ocr_service_fallback(ocr_service: Any) -> None:
    """Check that the OCR service returns a result object for a simple image.

    Renders the words ``Test Text`` onto a white 400x100 PNG held in memory,
    passes it to ``ocr_service.extract_text`` and verifies the result is not
    ``None`` and exposes ``text`` and ``backend_used`` attributes.
    """
    import io

    from PIL import Image, ImageDraw

    # Build the test picture entirely in memory.
    canvas = Image.new("RGB", (400, 100), color="white")
    ImageDraw.Draw(canvas).text((10, 30), "Test Text", fill="black")

    png_buffer = io.BytesIO()
    canvas.save(png_buffer, format="PNG")
    png_buffer.seek(0)  # rewind so the service reads from the first byte

    result = ocr_service.extract_text(png_buffer)

    # Should either succeed or fail gracefully
    assert result is not None, "Should return result object"
    required_attributes = (
        ("text", "Result should have text attribute"),
        ("backend_used", "Result should have backend_used attribute"),
    )
    for attribute, failure_message in required_attributes:
        assert hasattr(result, attribute), failure_message
str) -> None: + """ + Comprehensive test validating OCR text extraction and positioning for all test files. + + This test: + 1. Validates expected OCR text is extracted + 2. Validates image positioning relative to surrounding text + 3. For DOCX: validates no base64 images in output + 4. Compares extracted text against expected ground truth + """ + file_path = TEST_DATA_DIR / filename + + if not file_path.exists(): + pytest.skip(f"Test file not found: {file_path}") + + if filename not in EXPECTED_OCR_RESULTS: + pytest.skip(f"No expected results defined for {filename}") + + # Determine converter based on extension + converter: Any + if filename.endswith(".pdf"): + converter = PdfConverterWithOCR() + extension = ".pdf" + elif filename.endswith(".docx"): + if _skip_docx: + pytest.skip("docx dependencies not installed") + converter = DocxConverterWithOCR() + extension = ".docx" + else: + pytest.skip(f"Unsupported file type for {filename}") + + # Convert document + print(f"\n{'='*60}") + print(f"Testing: {filename}") + print(f"{'='*60}") + + with open(file_path, "rb") as f: + result = converter.convert( + f, StreamInfo(extension=extension), ocr_service=ocr_service + ) + markdown = result.text_content + + if not markdown or not markdown.strip() or "Error:" in markdown: + pytest.skip(f"Could not extract content from {filename} (possibly corrupt)") + + # For DOCX files, validate no base64 images + if filename.endswith(".docx"): + success, message = validate_no_base64_images(markdown) + assert success, f"Base64 validation failed for {filename}: {message}" + print(f" [PASS] Base64 check: {message}") + + # Validate all expected image positions + expected_images = EXPECTED_OCR_RESULTS[filename] + print(f" Validating {len(expected_images)} image(s)...") + + for idx, img_pos in enumerate(expected_images, 1): + success, message = validate_image_position(markdown, img_pos, verbose=True) + assert success, f"Image {idx} validation failed for {filename}: {message}" + print(f" [PASS] 
def test_pdf_scanned_fallback(ocr_service: Any) -> None:
    """
    Check the output produced for a scanned PDF (no extractable text).

    Converts ``pdf_scanned.pdf`` and requires that the result is non-empty
    after stripping whitespace and contains both a ``## Page`` marker and an
    ``OCR:`` indicator. Skipped when the fixture file is not available.
    """
    pdf_converter = PdfConverterWithOCR()

    scanned_path = TEST_DATA_DIR / "pdf_scanned.pdf"
    if not scanned_path.exists():
        pytest.skip(f"Scanned PDF test file not found: {scanned_path}")

    with open(scanned_path, "rb") as handle:
        conversion = pdf_converter.convert(
            handle, StreamInfo(extension=".pdf"), ocr_service=ocr_service
        )
        markdown = conversion.text_content

    # Some text must have come back, and it must not be whitespace only.
    assert markdown, "Should extract text from scanned PDF via OCR fallback"
    stripped_length = len(markdown.strip())
    assert stripped_length > 0, "Extracted text should not be empty/whitespace"

    # Structural markers expected in the converted output.
    required_markers = (
        ("## Page", "Should have page structure markers"),
        ("OCR:", "Should indicate OCR backend was used"),
    )
    for marker, failure_message in required_markers:
        assert marker in markdown, failure_message

    print(f" [PASS] Scanned PDF fallback extracted {len(markdown)} characters")
def test_pdf_empty_result_detection() -> None:
    """
    Check the emptiness predicate against empty, blank and non-blank strings.

    Each case pairs an input string with the expected outcome of the
    condition ``not text or not text.strip()`` and a description that is
    printed once the case passes.
    """
    # (input text, expected predicate value, description)
    scenarios = (
        ("", True, "Empty string should trigger fallback"),
        (" ", True, "Whitespace-only should trigger fallback"),
        ("\n\n\n", True, "Newlines-only should trigger fallback"),
        (" \t \n ", True, "Mixed whitespace should trigger fallback"),
        ("Some text", False, "Non-empty text should not trigger fallback"),
    )

    for candidate, expected, label in scenarios:
        # De Morgan form of ``not candidate or not candidate.strip()``.
        is_blank = not (candidate and candidate.strip())
        assert is_blank == expected, f"Failed: {label}"
        print(f" [PASS] {label}")
def test_pdf_scanned_meeting_minutes(ocr_service: Any) -> None:
    """Check the output produced for the scanned meeting-minutes PDF.

    Converts ``pdf_scanned_meeting_minutes.pdf`` and requires more than 50
    non-whitespace-trimmed characters plus the ``## Page``, ``Image OCR:``
    and ``WARNING`` markers. Skipped when the fixture file is missing.
    """
    pdf_converter = PdfConverterWithOCR()
    minutes_path = TEST_DATA_DIR / "pdf_scanned_meeting_minutes.pdf"
    if not minutes_path.exists():
        pytest.skip(f"Scanned meeting minutes test file not found: {minutes_path}")

    with open(minutes_path, "rb") as handle:
        conversion = pdf_converter.convert(
            handle, StreamInfo(extension=".pdf"), ocr_service=ocr_service
        )
        markdown = conversion.text_content

    # Extraction must yield a meaningful amount of text.
    assert markdown, "Should extract text from scanned meeting minutes"
    stripped_length = len(markdown.strip())
    assert stripped_length > 50, "Should extract text content"

    # The OCR service here is a mock, so only its canned output and the
    # structural markers are checked, not real document content.
    required_markers = (
        ("## Page", "Should have page structure markers"),
        ("Image OCR:", "Should indicate OCR was used"),
        ("WARNING", "Should contain mock OCR output"),
    )
    for marker, failure_message in required_markers:
        assert marker in markdown, failure_message

    print(f" [PASS] Scanned meeting minutes OCR extracted {len(markdown)} characters")
def test_pdf_scanned_minimal(ocr_service: Any) -> None:
    """Check the output produced for the minimal scanned PDF (edge case).

    Converts ``pdf_scanned_minimal.pdf`` and requires more than 10 characters
    after stripping, plus either ``WARNING`` or ``Security Alert`` in the
    result. Skipped when the fixture file is missing.
    """
    pdf_converter = PdfConverterWithOCR()
    minimal_path = TEST_DATA_DIR / "pdf_scanned_minimal.pdf"
    if not minimal_path.exists():
        pytest.skip(f"Scanned minimal test file not found: {minimal_path}")

    with open(minimal_path, "rb") as handle:
        conversion = pdf_converter.convert(
            handle, StreamInfo(extension=".pdf"), ocr_service=ocr_service
        )
        markdown = conversion.text_content

    # Even a minimal document must yield some text.
    assert markdown, "Should extract text from minimal scanned document"
    stripped_length = len(markdown.strip())
    assert stripped_length > 10, "Should extract some text content"

    # Either fragment of the mock OCR string is accepted.
    mock_output_seen = any(
        fragment in markdown for fragment in ("WARNING", "Security Alert")
    )
    assert mock_output_seen, "Should contain mock OCR output"

    print(f" [PASS] Minimal scanned document OCR extracted {len(markdown)} characters")
["WARNING", "Security"], + 50, + ), + ( + "pdf_scanned_sales_report.pdf", + ["WARNING", "Security"], + 50, + ), + ( + "pdf_scanned_minimal.pdf", + ["WARNING", "Security"], + 10, + ), + ], +) +def test_comprehensive_scanned_pdf_ocr( + ocr_service: Any, filename: str, expected_terms: list[str], min_length: int +) -> None: + """ + Comprehensive parametrized test for all scanned PDF files. + + Validates that: + 1. OCR fallback is triggered (no extractable text in these PDFs) + 2. Full-page OCR successfully extracts text + 3. Key terms from the document are present in the output + 4. Minimum text length is met (validates substantial extraction) + """ + converter = PdfConverterWithOCR() + pdf_path = TEST_DATA_DIR / filename + + if not pdf_path.exists(): + pytest.skip(f"Test file not found: {pdf_path}") + + print(f"\n{'='*60}") + print(f"Testing scanned PDF: {filename}") + print(f"{'='*60}") + + with open(pdf_path, "rb") as f: + result = converter.convert( + f, StreamInfo(extension=".pdf"), ocr_service=ocr_service + ) + markdown = result.text_content + + # Validate extraction occurred + assert markdown, f"Should extract text from {filename}" + assert ( + len(markdown.strip()) >= min_length + ), f"Should extract at least {min_length} characters (got {len(markdown.strip())})" + + print(f" [PASS] Extracted {len(markdown)} characters") + + # Validate key terms present + found_terms = [] + missing_terms = [] + + for term in expected_terms: + # Case-insensitive search + if term.lower() in markdown.lower(): + found_terms.append(term) + else: + missing_terms.append(term) + + # Require at least 60% of terms to be found (OCR isn't perfect) + success_rate = len(found_terms) / len(expected_terms) + assert ( + success_rate >= 0.6 + ), f"Should extract at least 60% of key terms. 
def validate_ocr_format(text: str, expected_count: int) -> list[dict[str, str]]:
    """
    Extract every standard-format OCR block from ``text``.

    Args:
        text: Text to scan with ``OCR_FORMAT_PATTERN``
        expected_count: Number of OCR blocks the text must contain

    Returns:
        One dict per block, in order of appearance, with the keys
        'identifier' and 'content' (content has surrounding whitespace
        stripped)

    Raises:
        AssertionError: If the number of blocks found differs from
            ``expected_count``
    """
    # The pattern has two capture groups, so findall yields 2-tuples.
    found_blocks = OCR_FORMAT_PATTERN.findall(text)
    block_total = len(found_blocks)

    assert (
        block_total == expected_count
    ), f"Expected {expected_count} OCR blocks, found {block_total}. Text:\n{text}"

    return [
        {"identifier": identifier, "content": content.strip()}
        for identifier, content in found_blocks
    ]
def test_ocr_format_pattern_extraction() -> None:
    """Check that the OCR pattern captures the identifier and a multi-line body."""
    sample = "\n".join(
        [
            "[Image OCR: slide_3_img_Chart_1]",
            "Multi-line",
            "OCR content",
            "with newlines",
            "[End Image OCR]",
        ]
    )

    found = OCR_FORMAT_PATTERN.search(sample)
    assert found is not None

    # Group 1 is the header identifier, group 2 the OCR text between markers.
    identifier = found.group(1)
    content = found.group(2)

    assert identifier == "slide_3_img_Chart_1"
    for fragment in ("Multi-line", "newlines"):
        assert fragment in content