Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,67 @@ result = md.convert("example.jpg")
print(result.text_content)
```

To extract text from images embedded in documents using OCR with LLM Vision:

```python
from markitdown.converters._ocr_service import MultiBackendOCRService, OCRBackend
from markitdown.converters._pdf_converter_with_ocr import PdfConverterWithOCR
from openai import OpenAI

# Create OCR service with LLM Vision backend
client = OpenAI()
ocr_service = MultiBackendOCRService(
backends=[OCRBackend.LLM_VISION],
llm_client=client,
llm_model="gpt-4o"
)

# Convert PDF with LLM-based OCR
converter = PdfConverterWithOCR()
with open("document.pdf", "rb") as f:
result = converter.convert(f, ocr_service=ocr_service)
print(result.text_content)
```

OCR converters are available for PDF, DOCX, XLSX (multi-sheet), and PPTX formats. Images are extracted with context preservation (page numbers, cell references, relationship IDs).

#### Scanned PDF Support

MarkItDown automatically detects scanned PDFs (documents with no extractable text) and falls back to full-page OCR. When text extraction from a PDF returns an empty or whitespace-only result, the converter:

1. Renders each page as a high-resolution image (300 DPI)
2. Performs OCR on the full page image using LLM Vision
3. Preserves page structure with page markers
4. Indicates which OCR backend was used

```python
from markitdown.converters._ocr_service import MultiBackendOCRService, OCRBackend
from markitdown.converters._pdf_converter_with_ocr import PdfConverterWithOCR
from openai import OpenAI

# Create OCR service with LLM Vision
client = OpenAI()
ocr_service = MultiBackendOCRService(
backends=[OCRBackend.LLM_VISION],
llm_client=client,
llm_model="gpt-4o"
)

# Convert scanned PDF - fallback is automatic
converter = PdfConverterWithOCR()
with open("scanned_invoice.pdf", "rb") as f:
result = converter.convert(f, ocr_service=ocr_service)
print(result.text_content)
```

The fallback triggers automatically when:

- PDF has no extractable text (truly scanned documents)
- Text extraction returns only whitespace
- No embedded text is found via pdfminer or pdfplumber

No additional configuration is needed - just provide an OCR service and the converter handles the rest.

### Docker

```sh
Expand Down
2 changes: 1 addition & 1 deletion packages/markitdown/src/markitdown/__about__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.1.5b2"
__version__ = "0.1.5b3"
39 changes: 34 additions & 5 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@
EpubConverter,
DocumentIntelligenceConverter,
CsvConverter,
PdfConverterWithOCR,
DocxConverterWithOCR,
XlsxConverterWithOCR,
PptxConverterWithOCR,
MultiBackendOCRService,
OCRBackend,
)

from ._base_converter import DocumentConverter, DocumentConverterResult
Expand All @@ -49,7 +55,6 @@
FailedConversionAttempt,
)


# Lower priority values are tried first.
PRIORITY_SPECIFIC_FILE_FORMAT = (
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
Expand Down Expand Up @@ -191,14 +196,25 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(WikipediaConverter())
self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter())
self.register_converter(DocxConverter())
self.register_converter(XlsxConverter())

# Register OCR-enabled converters if LLM client is available, otherwise use standard converters
if self._llm_client is not None and self._llm_model is not None:
# Use OCR-enabled converters for documents with embedded images
self.register_converter(DocxConverterWithOCR())
self.register_converter(XlsxConverterWithOCR())
self.register_converter(PptxConverterWithOCR())
self.register_converter(PdfConverterWithOCR())
else:
# Use standard converters without OCR
self.register_converter(DocxConverter())
self.register_converter(XlsxConverter())
self.register_converter(PptxConverter())
self.register_converter(PdfConverter())

self.register_converter(XlsConverter())
self.register_converter(PptxConverter())
self.register_converter(AudioConverter())
self.register_converter(ImageConverter())
self.register_converter(IpynbConverter())
self.register_converter(PdfConverter())
self.register_converter(OutlookMsgConverter())
self.register_converter(EpubConverter())
self.register_converter(CsvConverter())
Expand Down Expand Up @@ -571,6 +587,19 @@ def _convert(
if "llm_prompt" not in _kwargs and self._llm_prompt is not None:
_kwargs["llm_prompt"] = self._llm_prompt

# Auto-create OCR service if llm_client is available and not already provided
if "ocr_service" not in _kwargs:
llm_client = _kwargs.get("llm_client", self._llm_client)
llm_model = _kwargs.get("llm_model", self._llm_model)
llm_prompt = _kwargs.get("llm_prompt", self._llm_prompt)
if llm_client is not None and llm_model is not None:
_kwargs["ocr_service"] = MultiBackendOCRService(
backends=[OCRBackend.LLM_VISION],
llm_client=llm_client,
llm_model=llm_model,
llm_prompt=llm_prompt,
)

if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map

Expand Down
3 changes: 1 addition & 2 deletions packages/markitdown/src/markitdown/_uri_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import base64
import os
from typing import Tuple, Dict
from urllib.request import url2pathname
from urllib.parse import urlparse, unquote_to_bytes
Expand All @@ -12,7 +11,7 @@ def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
raise ValueError(f"Not a file URL: {file_uri}")

netloc = parsed.netloc if parsed.netloc else None
path = os.path.abspath(url2pathname(parsed.path))
path = url2pathname(parsed.path)
return netloc, path


Expand Down
12 changes: 12 additions & 0 deletions packages/markitdown/src/markitdown/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@
)
from ._epub_converter import EpubConverter
from ._csv_converter import CsvConverter
from ._pdf_converter_with_ocr import PdfConverterWithOCR
from ._docx_converter_with_ocr import DocxConverterWithOCR
from ._xlsx_converter_with_ocr import XlsxConverterWithOCR
from ._pptx_converter_with_ocr import PptxConverterWithOCR
from ._ocr_service import MultiBackendOCRService, OCRBackend, OCRResult

__all__ = [
"PlainTextConverter",
Expand All @@ -45,4 +50,11 @@
"DocumentIntelligenceFileType",
"EpubConverter",
"CsvConverter",
"PdfConverterWithOCR",
"DocxConverterWithOCR",
"XlsxConverterWithOCR",
"PptxConverterWithOCR",
"MultiBackendOCRService",
"OCRBackend",
"OCRResult",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
"""
Enhanced DOCX Converter with OCR support for embedded images.
Extracts images from Word documents and performs OCR while maintaining context.
"""

import io
import re
import sys
from html import escape
from typing import Any, BinaryIO, Optional

from .._base_converter import DocumentConverterResult
from .._exceptions import MISSING_DEPENDENCY_MESSAGE, MissingDependencyException
from .._stream_info import StreamInfo
from ..converter_utils.docx.pre_process import pre_process_docx
from ._html_converter import HtmlConverter
from ._ocr_service import MultiBackendOCRService

# Try loading the optional third-party dependencies (mammoth for DOCX->HTML,
# python-docx for reading embedded image parts). If either import fails, keep
# the exc_info so convert() can raise a helpful MissingDependencyException at
# call time instead of failing when this module is imported.
_dependency_exc_info = None
try:
    import mammoth
    from docx import Document
except ImportError:
    _dependency_exc_info = sys.exc_info()


class DocxConverterWithOCR(HtmlConverter):
    """
    Enhanced DOCX Converter with OCR support for embedded images.

    When an ``ocr_service`` is supplied via ``kwargs``, images embedded in the
    Word document are extracted, OCR'd, and their recognized text is injected
    inline into the converted output in place of the images, so document flow
    is preserved.  Without an OCR service this behaves like a plain
    mammoth-based DOCX conversion.
    """

    def __init__(self):
        super().__init__()
        # Dedicated HtmlConverter instance used to turn mammoth's HTML output
        # into the final markdown result.
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Accept .docx files by extension or wordprocessingml mimetype."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension == ".docx":
            return True

        return mimetype.startswith(
            "application/vnd.openxmlformats-officedocument.wordprocessingml"
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """
        Convert a DOCX stream to markdown, OCR-ing embedded images when an
        ``ocr_service`` kwarg is provided.

        Raises:
            MissingDependencyException: if mammoth/python-docx are not installed.
        """
        # Re-raise the deferred import failure with a helpful message.
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".docx",
                    feature="docx",
                )
            ) from _dependency_exc_info[1].with_traceback(
                _dependency_exc_info[2]
            )  # type: ignore[union-attr]

        ocr_service: Optional[MultiBackendOCRService] = kwargs.get("ocr_service")

        if not ocr_service:
            # Standard conversion without OCR.
            pre_process_stream = pre_process_docx(file_stream)
            return self._html_converter.convert_string(
                mammoth.convert_to_html(
                    pre_process_stream, style_map=kwargs.get("style_map", None)
                ).value,
                **kwargs,
            )

        # Extract and OCR images before mammoth processing.
        file_stream.seek(0)
        image_ocr_map = self._extract_and_ocr_images(file_stream, ocr_service)

        # Process with mammoth (rewind first; the OCR pass consumed the stream).
        file_stream.seek(0)
        pre_process_stream = pre_process_docx(file_stream)
        html_result = mammoth.convert_to_html(
            pre_process_stream, style_map=kwargs.get("style_map")
        ).value

        # Inject OCR results into the HTML, then convert to markdown.
        html_with_ocr = self._inject_ocr_into_html(html_result, image_ocr_map)
        return self._html_converter.convert_string(html_with_ocr, **kwargs)

    def _extract_and_ocr_images(
        self, file_stream: BinaryIO, ocr_service: MultiBackendOCRService
    ) -> dict[str, str]:
        """
        Extract images from the DOCX package and OCR each one.

        Extraction is best-effort: any image that cannot be read or OCR'd is
        silently skipped so a single bad image does not abort the conversion.

        Args:
            file_stream: DOCX file stream.
            ocr_service: OCR service to use.

        Returns:
            Dict mapping image relationship IDs to formatted OCR text blocks
            (only images that produced non-empty text are included).
        """
        ocr_map: dict[str, str] = {}

        try:
            file_stream.seek(0)
            doc = Document(file_stream)

            # Walk the document-part relationships; image parts carry the blob.
            for rel in doc.part.rels.values():
                if "image" not in rel.target_ref.lower():
                    continue
                try:
                    image_bytes = rel.target_part.blob
                    ocr_result = ocr_service.extract_text(io.BytesIO(image_bytes))

                    if ocr_result.text.strip():
                        # Wrap the text in explicit markers so readers can see
                        # where OCR content begins and ends in the output.
                        ocr_map[rel.rId] = (
                            f"\n[Image OCR: {rel.rId}]\n"
                            f"{ocr_result.text}\n[End Image OCR]\n"
                        )
                except Exception:
                    # Best-effort: skip images that fail to extract or OCR.
                    continue

        except Exception:
            # Best-effort: if the package cannot be parsed at all, return what
            # we have (possibly nothing) and let the text conversion proceed.
            pass

        return ocr_map

    def _inject_ocr_into_html(self, html: str, ocr_map: dict[str, str]) -> str:
        """
        Replace ``<img>`` tags with OCR text inline (dropping base64 images).

        NOTE(review): pairing is positional — the n-th ``<img>`` tag receives
        the n-th OCR text in ``ocr_map`` insertion order, because mammoth's
        HTML does not carry relationship IDs. Confirm ordering if exact
        image/text placement matters.

        Args:
            html: HTML content produced by mammoth.
            ocr_map: Map of image relationship IDs to OCR text blocks.

        Returns:
            HTML with images replaced by (HTML-escaped) OCR text; OCR texts
            for images not present in the HTML are appended at the end.
        """
        if not ocr_map:
            return html

        # Hand out OCR texts one at a time, in insertion order.
        ocr_iter = iter(ocr_map.values())

        def replace_img(_match: re.Match) -> str:
            text = next(ocr_iter, None)
            if text is None:
                # More images than OCR results: drop the image entirely.
                return ""
            # Escape so OCR output cannot be misinterpreted as HTML markup.
            return f"<p><em>{escape(text)}</em></p>"

        # Replace ALL img tags (including base64 ones) with OCR text.
        result = re.sub(r"<img[^>]*>", replace_img, html)

        # Append OCR texts for images that never appeared in the HTML so no
        # recognized text is lost.
        leftover = list(ocr_iter)
        if leftover:
            result += f"<p><em>{escape(''.join(leftover))}</em></p>"

        return result
Loading