Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,67 @@ result = md.convert("example.jpg")
print(result.text_content)
```

To extract text from images embedded in documents using OCR with LLM Vision:

```python
from markitdown.converters._ocr_service import MultiBackendOCRService, OCRBackend
from markitdown.converters._pdf_converter_with_ocr import PdfConverterWithOCR
from openai import OpenAI

# Create OCR service with LLM Vision backend
client = OpenAI()
ocr_service = MultiBackendOCRService(
backends=[OCRBackend.LLM_VISION],
llm_client=client,
llm_model="gpt-4o"
)

# Convert PDF with LLM-based OCR
converter = PdfConverterWithOCR()
with open("document.pdf", "rb") as f:
result = converter.convert(f, ocr_service=ocr_service)
print(result.text_content)
```

OCR converters are available for PDF, DOCX, XLSX (multi-sheet), and PPTX formats. Images are extracted with context preservation (page numbers, cell references, relationship IDs).

#### Scanned PDF Support

MarkItDown automatically detects scanned PDFs (documents with no extractable text) and falls back to full-page OCR. When text extraction from a PDF returns an empty or whitespace-only result, the converter:

1. Renders each page as a high-resolution image (300 DPI)
2. Performs OCR on the full page image using LLM Vision
3. Preserves page structure with page markers
4. Indicates which OCR backend was used

```python
from markitdown.converters._ocr_service import MultiBackendOCRService, OCRBackend
from markitdown.converters._pdf_converter_with_ocr import PdfConverterWithOCR
from openai import OpenAI

# Create OCR service with LLM Vision
client = OpenAI()
ocr_service = MultiBackendOCRService(
backends=[OCRBackend.LLM_VISION],
llm_client=client,
llm_model="gpt-4o"
)

# Convert scanned PDF - fallback is automatic
converter = PdfConverterWithOCR()
with open("scanned_invoice.pdf", "rb") as f:
result = converter.convert(f, ocr_service=ocr_service)
print(result.text_content)
```

The fallback triggers automatically when:

- PDF has no extractable text (truly scanned documents)
- Text extraction returns only whitespace
- No embedded text is found via pdfminer or pdfplumber

No additional configuration is needed - just provide an OCR service and the converter handles the rest.

### Docker

```sh
Expand Down
2 changes: 1 addition & 1 deletion packages/markitdown/src/markitdown/__about__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.1.5b2"
__version__ = "0.1.5b3"
39 changes: 34 additions & 5 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@
EpubConverter,
DocumentIntelligenceConverter,
CsvConverter,
PdfConverterWithOCR,
DocxConverterWithOCR,
XlsxConverterWithOCR,
PptxConverterWithOCR,
MultiBackendOCRService,
OCRBackend,
)

from ._base_converter import DocumentConverter, DocumentConverterResult
Expand All @@ -49,7 +55,6 @@
FailedConversionAttempt,
)


# Lower priority values are tried first.
PRIORITY_SPECIFIC_FILE_FORMAT = (
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
Expand Down Expand Up @@ -191,14 +196,25 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(WikipediaConverter())
self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter())
self.register_converter(DocxConverter())
self.register_converter(XlsxConverter())

# Register OCR-enabled converters if LLM client is available, otherwise use standard converters
if self._llm_client is not None and self._llm_model is not None:
# Use OCR-enabled converters for documents with embedded images
self.register_converter(DocxConverterWithOCR())
self.register_converter(XlsxConverterWithOCR())
self.register_converter(PptxConverterWithOCR())
self.register_converter(PdfConverterWithOCR())
else:
# Use standard converters without OCR
self.register_converter(DocxConverter())
self.register_converter(XlsxConverter())
self.register_converter(PptxConverter())
self.register_converter(PdfConverter())

self.register_converter(XlsConverter())
self.register_converter(PptxConverter())
self.register_converter(AudioConverter())
self.register_converter(ImageConverter())
self.register_converter(IpynbConverter())
self.register_converter(PdfConverter())
self.register_converter(OutlookMsgConverter())
self.register_converter(EpubConverter())
self.register_converter(CsvConverter())
Expand Down Expand Up @@ -571,6 +587,19 @@ def _convert(
if "llm_prompt" not in _kwargs and self._llm_prompt is not None:
_kwargs["llm_prompt"] = self._llm_prompt

# Auto-create OCR service if llm_client is available and not already provided
if "ocr_service" not in _kwargs:
llm_client = _kwargs.get("llm_client", self._llm_client)
llm_model = _kwargs.get("llm_model", self._llm_model)
llm_prompt = _kwargs.get("llm_prompt", self._llm_prompt)
if llm_client is not None and llm_model is not None:
_kwargs["ocr_service"] = MultiBackendOCRService(
backends=[OCRBackend.LLM_VISION],
llm_client=llm_client,
llm_model=llm_model,
llm_prompt=llm_prompt,
)

if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map

Expand Down
3 changes: 1 addition & 2 deletions packages/markitdown/src/markitdown/_uri_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import base64
import os
from typing import Tuple, Dict
from urllib.request import url2pathname
from urllib.parse import urlparse, unquote_to_bytes
Expand All @@ -12,7 +11,7 @@ def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
raise ValueError(f"Not a file URL: {file_uri}")

netloc = parsed.netloc if parsed.netloc else None
path = os.path.abspath(url2pathname(parsed.path))
path = url2pathname(parsed.path)
return netloc, path


Expand Down
12 changes: 12 additions & 0 deletions packages/markitdown/src/markitdown/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@
)
from ._epub_converter import EpubConverter
from ._csv_converter import CsvConverter
from ._pdf_converter_with_ocr import PdfConverterWithOCR
from ._docx_converter_with_ocr import DocxConverterWithOCR
from ._xlsx_converter_with_ocr import XlsxConverterWithOCR
from ._pptx_converter_with_ocr import PptxConverterWithOCR
from ._ocr_service import MultiBackendOCRService, OCRBackend, OCRResult

__all__ = [
"PlainTextConverter",
Expand All @@ -45,4 +50,11 @@
"DocumentIntelligenceFileType",
"EpubConverter",
"CsvConverter",
"PdfConverterWithOCR",
"DocxConverterWithOCR",
"XlsxConverterWithOCR",
"PptxConverterWithOCR",
"MultiBackendOCRService",
"OCRBackend",
"OCRResult",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
"""
Enhanced DOCX Converter with OCR support for embedded images.
Extracts images from Word documents and performs OCR while maintaining context.
"""

import io
import re
import sys
from html import escape
from typing import Any, BinaryIO, Optional

from .._base_converter import DocumentConverterResult
from .._exceptions import MISSING_DEPENDENCY_MESSAGE, MissingDependencyException
from .._stream_info import StreamInfo
from ..converter_utils.docx.pre_process import pre_process_docx
from ._html_converter import HtmlConverter
from ._ocr_service import MultiBackendOCRService

# Try loading the optional third-party dependencies (mammoth for DOCX->HTML,
# python-docx for reading embedded image parts). If either import fails, keep
# the exc_info so convert() can raise a helpful MissingDependencyException at
# call time instead of failing when this module is imported.
_dependency_exc_info = None
try:
    import mammoth
    from docx import Document
except ImportError:
    _dependency_exc_info = sys.exc_info()


class DocxConverterWithOCR(HtmlConverter):
    """
    Enhanced DOCX Converter with OCR support for embedded images.

    When an ``ocr_service`` is supplied via ``kwargs``, images embedded in the
    Word document are extracted, OCR'd, and their recognized text is injected
    inline into the converted output in place of the images, so document flow
    is preserved.  Without an OCR service this behaves like a plain
    mammoth-based DOCX conversion.
    """

    def __init__(self):
        super().__init__()
        # Dedicated HtmlConverter instance used to turn mammoth's HTML output
        # into the final markdown result.
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Accept .docx files by extension or wordprocessingml mimetype."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension == ".docx":
            return True

        return mimetype.startswith(
            "application/vnd.openxmlformats-officedocument.wordprocessingml"
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """
        Convert a DOCX stream to markdown, OCR-ing embedded images when an
        ``ocr_service`` kwarg is provided.

        Raises:
            MissingDependencyException: if mammoth/python-docx are not installed.
        """
        # Re-raise the deferred import failure with a helpful message.
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".docx",
                    feature="docx",
                )
            ) from _dependency_exc_info[1].with_traceback(
                _dependency_exc_info[2]
            )  # type: ignore[union-attr]

        ocr_service: Optional[MultiBackendOCRService] = kwargs.get("ocr_service")

        if not ocr_service:
            # Standard conversion without OCR.
            pre_process_stream = pre_process_docx(file_stream)
            return self._html_converter.convert_string(
                mammoth.convert_to_html(
                    pre_process_stream, style_map=kwargs.get("style_map", None)
                ).value,
                **kwargs,
            )

        # Extract and OCR images before mammoth processing.
        file_stream.seek(0)
        image_ocr_map = self._extract_and_ocr_images(file_stream, ocr_service)

        # Process with mammoth (rewind first; the OCR pass consumed the stream).
        file_stream.seek(0)
        pre_process_stream = pre_process_docx(file_stream)
        html_result = mammoth.convert_to_html(
            pre_process_stream, style_map=kwargs.get("style_map")
        ).value

        # Inject OCR results into the HTML, then convert to markdown.
        html_with_ocr = self._inject_ocr_into_html(html_result, image_ocr_map)
        return self._html_converter.convert_string(html_with_ocr, **kwargs)

    def _extract_and_ocr_images(
        self, file_stream: BinaryIO, ocr_service: MultiBackendOCRService
    ) -> dict[str, str]:
        """
        Extract images from the DOCX package and OCR each one.

        Extraction is best-effort: any image that cannot be read or OCR'd is
        silently skipped so a single bad image does not abort the conversion.

        Args:
            file_stream: DOCX file stream.
            ocr_service: OCR service to use.

        Returns:
            Dict mapping image relationship IDs to formatted OCR text blocks
            (only images that produced non-empty text are included).
        """
        ocr_map: dict[str, str] = {}

        try:
            file_stream.seek(0)
            doc = Document(file_stream)

            # Walk the document-part relationships; image parts carry the blob.
            for rel in doc.part.rels.values():
                if "image" not in rel.target_ref.lower():
                    continue
                try:
                    image_bytes = rel.target_part.blob
                    ocr_result = ocr_service.extract_text(io.BytesIO(image_bytes))

                    if ocr_result.text.strip():
                        # Wrap the text in explicit markers so readers can see
                        # where OCR content begins and ends in the output.
                        ocr_map[rel.rId] = (
                            f"\n[Image OCR: {rel.rId}]\n"
                            f"{ocr_result.text}\n[End Image OCR]\n"
                        )
                except Exception:
                    # Best-effort: skip images that fail to extract or OCR.
                    continue

        except Exception:
            # Best-effort: if the package cannot be parsed at all, return what
            # we have (possibly nothing) and let the text conversion proceed.
            pass

        return ocr_map

    def _inject_ocr_into_html(self, html: str, ocr_map: dict[str, str]) -> str:
        """
        Replace ``<img>`` tags with OCR text inline (dropping base64 images).

        NOTE(review): pairing is positional — the n-th ``<img>`` tag receives
        the n-th OCR text in ``ocr_map`` insertion order, because mammoth's
        HTML does not carry relationship IDs. Confirm ordering if exact
        image/text placement matters.

        Args:
            html: HTML content produced by mammoth.
            ocr_map: Map of image relationship IDs to OCR text blocks.

        Returns:
            HTML with images replaced by (HTML-escaped) OCR text; OCR texts
            for images not present in the HTML are appended at the end.
        """
        if not ocr_map:
            return html

        # Hand out OCR texts one at a time, in insertion order.
        ocr_iter = iter(ocr_map.values())

        def replace_img(_match: re.Match) -> str:
            text = next(ocr_iter, None)
            if text is None:
                # More images than OCR results: drop the image entirely.
                return ""
            # Escape so OCR output cannot be misinterpreted as HTML markup.
            return f"<p><em>{escape(text)}</em></p>"

        # Replace ALL img tags (including base64 ones) with OCR text.
        result = re.sub(r"<img[^>]*>", replace_img, html)

        # Append OCR texts for images that never appeared in the HTML so no
        # recognized text is lost.
        leftover = list(ocr_iter)
        if leftover:
            result += f"<p><em>{escape(''.join(leftover))}</em></p>"

        return result
Loading