diff --git a/CHANGELOG.md b/CHANGELOG.md index 29a2b68..04a6868 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- **MarkItDown engine adapter** — wraps Microsoft's [`markitdown`](https://github.com/microsoft/markitdown) pure-Python library that converts Office files, PDFs, HTML, images, CSV/JSON/XML, ePub, audio, and ZIP archives into LLM-friendly Markdown. Added to the `benchmark.py` harness alongside the other local engines. Install: `pip install docfold[markitdown]`. +- **Non-PDF benchmark fixtures** — `benchmark.py` now also generates synthetic DOCX (built with stdlib `zipfile` + minimal Office Open XML, no extra deps), HTML, and CSV documents, and filters engines per-doc by `supported_extensions` so PyMuPDF / OCR engines no longer log spurious errors on Office or web fixtures. - **OpenDataLoader PDF engine adapter** — wraps the Java-based [`opendataloader-pdf`](https://github.com/opendataloader-project/opendataloader-pdf) tool (via its bundled-JAR Python wheel). Local, deterministic extraction with typed structural elements (heading, paragraph, table, list, header, footer) and per-element bounding boxes. Install: `pip install docfold[opendataloader]` (also requires Java 11+). - **Multi-script benchmark coverage** — `benchmark.py` now generates Arabic (RTL + shaping), Hebrew (RTL, no shaping), and Simplified Chinese (CJK) synthetic PDFs alongside the existing English docs. Fonts are bundled under `tests/fixtures/fonts/` (OFL-1.1, subsetted where relevant) so the benchmark is reproducible without system font packages. 
diff --git a/README.md b/README.md index 7c2d8f9..c623586 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ Docfold is the open-source extraction engine from [Datatera.ai](https://datatera | [**Azure Doc Intel**](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence) | ✅ | SaaS | Paid | ★★★ | ★★★ | ★★★ | ✅ | ✅ | Fast | $$ | | [**Nougat**](https://github.com/facebookresearch/nougat) | ✅ | Local | MIT | ★★★ | ★★☆ | ★★☆ | — | — | Slow | Free | | [**Surya**](https://github.com/VikParuchuri/surya) | ✅ | Local | GPL | ★★☆ | ★★★ | ★★☆ | ✅ | ✅ | Medium | Free | +| [**MarkItDown**](https://github.com/microsoft/markitdown) | ✅ | Local | MIT | ★★☆ | ★☆☆ | ★★☆ | — | — | Fast | Free | **★★★** Excellent **★★☆** Good **★☆☆** Basic **☆☆☆** Not supported — **$$** ~$1-3/1K pages **$$$** ~$5-15/1K pages — **BBox** Bounding boxes — **Conf** Confidence scores @@ -108,6 +109,7 @@ for name, res in results.items(): | [**Azure Doc Intel**](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence) | SaaS | Paid | PDF, Office, HTML, images | N/A | `pip install docfold[azure-docint]` | | [**Nougat**](https://github.com/facebookresearch/nougat) | Local | MIT (code) | PDF | Recommended | `pip install docfold[nougat]` | | [**Surya**](https://github.com/VikParuchuri/surya) | Local | GPL-3.0 | PDF, images | Optional | `pip install docfold[surya]` | +| [**MarkItDown**](https://github.com/microsoft/markitdown) | Local | MIT | PDF, Office, HTML, images, CSV/JSON/XML, ePub, audio, ZIP | No | `pip install docfold[markitdown]` | > **Adding your own engine?** Implement the `DocumentEngine` interface — see [Adding a Custom Engine](#adding-a-custom-engine) below. diff --git a/benchmark.py b/benchmark.py index 287d5a9..e70427f 100644 --- a/benchmark.py +++ b/benchmark.py @@ -286,9 +286,133 @@ def generate_benchmark_documents(tmpdir: str) -> list[dict]: # round-trip extraction for those scripts (null bytes, dropped matras). 
# They need real-world fixture PDFs — see docs/tasks/ for a follow-up. + # --- Doc 8: DOCX (Office) --- + # Minimal valid Office Open XML built with stdlib only — no python-docx + # dependency. Exercises engines that handle Office formats (markitdown, + # docling, unstructured, liteparse, ...). + doc8_path = os.path.join(tmpdir, "office_memo.docx") + doc8_paragraphs = [ + "Internal Memo", + "To: All Staff", + "Date: April 25, 2026", + "Subject: Q1 2026 Results", + "Revenue grew 18 percent year-over-year, exceeding the plan.", + "Operating margin improved to 24.1 percent.", + ] + create_docx(doc8_path, doc8_paragraphs) + documents.append({ + "name": "office_memo", + "path": doc8_path, + "ground_truth": "\n".join(doc8_paragraphs), + "pages": 1, + "category": "office", + }) + + # --- Doc 9: HTML page --- + doc9_path = os.path.join(tmpdir, "blog_post.html") + doc9_paragraphs = [ + "How Document Processing Works", + "Document processing converts unstructured files into structured data.", + "Modern pipelines combine layout analysis, OCR, and language models.", + "Open-source toolkits make these capabilities widely accessible.", + ] + doc9_html = ( + "
{p}
" for p in doc9_paragraphs[1:]) + + "" + ) + with open(doc9_path, "w", encoding="utf-8") as f: + f.write(doc9_html) + documents.append({ + "name": "blog_post", + "path": doc9_path, + "ground_truth": "\n".join(doc9_paragraphs), + "pages": 1, + "category": "web", + }) + + # --- Doc 10: CSV (tabular) --- + # Engines that target Markdown output (markitdown, docling, ...) render + # CSV as a Markdown table. The ground truth is the canonical Markdown + # table so CER/WER measure formatting fidelity, not how cells are joined. + doc10_path = os.path.join(tmpdir, "sales.csv") + doc10_rows = [ + ["Region", "Q1", "Q2", "Q3", "Q4"], + ["North", "120", "135", "150", "180"], + ["South", "98", "110", "125", "140"], + ["East", "85", "92", "100", "118"], + ["West", "140", "155", "170", "200"], + ] + with open(doc10_path, "w", encoding="utf-8") as f: + for row in doc10_rows: + f.write(",".join(row) + "\n") + header = doc10_rows[0] + sep = ["---"] * len(header) + md_lines = ( + ["| " + " | ".join(header) + " |", "| " + " | ".join(sep) + " |"] + + ["| " + " | ".join(row) + " |" for row in doc10_rows[1:]] + ) + documents.append({ + "name": "sales_csv", + "path": doc10_path, + "ground_truth": "\n".join(md_lines), + "pages": 1, + "category": "tabular", + }) + return documents +def create_docx(path: str, paragraphs: list[str]) -> None: + """Build a minimal but valid .docx (Office Open XML) with no dependencies. + + Only enough structure to round-trip plain paragraphs through engines like + python-docx, docling, markitdown, unstructured, liteparse, ... + """ + import zipfile + + def _xml_escape(s: str) -> str: + return ( + s.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + ) + + body = "".join( + f'`; JSON: `{"markdown": "..."}`; TEXT: pass through).
+- Unicode / CJK / RTL documents — ensure the string is passed through without
+ encoding munging (the benchmark's Arabic/Hebrew/Chinese fixtures will
+ cover this in the E2E run).
+
+## Out of Scope
+
+- No plugin hooks for markitdown's extensibility system (custom converters).
+- No attempt to extract bounding boxes — markitdown doesn't produce them.
+- No YouTube enrollment in the router priority map. Audio (`mp3`, `wav`, `m4a`),
+  `zip`, and `json` are registered in `_EXTENSION_PRIORITY` with markitdown only.
+
+## Follow-up: non-PDF benchmark coverage
+
+The first round of `benchmark.py` only generated PDFs, which is the format
+where markitdown is *least* differentiated (PyMuPDF dominates on digital text
+PDFs). To actually exercise where markitdown shines, the harness now also
+produces:
+
+- A synthetic **DOCX** (built via stdlib `zipfile` + minimal Word XML — no
+ new runtime deps).
+- A synthetic **HTML** page with a heading, paragraphs, and a small table.
+- A synthetic **CSV** with a few rows.
+
+Engines are filtered per-doc by `supported_extensions`, so PyMuPDF / OCR
+engines simply don't run on Office / web / tabular fixtures (instead of
+spamming the report with errors).
diff --git a/pyproject.toml b/pyproject.toml
index 29af5f8..fb25214 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -108,6 +108,9 @@ surya = [
firecrawl = [
"firecrawl-py>=1.0",
]
+markitdown = [
+ "markitdown[all]>=0.0.1",
+]
evaluation = [
"jiwer>=3.0", # WER/CER computation
"numpy>=1.23",
@@ -116,7 +119,7 @@ evaluation = [
"psutil>=5.9", # Memory measurement
]
all = [
- "docfold[docling,mineru,marker,pymupdf,paddleocr,tesseract,easyocr,unstructured,llamaparse,liteparse,opendataloader,mistral-ocr,textract,google-docai,azure-docint,nougat,chandra,surya,firecrawl,evaluation]",
+ "docfold[docling,mineru,marker,pymupdf,paddleocr,tesseract,easyocr,unstructured,llamaparse,liteparse,opendataloader,mistral-ocr,textract,google-docai,azure-docint,nougat,chandra,surya,firecrawl,markitdown,evaluation]",
# Note: zerox excluded from [all] — py-zerox requires Python 3.11+
# Install separately: pip install docfold[zerox]
]
diff --git a/src/docfold/engines/markitdown_engine.py b/src/docfold/engines/markitdown_engine.py
new file mode 100644
index 0000000..cb47704
--- /dev/null
+++ b/src/docfold/engines/markitdown_engine.py
@@ -0,0 +1,135 @@
+"""MarkItDown engine adapter — Microsoft's open-source document-to-Markdown library.
+
+MarkItDown is a pure-Python tool that converts a wide range of document formats
+(Office files, PDFs, images, HTML, CSV/JSON/XML, ePub, audio, ZIP, ...) into
+LLM-friendly Markdown. See https://github.com/microsoft/markitdown.
+
+Install: ``pip install docfold[markitdown]``
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import time
+from typing import Any
+
+from docfold.engines.base import (
+ DocumentEngine,
+ EngineCapabilities,
+ EngineResult,
+ OutputFormat,
+)
+
+logger = logging.getLogger(__name__)
+
+_SUPPORTED_EXTENSIONS = {
+ # Office
+ "docx", "pptx", "xlsx", "xls",
+ # PDFs
+ "pdf",
+ # Web / markup
+ "html", "htm", "xml",
+ # Tabular / structured data
+ "csv", "tsv", "json",
+ # Images (markitdown runs OCR/LLM captioning when configured)
+ "png", "jpg", "jpeg", "gif", "bmp", "tiff", "tif", "webp",
+ # Audio (transcription)
+ "mp3", "wav", "m4a",
+ # eBooks / archives / misc
+ "epub", "zip", "txt", "md",
+}
+
+
+class MarkItDownEngine(DocumentEngine):
+    """Adapter for Microsoft's ``markitdown`` library.
+
+    Markitdown converts documents to Markdown via its synchronous ``convert``
+    method. We dispatch the call through ``run_in_executor`` so it does not
+    block the event loop.
+    """
+
+    def __init__(self, enable_plugins: bool = False) -> None:
+        self._enable_plugins = enable_plugins
+        self._converter: Any = None  # built lazily by _get_converter()
+
+    @property
+    def name(self) -> str:
+        return "markitdown"
+
+    @property
+    def supported_extensions(self) -> set[str]:
+        return set(_SUPPORTED_EXTENSIONS)  # copy: callers can't mutate the shared set
+
+    @property
+    def capabilities(self) -> EngineCapabilities:
+        # Markitdown returns a Markdown string — no layout, bboxes, or
+        # confidence scores.
+        return EngineCapabilities()
+
+    def is_available(self) -> bool:
+        # markitdown's import chain pulls in pdfminer/cryptography, which can
+        # raise non-ImportError exceptions (e.g. a broken PyO3 binding).
+        # Treat any import failure as "unavailable" so a broken env cannot
+        # knock out the whole router / benchmark harness.
+        try:
+            import markitdown  # noqa: F401
+            return True
+        except Exception:  # noqa: BLE001
+            return False
+
+    def _get_converter(self) -> Any:
+        if self._converter is None:
+            from markitdown import MarkItDown
+            self._converter = MarkItDown(enable_plugins=self._enable_plugins)
+        return self._converter
+
+    async def process(
+        self,
+        file_path: str,
+        output_format: OutputFormat = OutputFormat.MARKDOWN,
+        **kwargs: Any,
+    ) -> EngineResult:
+        """Convert ``file_path`` with markitdown and wrap it as ``output_format``.
+
+        Raises RuntimeError when the ``markitdown`` package is not installed.
+        """
+        start = time.perf_counter()
+
+        try:
+            converter = self._get_converter()
+        except ImportError as exc:
+            raise RuntimeError(
+                "markitdown is not installed. Install with: pip install docfold[markitdown]"
+            ) from exc
+        except TypeError:
+            # Older markitdown versions don't accept enable_plugins kwarg.
+            from markitdown import MarkItDown
+            converter = self._converter = MarkItDown()
+
+        loop = asyncio.get_running_loop()
+        convert_result = await loop.run_in_executor(None, converter.convert, file_path)
+
+        markdown_text: str = getattr(convert_result, "text_content", "") or ""
+        title = getattr(convert_result, "title", None)
+
+        if output_format == OutputFormat.JSON:
+            content = json.dumps({"markdown": markdown_text, "title": title}, ensure_ascii=False)
+        elif output_format == OutputFormat.HTML:
+            # markitdown doesn't render HTML itself: emit an escaped <pre> wrapper.
+            import html
+            content = f"<pre>{html.escape(markdown_text)}</pre>"
+        else:
+            # MARKDOWN and TEXT both return the markdown string as-is.
+            content = markdown_text
+
+        elapsed_ms = int((time.perf_counter() - start) * 1000)
+
+        return EngineResult(
+            content=content,
+            format=output_format,
+            engine_name=self.name,
+            processing_time_ms=elapsed_ms,
+            metadata={"title": title, "enable_plugins": self._enable_plugins},
+        )
diff --git a/src/docfold/engines/router.py b/src/docfold/engines/router.py
index d23fe57..8cafa77 100644
--- a/src/docfold/engines/router.py
+++ b/src/docfold/engines/router.py
@@ -25,6 +25,7 @@
_IMAGE_PRIORITY = [
"chandra", "surya", "paddleocr", "tesseract", "easyocr", "docling", "liteparse",
"mistral_ocr", "google_docai", "textract", "azure_docint", "zerox", "marker",
+ "markitdown",
]
_EXTENSION_PRIORITY: dict[str, list[str]] = {
@@ -33,31 +34,41 @@
"docling", "mineru", "chandra", "unstructured", "marker",
"llamaparse", "liteparse", "mistral_ocr", "firecrawl", "google_docai",
"azure_docint", "textract", "zerox", "nougat", "surya", "pymupdf",
- "paddleocr", "tesseract", "easyocr",
+ "paddleocr", "tesseract", "easyocr", "markitdown",
],
# --- Office ---
"docx": [
"docling", "marker", "unstructured", "llamaparse",
- "liteparse", "firecrawl", "azure_docint",
+ "liteparse", "firecrawl", "azure_docint", "markitdown",
],
"doc": ["docling", "marker", "unstructured", "llamaparse", "liteparse", "azure_docint"],
- "pptx": ["docling", "marker", "unstructured", "llamaparse", "liteparse", "azure_docint"],
+ "pptx": [
+ "docling", "marker", "unstructured", "llamaparse",
+ "liteparse", "azure_docint", "markitdown",
+ ],
"ppt": ["docling", "marker", "unstructured", "llamaparse", "liteparse", "azure_docint"],
- "xlsx": ["docling", "marker", "unstructured", "llamaparse", "liteparse", "azure_docint"],
- "xls": ["docling", "marker", "unstructured", "llamaparse", "liteparse", "azure_docint"],
+ "xlsx": [
+ "docling", "marker", "unstructured", "llamaparse",
+ "liteparse", "azure_docint", "markitdown",
+ ],
+ "xls": [
+ "docling", "marker", "unstructured", "llamaparse",
+ "liteparse", "azure_docint", "markitdown",
+ ],
"odt": ["marker", "unstructured"],
"odp": ["marker", "unstructured"],
"ods": ["marker", "unstructured"],
# --- Web / markup ---
- "html": ["docling", "firecrawl", "unstructured", "marker", "azure_docint"],
- "htm": ["docling", "firecrawl", "unstructured", "marker", "azure_docint"],
- "xml": ["firecrawl", "unstructured"],
- "md": ["unstructured"],
+ "html": ["docling", "firecrawl", "unstructured", "marker", "azure_docint", "markitdown"],
+ "htm": ["docling", "firecrawl", "unstructured", "marker", "azure_docint", "markitdown"],
+ "xml": ["firecrawl", "unstructured", "markitdown"],
+ "md": ["unstructured", "markitdown"],
"rst": ["unstructured"],
- "csv": ["unstructured"],
- "tsv": ["unstructured"],
- "txt": ["unstructured"],
+ "csv": ["unstructured", "markitdown"],
+ "tsv": ["unstructured", "markitdown"],
+ "txt": ["unstructured", "markitdown"],
"rtf": ["unstructured"],
+ "json": ["markitdown"],
# --- Images ---
"png": _IMAGE_PRIORITY,
"jpg": _IMAGE_PRIORITY,
@@ -71,7 +82,13 @@
"eml": ["unstructured"],
"msg": ["unstructured"],
# --- eBooks ---
- "epub": ["unstructured", "marker"],
+ "epub": ["unstructured", "marker", "markitdown"],
+ # --- Audio (transcription) ---
+ "mp3": ["markitdown"],
+ "wav": ["markitdown"],
+ "m4a": ["markitdown"],
+ # --- Archives ---
+ "zip": ["markitdown"],
}
# Ultimate fallback when extension is unknown or missing from the map.
@@ -79,7 +96,7 @@
"docling", "mineru", "chandra", "unstructured", "marker",
"llamaparse", "liteparse", "mistral_ocr", "google_docai", "azure_docint",
"textract", "zerox", "nougat", "surya", "pymupdf", "paddleocr", "tesseract",
- "easyocr",
+ "easyocr", "markitdown",
]
diff --git a/tests/engines/test_markitdown_engine.py b/tests/engines/test_markitdown_engine.py
new file mode 100644
index 0000000..9139e3c
--- /dev/null
+++ b/tests/engines/test_markitdown_engine.py
@@ -0,0 +1,209 @@
+"""Tests for the MarkItDown engine adapter.
+
+The ``markitdown`` package is not a test-time dependency; these tests mock the
+import path and the ``MarkItDown`` class so they run on any host.
+"""
+
+from __future__ import annotations
+
+import sys
+import types
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from docfold.engines.base import EngineResult, OutputFormat
+
+
+def _install_fake_markitdown(text_content: str = "# Hello\n\nWorld") -> MagicMock:
+ """Inject a fake ``markitdown`` module into ``sys.modules``.
+
+ Returns the mock ``MarkItDown`` class so individual tests can assert
+ on how it was called.
+ """
+ fake_module = types.ModuleType("markitdown")
+ mock_class = MagicMock(name="MarkItDown")
+
+ # Default: MarkItDown().convert(path).text_content = text_content
+ instance = MagicMock()
+ convert_result = MagicMock()
+ convert_result.text_content = text_content
+ convert_result.title = None
+ instance.convert.return_value = convert_result
+ mock_class.return_value = instance
+
+ fake_module.MarkItDown = mock_class
+ sys.modules["markitdown"] = fake_module
+ return mock_class
+
+
+def _remove_fake_markitdown() -> None:
+ sys.modules.pop("markitdown", None)
+
+
+@pytest.fixture
+def fake_markitdown(monkeypatch):
+    """Install a fake ``markitdown`` for one test; the prior entry is restored."""
+    # Record the current sys.modules entry first so monkeypatch undoes the
+    # helper's write: an already-imported real markitdown comes back afterwards.
+    monkeypatch.setitem(sys.modules, "markitdown", None)
+    return _install_fake_markitdown()
+
+
+class TestMarkItDownEngineMetadata:
+    def test_name(self):
+        from docfold.engines.markitdown_engine import MarkItDownEngine
+
+        assert MarkItDownEngine().name == "markitdown"
+
+    def test_supported_extensions_covers_markitdown_formats(self):
+        from docfold.engines.markitdown_engine import MarkItDownEngine
+
+        exts = MarkItDownEngine().supported_extensions
+        # A sample of the formats markitdown documents support: Office, PDFs,
+        # images, web/markup, tabular/structured data, and ePub.
+        for fmt in ("pdf", "docx", "pptx", "xlsx", "html", "htm",
+                    "png", "jpg", "jpeg", "csv", "json", "xml", "epub"):
+            assert fmt in exts, f"expected '{fmt}' in supported_extensions"
+
+    def test_capabilities_are_empty_by_default(self):
+        from docfold.engines.markitdown_engine import MarkItDownEngine
+
+        caps = MarkItDownEngine().capabilities
+        # markitdown returns plain markdown with no layout info
+        assert caps.bounding_boxes is False
+        assert caps.confidence is False
+        assert caps.table_structure is False
+
+    def test_is_available_true_when_importable(self, fake_markitdown):
+        from docfold.engines.markitdown_engine import MarkItDownEngine
+
+        assert MarkItDownEngine().is_available() is True
+
+    def test_is_available_false_when_missing(self):
+        from docfold.engines.markitdown_engine import MarkItDownEngine
+
+        _remove_fake_markitdown()
+        with patch.dict(sys.modules, {"markitdown": None}):
+            assert MarkItDownEngine().is_available() is False
+
+
+class TestMarkItDownEngineProcess:
+ @pytest.mark.asyncio
+ async def test_process_markdown_returns_engine_result(self, fake_markitdown):
+ from docfold.engines.markitdown_engine import MarkItDownEngine
+
+ # Custom markdown payload for this test
+ instance = fake_markitdown.return_value
+ instance.convert.return_value.text_content = (
+ "# Invoice 2024\n\nAmount: **$1,250.00**"
+ )
+
+ engine = MarkItDownEngine()
+ result = await engine.process("invoice.pdf", output_format=OutputFormat.MARKDOWN)
+
+ assert isinstance(result, EngineResult)
+ assert result.engine_name == "markitdown"
+ assert result.format == OutputFormat.MARKDOWN
+ assert "Invoice 2024" in result.content
+ assert "$1,250.00" in result.content
+ assert result.processing_time_ms >= 0
+
+ # MarkItDown().convert("invoice.pdf") must have been called
+ instance.convert.assert_called_once()
+ call_args = instance.convert.call_args
+ assert call_args.args[0] == "invoice.pdf"
+
+ @pytest.mark.asyncio
+ async def test_process_runs_convert_in_executor(self, fake_markitdown):
+ """The sync convert() call must be dispatched through run_in_executor
+ so it does not block the event loop."""
+ import asyncio
+
+ from docfold.engines.markitdown_engine import MarkItDownEngine
+
+ engine = MarkItDownEngine()
+
+ loop = asyncio.get_running_loop()
+ original_run_in_executor = loop.run_in_executor
+ call_count = {"n": 0}
+
+ async def spy(*args, **kwargs):
+ call_count["n"] += 1
+ return await original_run_in_executor(*args, **kwargs)
+
+ with patch.object(loop, "run_in_executor", side_effect=spy):
+ await engine.process("some.pdf", output_format=OutputFormat.MARKDOWN)
+
+ assert call_count["n"] >= 1, "convert() must be dispatched via run_in_executor"
+
+ @pytest.mark.asyncio
+ async def test_process_text_format_returns_plain_markdown(self, fake_markitdown):
+ from docfold.engines.markitdown_engine import MarkItDownEngine
+
+ instance = fake_markitdown.return_value
+ instance.convert.return_value.text_content = "# Title\n\nBody"
+
+ result = await MarkItDownEngine().process("x.pdf", output_format=OutputFormat.TEXT)
+
+ assert result.format == OutputFormat.TEXT
+ # TEXT format should pass the markdown string through unchanged.
+ assert "Title" in result.content
+ assert "Body" in result.content
+
+ @pytest.mark.asyncio
+ async def test_process_json_format_wraps_markdown(self, fake_markitdown):
+ import json as _json
+
+ from docfold.engines.markitdown_engine import MarkItDownEngine
+
+ instance = fake_markitdown.return_value
+ instance.convert.return_value.text_content = "# Doc\n\nHello"
+
+ result = await MarkItDownEngine().process("x.pdf", output_format=OutputFormat.JSON)
+
+ assert result.format == OutputFormat.JSON
+ parsed = _json.loads(result.content)
+ assert isinstance(parsed, dict)
+ assert "markdown" in parsed
+ assert "Doc" in parsed["markdown"]
+
+ @pytest.mark.asyncio
+ async def test_process_html_format_wraps_markdown(self, fake_markitdown):
+ from docfold.engines.markitdown_engine import MarkItDownEngine
+
+ instance = fake_markitdown.return_value
+ instance.convert.return_value.text_content = "# Doc"
+
+ result = await MarkItDownEngine().process("x.pdf", output_format=OutputFormat.HTML)
+
+ assert result.format == OutputFormat.HTML
+ # Markdown text must be preserved inside the HTML wrapper.
+ assert "Doc" in result.content
+ assert result.content.strip().startswith("<")
+
+ @pytest.mark.asyncio
+ async def test_process_missing_dependency_raises(self):
+ """When markitdown isn't installed, process() must raise a clear error."""
+ from docfold.engines.markitdown_engine import MarkItDownEngine
+
+ _remove_fake_markitdown()
+ with patch.dict(sys.modules, {"markitdown": None}):
+ engine = MarkItDownEngine()
+ with pytest.raises((RuntimeError, ImportError, ModuleNotFoundError)):
+ await engine.process("any.pdf", output_format=OutputFormat.MARKDOWN)
+
+ @pytest.mark.asyncio
+ async def test_process_preserves_unicode(self, fake_markitdown):
+ """Non-ASCII text (Arabic, CJK, Hebrew) must pass through unchanged."""
+ from docfold.engines.markitdown_engine import MarkItDownEngine
+
+ payload = "تقرير سنوي 2024\n\n2024年度报告\n\nדוח שנתי 2024"
+ instance = fake_markitdown.return_value
+ instance.convert.return_value.text_content = payload
+
+ result = await MarkItDownEngine().process("i18n.pdf", output_format=OutputFormat.MARKDOWN)
+
+ assert "تقرير" in result.content
+ assert "年度报告" in result.content
+ assert "דוח" in result.content