diff --git a/CHANGELOG.md b/CHANGELOG.md index 29a2b68..04a6868 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- **MarkItDown engine adapter** — wraps Microsoft's [`markitdown`](https://github.com/microsoft/markitdown) pure-Python library that converts Office files, PDFs, HTML, images, CSV/JSON/XML, ePub, audio, and ZIP archives into LLM-friendly Markdown. Added to the `benchmark.py` harness alongside the other local engines. Install: `pip install docfold[markitdown]`. +- **Non-PDF benchmark fixtures** — `benchmark.py` now also generates synthetic DOCX (built with stdlib `zipfile` + minimal Office Open XML, no extra deps), HTML, and CSV documents, and filters engines per-doc by `supported_extensions` so PyMuPDF / OCR engines no longer log spurious errors on Office or web fixtures. - **OpenDataLoader PDF engine adapter** — wraps the Java-based [`opendataloader-pdf`](https://github.com/opendataloader-project/opendataloader-pdf) tool (via its bundled-JAR Python wheel). Local, deterministic extraction with typed structural elements (heading, paragraph, table, list, header, footer) and per-element bounding boxes. Install: `pip install docfold[opendataloader]` (also requires Java 11+). - **Multi-script benchmark coverage** — `benchmark.py` now generates Arabic (RTL + shaping), Hebrew (RTL, no shaping), and Simplified Chinese (CJK) synthetic PDFs alongside the existing English docs. Fonts are bundled under `tests/fixtures/fonts/` (OFL-1.1, subsetted where relevant) so the benchmark is reproducible without system font packages. 
diff --git a/README.md b/README.md index 7c2d8f9..c623586 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ Docfold is the open-source extraction engine from [Datatera.ai](https://datatera | [**Azure Doc Intel**](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence) | ✅ | SaaS | Paid | ★★★ | ★★★ | ★★★ | ✅ | ✅ | Fast | $$ | | [**Nougat**](https://github.com/facebookresearch/nougat) | ✅ | Local | MIT | ★★★ | ★★☆ | ★★☆ | — | — | Slow | Free | | [**Surya**](https://github.com/VikParuchuri/surya) | ✅ | Local | GPL | ★★☆ | ★★★ | ★★☆ | ✅ | ✅ | Medium | Free | +| [**MarkItDown**](https://github.com/microsoft/markitdown) | ✅ | Local | MIT | ★★☆ | ★☆☆ | ★★☆ | — | — | Fast | Free | **★★★** Excellent **★★☆** Good **★☆☆** Basic **☆☆☆** Not supported — **$$** ~$1-3/1K pages **$$$** ~$5-15/1K pages — **BBox** Bounding boxes — **Conf** Confidence scores @@ -108,6 +109,7 @@ for name, res in results.items(): | [**Azure Doc Intel**](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence) | SaaS | Paid | PDF, Office, HTML, images | N/A | `pip install docfold[azure-docint]` | | [**Nougat**](https://github.com/facebookresearch/nougat) | Local | MIT (code) | PDF | Recommended | `pip install docfold[nougat]` | | [**Surya**](https://github.com/VikParuchuri/surya) | Local | GPL-3.0 | PDF, images | Optional | `pip install docfold[surya]` | +| [**MarkItDown**](https://github.com/microsoft/markitdown) | Local | MIT | PDF, Office, HTML, images, CSV/JSON/XML, ePub, audio, ZIP | No | `pip install docfold[markitdown]` | > **Adding your own engine?** Implement the `DocumentEngine` interface — see [Adding a Custom Engine](#adding-a-custom-engine) below. diff --git a/benchmark.py b/benchmark.py index 287d5a9..e70427f 100644 --- a/benchmark.py +++ b/benchmark.py @@ -286,9 +286,133 @@ def generate_benchmark_documents(tmpdir: str) -> list[dict]: # round-trip extraction for those scripts (null bytes, dropped matras). 
# They need real-world fixture PDFs — see docs/tasks/ for a follow-up. + # --- Doc 8: DOCX (Office) --- + # Minimal valid Office Open XML built with stdlib only — no python-docx + # dependency. Exercises engines that handle Office formats (markitdown, + # docling, unstructured, liteparse, ...). + doc8_path = os.path.join(tmpdir, "office_memo.docx") + doc8_paragraphs = [ + "Internal Memo", + "To: All Staff", + "Date: April 25, 2026", + "Subject: Q1 2026 Results", + "Revenue grew 18 percent year-over-year, exceeding the plan.", + "Operating margin improved to 24.1 percent.", + ] + create_docx(doc8_path, doc8_paragraphs) + documents.append({ + "name": "office_memo", + "path": doc8_path, + "ground_truth": "\n".join(doc8_paragraphs), + "pages": 1, + "category": "office", + }) + + # --- Doc 9: HTML page --- + doc9_path = os.path.join(tmpdir, "blog_post.html") + doc9_paragraphs = [ + "How Document Processing Works", + "Document processing converts unstructured files into structured data.", + "Modern pipelines combine layout analysis, OCR, and language models.", + "Open-source toolkits make these capabilities widely accessible.", + ] + doc9_html = ( + "Doc Processing" + f"

{doc9_paragraphs[0]}

" + + "".join(f"

{p}

" for p in doc9_paragraphs[1:]) + + "" + ) + with open(doc9_path, "w", encoding="utf-8") as f: + f.write(doc9_html) + documents.append({ + "name": "blog_post", + "path": doc9_path, + "ground_truth": "\n".join(doc9_paragraphs), + "pages": 1, + "category": "web", + }) + + # --- Doc 10: CSV (tabular) --- + # Engines that target Markdown output (markitdown, docling, ...) render + # CSV as a Markdown table. The ground truth is the canonical Markdown + # table so CER/WER measure formatting fidelity, not how cells are joined. + doc10_path = os.path.join(tmpdir, "sales.csv") + doc10_rows = [ + ["Region", "Q1", "Q2", "Q3", "Q4"], + ["North", "120", "135", "150", "180"], + ["South", "98", "110", "125", "140"], + ["East", "85", "92", "100", "118"], + ["West", "140", "155", "170", "200"], + ] + with open(doc10_path, "w", encoding="utf-8") as f: + for row in doc10_rows: + f.write(",".join(row) + "\n") + header = doc10_rows[0] + sep = ["---"] * len(header) + md_lines = ( + ["| " + " | ".join(header) + " |", "| " + " | ".join(sep) + " |"] + + ["| " + " | ".join(row) + " |" for row in doc10_rows[1:]] + ) + documents.append({ + "name": "sales_csv", + "path": doc10_path, + "ground_truth": "\n".join(md_lines), + "pages": 1, + "category": "tabular", + }) + return documents +def create_docx(path: str, paragraphs: list[str]) -> None: + """Build a minimal but valid .docx (Office Open XML) with no dependencies. + + Only enough structure to round-trip plain paragraphs through engines like + python-docx, docling, markitdown, unstructured, liteparse, ... 
+ """ + import zipfile + + def _xml_escape(s: str) -> str: + return ( + s.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + ) + + body = "".join( + f'{_xml_escape(p)}' + for p in paragraphs + ) + document_xml = ( + '' + '' + f'{body}' + '' + ) + content_types = ( + '' + '' + '' + '' + '' + '' + ) + rels = ( + '' + '' + '' + '' + ) + + with zipfile.ZipFile(path, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", content_types) + zf.writestr("_rels/.rels", rels) + zf.writestr("word/document.xml", document_xml) + + def compute_cer(predicted: str, reference: str) -> float: """Character Error Rate — Levenshtein distance / reference length.""" if not reference: @@ -351,6 +475,7 @@ async def main(): from docfold.engines.easyocr_engine import EasyOCREngine from docfold.engines.liteparse_engine import LiteParseEngine from docfold.engines.marker_local_engine import MarkerLocalEngine + from docfold.engines.markitdown_engine import MarkItDownEngine from docfold.engines.mineru_engine import MinerUEngine from docfold.engines.nougat_engine import NougatEngine from docfold.engines.opendataloader_engine import OpenDataLoaderEngine @@ -377,6 +502,7 @@ async def main(): (PaddleOCREngine(), "pip install paddleocr"), (TesseractEngine(), "pip install pytesseract"), (UnstructuredEngine(), "pip install unstructured"), + (MarkItDownEngine(), "pip install docfold[markitdown]"), ] # Skip engines that hang on CPU for multi-doc benchmarks @@ -414,8 +540,15 @@ async def main(): print(f"{'─' * 90}") gt = doc["ground_truth"] + doc_ext = os.path.splitext(doc["path"])[1].lstrip(".").lower() for engine in engines: + # Skip engines whose declared supported_extensions don't include + # this doc's format — keeps the report free of noise like + # "PyMuPDF can't open .docx". 
+ if doc_ext and doc_ext not in engine.supported_extensions: + continue + result, error = await run_engine( engine, doc["path"], OutputFormat.MARKDOWN ) diff --git a/docs/benchmark_results.json b/docs/benchmark_results.json index 631f884..ba7961b 100644 --- a/docs/benchmark_results.json +++ b/docs/benchmark_results.json @@ -1,8 +1,8 @@ { - "benchmark_date": "2026-04-17 10:17:34", + "benchmark_date": "2026-04-25 18:26:40", "engines": [ "pymupdf", - "opendataloader" + "markitdown" ], "documents": [ { @@ -39,11 +39,26 @@ "name": "hebrew_report", "pages": 1, "category": "rtl" + }, + { + "name": "office_memo", + "pages": 1, + "category": "office" + }, + { + "name": "blog_post", + "pages": 1, + "category": "web" + }, + { + "name": "sales_csv", + "pages": 1, + "category": "tabular" } ], "summary": { "pymupdf": { - "avg_time_ms": 6.4, + "avg_time_ms": 4.4, "avg_cer": 0.0, "avg_wer": 0.0, "avg_bbox_count": 5.3, @@ -52,7 +67,7 @@ "results": [ { "doc": "simple_text", - "time_ms": 11, + "time_ms": 5, "cer": 0.0, "wer": 0.0, "bbox_count": 5, @@ -61,7 +76,7 @@ }, { "doc": "multi_page", - "time_ms": 4, + "time_ms": 3, "cer": 0.0, "wer": 0.0, "bbox_count": 4, @@ -70,7 +85,7 @@ }, { "doc": "dense_financial", - "time_ms": 5, + "time_ms": 3, "cer": 0.0, "wer": 0.0, "bbox_count": 10, @@ -79,7 +94,7 @@ }, { "doc": "mixed_formatting", - "time_ms": 4, + "time_ms": 2, "cer": 0.0, "wer": 0.0, "bbox_count": 6, @@ -88,7 +103,7 @@ }, { "doc": "arabic_report", - "time_ms": 6, + "time_ms": 3, "cer": 0.0, "wer": 0.0, "bbox_count": 4, @@ -97,7 +112,7 @@ }, { "doc": "chinese_report", - "time_ms": 4, + "time_ms": 3, "cer": 0.0, "wer": 0.0, "bbox_count": 4, @@ -106,7 +121,7 @@ }, { "doc": "hebrew_report", - "time_ms": 11, + "time_ms": 12, "cer": 0.0, "wer": 0.0, "bbox_count": 4, @@ -115,76 +130,103 @@ } ] }, - "opendataloader": { - "avg_time_ms": 796.6, - "avg_cer": 0.257, - "avg_wer": 0.3756, - "avg_bbox_count": 3.1, + "markitdown": { + "avg_time_ms": 47.0, + "avg_cer": 0.0343, + "avg_wer": 0.1726, 
+ "avg_bbox_count": 0.0, "errors": 0, - "successes": 7, + "successes": 10, "results": [ { "doc": "simple_text", - "time_ms": 1083, - "cer": 0.0165, - "wer": 0.0556, - "bbox_count": 2, - "content_length": 123, - "pages": 1 + "time_ms": 144, + "cer": 0.0, + "wer": 0.0, + "bbox_count": 0, + "content_length": 121, + "pages": null }, { "doc": "multi_page", - "time_ms": 756, - "cer": 0.0133, - "wer": 0.0513, - "bbox_count": 2, - "content_length": 304, - "pages": 2 + "time_ms": 27, + "cer": 0.0, + "wer": 0.0, + "bbox_count": 0, + "content_length": 300, + "pages": null }, { "doc": "dense_financial", - "time_ms": 725, + "time_ms": 21, "cer": 0.0, "wer": 0.0, - "bbox_count": 1, + "bbox_count": 0, "content_length": 298, - "pages": 1 + "pages": null }, { "doc": "mixed_formatting", - "time_ms": 737, - "cer": 0.0308, - "wer": 0.0811, - "bbox_count": 5, - "content_length": 268, - "pages": 1 + "time_ms": 23, + "cer": 0.0, + "wer": 0.0, + "bbox_count": 0, + "content_length": 260, + "pages": null }, { "doc": "arabic_report", - "time_ms": 766, - "cer": 0.8675, - "wer": 1.08, - "bbox_count": 4, - "content_length": 151, - "pages": 1 + "time_ms": 69, + "cer": 0.2517, + "wer": 0.32, + "bbox_count": 0, + "content_length": 179, + "pages": null }, { "doc": "chinese_report", - "time_ms": 721, - "cer": 0.0317, - "wer": 0.25, - "bbox_count": 4, - "content_length": 65, - "pages": 1 + "time_ms": 20, + "cer": 0.0476, + "wer": 1.0, + "bbox_count": 0, + "content_length": 60, + "pages": null }, { "doc": "hebrew_report", - "time_ms": 788, - "cer": 0.8392, - "wer": 1.1111, - "bbox_count": 4, - "content_length": 149, - "pages": 1 + "time_ms": 148, + "cer": 0.035, + "wer": 0.3704, + "bbox_count": 0, + "content_length": 138, + "pages": null + }, + { + "doc": "office_memo", + "time_ms": 10, + "cer": 0.0, + "wer": 0.0, + "bbox_count": 0, + "content_length": 176, + "pages": null + }, + { + "doc": "blog_post", + "time_ms": 5, + "cer": 0.0087, + "wer": 0.0357, + "bbox_count": 0, + "content_length": 233, + 
"pages": null + }, + { + "doc": "sales_csv", + "time_ms": 3, + "cer": 0.0, + "wer": 0.0, + "bbox_count": 0, + "content_length": 193, + "pages": null } ] } diff --git a/docs/tasks/MARKITDOWN_ENGINE.md b/docs/tasks/MARKITDOWN_ENGINE.md new file mode 100644 index 0000000..41e6942 --- /dev/null +++ b/docs/tasks/MARKITDOWN_ENGINE.md @@ -0,0 +1,127 @@ +--- +purpose: "Integrate Microsoft's markitdown as a new docfold engine and include it in the benchmark harness." +status: "IN_PROGRESS" +priority: "P1" +created: "2026-04-24" +--- + +# Feature: markitdown engine adapter + benchmark coverage + +## Problem + +Microsoft ships [markitdown](https://github.com/microsoft/markitdown), a pure-Python +library that converts a wide range of formats (PDF, DOCX, PPTX, XLSX, HTML, CSV, +JSON, XML, images, audio, ePub, ZIP, YouTube URLs) into LLM-friendly Markdown. +It is MIT-licensed, has no heavy runtime, and is a sensible "lowest common +denominator" baseline that users expect docfold to support alongside Docling, +Marker, Unstructured, etc. Today there is no adapter, no extras group, and it +does not appear in `benchmark.py`. + +Task from the user: "connect it and run benches" — so we need both the adapter +and synthetic-PDF benchmark coverage comparable to the other local engines. + +## Proposed Solution + +1. New `MarkItDownEngine` adapter under `src/docfold/engines/markitdown_engine.py` + that conforms to the `DocumentEngine` ABC: + - Lazy-imports `markitdown` (keeps the base package dep-free). + - Calls `MarkItDown().convert(file_path)` inside an executor (the library's + API is synchronous). + - Returns an `EngineResult` with `format=OutputFormat.MARKDOWN` (markitdown + always emits Markdown; for `HTML`/`JSON`/`TEXT` we serialize the Markdown + string into a minimal wrapper so the contract holds). + - `is_available()` returns True only when `markitdown` is importable. + - `capabilities` is empty — markitdown returns plain text, no bboxes. +2. 
New `markitdown` extras in `pyproject.toml`: + `markitdown = ["markitdown[all]>=0.0.1"]` and add to `all = [...]`. +3. Register it in `engines/router.py` priority lists for the formats it + handles (PDF, Office, HTML, images, CSV/text, ePub, ZIP) — placed near the + Unstructured/Marker tier since it is a similar "convert to Markdown" + baseline rather than a layout analyzer. +4. Wire it into `benchmark.py` as an additional candidate engine so it runs + on the same 7 synthetic PDFs as the existing engines and reports CER/WER/ + time like the others. +5. Add a row for markitdown to the two engine tables in `README.md`. + +## Affected Files + +- `src/docfold/engines/markitdown_engine.py` — new adapter. +- `tests/engines/test_markitdown_engine.py` — new test file, mocks the + `markitdown` package (tests do not require it installed). +- `src/docfold/engines/router.py` — add `"markitdown"` to extension priority + lists and the default fallback. +- `pyproject.toml` — add `markitdown` extras, include in `all`. +- `benchmark.py` — import and register `MarkItDownEngine` in the candidate + list. +- `README.md` — add markitdown row to the two engine overview tables. + +## Test Plan + +### Unit / Functional Tests + +- [ ] `test_name` — engine name is `"markitdown"`. +- [ ] `test_supported_extensions` — covers PDF, DOCX, PPTX, XLSX, HTML, images, + CSV, JSON, XML, ePub. +- [ ] `test_capabilities_defaults_to_empty` — no bboxes/confidence etc. +- [ ] `test_is_available_true_when_importable` — patched import succeeds. +- [ ] `test_is_available_false_when_missing` — `ImportError` short-circuits. +- [ ] `test_process_markdown_returns_engine_result` — mock the `MarkItDown` + class so `convert(...).text_content` is a known Markdown string; assert + the `EngineResult` fields (content, format, engine_name, time). +- [ ] `test_process_runs_convert_in_executor` — the synchronous `convert` call + must be dispatched via `loop.run_in_executor` so we don't block the + event loop. 
+- [ ] `test_process_missing_dependency_raises` — when markitdown isn't + installed, `.process()` should raise a clear `RuntimeError` (or similar) + so callers see *why* it failed. + +### Integration / E2E Tests + +- [ ] `benchmark.py` runs on a host where `markitdown` is installed and + produces a row for it in the summary table. + +### Test Commands +```bash +# Run just the new engine tests +pytest tests/engines/test_markitdown_engine.py -v + +# Full suite (should stay green) +pytest tests/ + +# E2E benchmark (requires: pip install docfold[markitdown]) +python benchmark.py +``` + +## Edge Cases + +- `markitdown` not installed on CI — tests must mock the import path and not + require the real dependency (mirrors `test_liteparse_engine.py`). +- `OutputFormat.HTML` / `JSON` / `TEXT` — markitdown only produces Markdown. + We honor the request by wrapping the Markdown string (HTML: wrap in + `
`; JSON: `{"markdown": "..."}`; TEXT: pass through).
+- Unicode / CJK / RTL documents — ensure the string is passed through without
+  encoding munging (the benchmark's Arabic/Hebrew/Chinese fixtures will
+  cover this in the E2E run).
+
+## Out of Scope
+
+- No plugin hooks for markitdown's extensibility system (custom converters).
+- No attempt to extract bounding boxes — markitdown doesn't produce them.
+- No YouTube URL routing: `_EXTENSION_PRIORITY` is keyed by file extension. Audio
+  (`mp3`, `wav`, `m4a`), `zip`, and `json` get new markitdown-only entries instead.
+
+## Follow-up: non-PDF benchmark coverage
+
+The first round of `benchmark.py` only generated PDFs, which is the format
+where markitdown is *least* differentiated (PyMuPDF dominates on digital text
+PDFs). To actually exercise where markitdown shines, the harness now also
+produces:
+
+- A synthetic **DOCX** (built via stdlib `zipfile` + minimal Word XML — no
+  new runtime deps).
+- A synthetic **HTML** page with a heading and three paragraphs.
+- A synthetic **CSV** with a few rows.
+
+Engines are filtered per-doc by `supported_extensions`, so PyMuPDF / OCR
+engines simply don't run on Office / web / tabular fixtures (instead of
+spamming the report with errors).
diff --git a/pyproject.toml b/pyproject.toml
index 29af5f8..fb25214 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -108,6 +108,9 @@ surya = [
 firecrawl = [
     "firecrawl-py>=1.0",
 ]
+markitdown = [
+    "markitdown[all]>=0.0.1",
+]
 evaluation = [
     "jiwer>=3.0",          # WER/CER computation
     "numpy>=1.23",
@@ -116,7 +119,7 @@ evaluation = [
     "psutil>=5.9",         # Memory measurement
 ]
 all = [
-    "docfold[docling,mineru,marker,pymupdf,paddleocr,tesseract,easyocr,unstructured,llamaparse,liteparse,opendataloader,mistral-ocr,textract,google-docai,azure-docint,nougat,chandra,surya,firecrawl,evaluation]",
+    "docfold[docling,mineru,marker,pymupdf,paddleocr,tesseract,easyocr,unstructured,llamaparse,liteparse,opendataloader,mistral-ocr,textract,google-docai,azure-docint,nougat,chandra,surya,firecrawl,markitdown,evaluation]",
     # Note: zerox excluded from [all] — py-zerox requires Python 3.11+
     # Install separately: pip install docfold[zerox]
 ]
diff --git a/src/docfold/engines/markitdown_engine.py b/src/docfold/engines/markitdown_engine.py
new file mode 100644
index 0000000..cb47704
--- /dev/null
+++ b/src/docfold/engines/markitdown_engine.py
@@ -0,0 +1,135 @@
+"""MarkItDown engine adapter — Microsoft's open-source document-to-Markdown library.
+
+MarkItDown is a pure-Python tool that converts a wide range of document formats
+(Office files, PDFs, images, HTML, CSV/JSON/XML, ePub, audio, ZIP, ...) into
+LLM-friendly Markdown.  See https://github.com/microsoft/markitdown.
+
+Install: ``pip install docfold[markitdown]``
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import time
+from typing import Any
+
+from docfold.engines.base import (
+    DocumentEngine,
+    EngineCapabilities,
+    EngineResult,
+    OutputFormat,
+)
+
+logger = logging.getLogger(__name__)
+
+_SUPPORTED_EXTENSIONS = {
+    # Office
+    "docx", "pptx", "xlsx", "xls",
+    # PDFs
+    "pdf",
+    # Web / markup
+    "html", "htm", "xml",
+    # Tabular / structured data
+    "csv", "tsv", "json",
+    # Images (OCR/LLM captioning is opt-in — NOTE(review): not configured by default here)
+    "png", "jpg", "jpeg", "gif", "bmp", "tiff", "tif", "webp",
+    # Audio (transcription)
+    "mp3", "wav", "m4a",
+    # eBooks / archives / plain text
+    "epub", "zip", "txt", "md",
+}
+
+
+class MarkItDownEngine(DocumentEngine):
+    """Adapter for Microsoft's ``markitdown`` library.
+
+    Markitdown converts documents to Markdown via its synchronous ``convert``
+    method.  We dispatch the call through ``run_in_executor`` so it does not
+    block the event loop.
+    """
+
+    def __init__(self, enable_plugins: bool = False) -> None:
+        self._enable_plugins = enable_plugins  # forwarded to MarkItDown(enable_plugins=...)
+        self._converter: Any = None  # built lazily on first use, then reused
+
+    @property
+    def name(self) -> str:
+        return "markitdown"  # same string the router priority lists use
+
+    @property
+    def supported_extensions(self) -> set[str]:
+        return set(_SUPPORTED_EXTENSIONS)  # copy, so callers can't mutate the shared set
+
+    @property
+    def capabilities(self) -> EngineCapabilities:
+        # MarkItDown only yields a Markdown string (no layout, bboxes, or
+        # confidence scores), so every capability keeps its default value.
+        return EngineCapabilities()
+
+    def is_available(self) -> bool:
+        # Importing markitdown drags in pdfminer/cryptography, and a damaged
+        # install there (e.g. a broken PyO3 binding) can raise something
+        # other than ImportError.  Any failure therefore means "unavailable",
+        # so one bad environment cannot take down the router or benchmark.
+        try:
+            import markitdown  # noqa: F401
+        except Exception:  # noqa: BLE001
+            return False
+        return True
+
+    def _get_converter(self) -> Any:
+        if self._converter is None:  # first call: build the converter and cache it
+            from markitdown import MarkItDown  # deferred import; ImportError surfaces here
+            self._converter = MarkItDown(enable_plugins=self._enable_plugins)
+        return self._converter
+
+    async def process(
+        self,
+        file_path: str,
+        output_format: OutputFormat = OutputFormat.MARKDOWN,
+        **kwargs: Any,
+    ) -> EngineResult:
+        start = time.perf_counter()
+
+        try:
+            converter = self._get_converter()
+        except ImportError as exc:
+            raise RuntimeError(
+                "markitdown is not installed. Install with: pip install docfold[markitdown]"
+            ) from exc
+        except TypeError:
+            # Older markitdown versions don't accept enable_plugins kwarg.
+            from markitdown import MarkItDown
+            self._converter = MarkItDown()
+            converter = self._converter
+
+        loop = asyncio.get_running_loop()
+        convert_result = await loop.run_in_executor(None, converter.convert, file_path)
+
+        markdown_text: str = getattr(convert_result, "text_content", "") or ""
+        title = getattr(convert_result, "title", None)
+
+        if output_format == OutputFormat.JSON:
+            content = json.dumps({"markdown": markdown_text, "title": title},
+                                 ensure_ascii=False)
+        elif output_format == OutputFormat.HTML:
+            # Minimal wrapper — markitdown doesn't render HTML itself.
+            content = f"
{markdown_text}
" + else: + # MARKDOWN and TEXT both return the markdown string as-is. + content = markdown_text + + elapsed_ms = int((time.perf_counter() - start) * 1000) + + return EngineResult( + content=content, + format=output_format, + engine_name=self.name, + processing_time_ms=elapsed_ms, + metadata={ + "title": title, + "enable_plugins": self._enable_plugins, + }, + ) diff --git a/src/docfold/engines/router.py b/src/docfold/engines/router.py index d23fe57..8cafa77 100644 --- a/src/docfold/engines/router.py +++ b/src/docfold/engines/router.py @@ -25,6 +25,7 @@ _IMAGE_PRIORITY = [ "chandra", "surya", "paddleocr", "tesseract", "easyocr", "docling", "liteparse", "mistral_ocr", "google_docai", "textract", "azure_docint", "zerox", "marker", + "markitdown", ] _EXTENSION_PRIORITY: dict[str, list[str]] = { @@ -33,31 +34,41 @@ "docling", "mineru", "chandra", "unstructured", "marker", "llamaparse", "liteparse", "mistral_ocr", "firecrawl", "google_docai", "azure_docint", "textract", "zerox", "nougat", "surya", "pymupdf", - "paddleocr", "tesseract", "easyocr", + "paddleocr", "tesseract", "easyocr", "markitdown", ], # --- Office --- "docx": [ "docling", "marker", "unstructured", "llamaparse", - "liteparse", "firecrawl", "azure_docint", + "liteparse", "firecrawl", "azure_docint", "markitdown", ], "doc": ["docling", "marker", "unstructured", "llamaparse", "liteparse", "azure_docint"], - "pptx": ["docling", "marker", "unstructured", "llamaparse", "liteparse", "azure_docint"], + "pptx": [ + "docling", "marker", "unstructured", "llamaparse", + "liteparse", "azure_docint", "markitdown", + ], "ppt": ["docling", "marker", "unstructured", "llamaparse", "liteparse", "azure_docint"], - "xlsx": ["docling", "marker", "unstructured", "llamaparse", "liteparse", "azure_docint"], - "xls": ["docling", "marker", "unstructured", "llamaparse", "liteparse", "azure_docint"], + "xlsx": [ + "docling", "marker", "unstructured", "llamaparse", + "liteparse", "azure_docint", "markitdown", + ], + "xls": [ + 
"docling", "marker", "unstructured", "llamaparse", + "liteparse", "azure_docint", "markitdown", + ], "odt": ["marker", "unstructured"], "odp": ["marker", "unstructured"], "ods": ["marker", "unstructured"], # --- Web / markup --- - "html": ["docling", "firecrawl", "unstructured", "marker", "azure_docint"], - "htm": ["docling", "firecrawl", "unstructured", "marker", "azure_docint"], - "xml": ["firecrawl", "unstructured"], - "md": ["unstructured"], + "html": ["docling", "firecrawl", "unstructured", "marker", "azure_docint", "markitdown"], + "htm": ["docling", "firecrawl", "unstructured", "marker", "azure_docint", "markitdown"], + "xml": ["firecrawl", "unstructured", "markitdown"], + "md": ["unstructured", "markitdown"], "rst": ["unstructured"], - "csv": ["unstructured"], - "tsv": ["unstructured"], - "txt": ["unstructured"], + "csv": ["unstructured", "markitdown"], + "tsv": ["unstructured", "markitdown"], + "txt": ["unstructured", "markitdown"], "rtf": ["unstructured"], + "json": ["markitdown"], # --- Images --- "png": _IMAGE_PRIORITY, "jpg": _IMAGE_PRIORITY, @@ -71,7 +82,13 @@ "eml": ["unstructured"], "msg": ["unstructured"], # --- eBooks --- - "epub": ["unstructured", "marker"], + "epub": ["unstructured", "marker", "markitdown"], + # --- Audio (transcription) --- + "mp3": ["markitdown"], + "wav": ["markitdown"], + "m4a": ["markitdown"], + # --- Archives --- + "zip": ["markitdown"], } # Ultimate fallback when extension is unknown or missing from the map. 
@@ -79,7 +96,7 @@ "docling", "mineru", "chandra", "unstructured", "marker", "llamaparse", "liteparse", "mistral_ocr", "google_docai", "azure_docint", "textract", "zerox", "nougat", "surya", "pymupdf", "paddleocr", "tesseract", - "easyocr", + "easyocr", "markitdown", ] diff --git a/tests/engines/test_markitdown_engine.py b/tests/engines/test_markitdown_engine.py new file mode 100644 index 0000000..9139e3c --- /dev/null +++ b/tests/engines/test_markitdown_engine.py @@ -0,0 +1,209 @@ +"""Tests for the MarkItDown engine adapter. + +The ``markitdown`` package is not a test-time dependency; these tests mock the +import path and the ``MarkItDown`` class so they run on any host. +""" + +from __future__ import annotations + +import sys +import types +from unittest.mock import MagicMock, patch + +import pytest + +from docfold.engines.base import EngineResult, OutputFormat + + +def _install_fake_markitdown(text_content: str = "# Hello\n\nWorld") -> MagicMock: + """Inject a fake ``markitdown`` module into ``sys.modules``. + + Returns the mock ``MarkItDown`` class so individual tests can assert + on how it was called. 
+ """ + fake_module = types.ModuleType("markitdown") + mock_class = MagicMock(name="MarkItDown") + + # Default: MarkItDown().convert(path).text_content = text_content + instance = MagicMock() + convert_result = MagicMock() + convert_result.text_content = text_content + convert_result.title = None + instance.convert.return_value = convert_result + mock_class.return_value = instance + + fake_module.MarkItDown = mock_class + sys.modules["markitdown"] = fake_module + return mock_class + + +def _remove_fake_markitdown() -> None: + sys.modules.pop("markitdown", None) + + +@pytest.fixture +def fake_markitdown(): + mock_class = _install_fake_markitdown() + try: + yield mock_class + finally: + _remove_fake_markitdown() + + +class TestMarkItDownEngineMetadata: + def test_name(self): + from docfold.engines.markitdown_engine import MarkItDownEngine + + assert MarkItDownEngine().name == "markitdown" + + def test_supported_extensions_covers_markitdown_formats(self): + from docfold.engines.markitdown_engine import MarkItDownEngine + + exts = MarkItDownEngine().supported_extensions + # The formats markitdown documents support: Office, PDFs, images, + # web/markup, tabular, ePub, audio. 
+ for fmt in ("pdf", "docx", "pptx", "xlsx", "html", "htm", + "png", "jpg", "jpeg", "csv", "json", "xml", "epub"): + assert fmt in exts, f"expected '{fmt}' in supported_extensions" + + def test_capabilities_are_empty_by_default(self): + from docfold.engines.markitdown_engine import MarkItDownEngine + + caps = MarkItDownEngine().capabilities + # markitdown returns plain markdown with no layout info + assert caps.bounding_boxes is False + assert caps.confidence is False + assert caps.table_structure is False + + def test_is_available_true_when_importable(self, fake_markitdown): + from docfold.engines.markitdown_engine import MarkItDownEngine + + assert MarkItDownEngine().is_available() is True + + def test_is_available_false_when_missing(self): + from docfold.engines.markitdown_engine import MarkItDownEngine + + _remove_fake_markitdown() + with patch.dict(sys.modules, {"markitdown": None}): + assert MarkItDownEngine().is_available() is False + + +class TestMarkItDownEngineProcess: + @pytest.mark.asyncio + async def test_process_markdown_returns_engine_result(self, fake_markitdown): + from docfold.engines.markitdown_engine import MarkItDownEngine + + # Custom markdown payload for this test + instance = fake_markitdown.return_value + instance.convert.return_value.text_content = ( + "# Invoice 2024\n\nAmount: **$1,250.00**" + ) + + engine = MarkItDownEngine() + result = await engine.process("invoice.pdf", output_format=OutputFormat.MARKDOWN) + + assert isinstance(result, EngineResult) + assert result.engine_name == "markitdown" + assert result.format == OutputFormat.MARKDOWN + assert "Invoice 2024" in result.content + assert "$1,250.00" in result.content + assert result.processing_time_ms >= 0 + + # MarkItDown().convert("invoice.pdf") must have been called + instance.convert.assert_called_once() + call_args = instance.convert.call_args + assert call_args.args[0] == "invoice.pdf" + + @pytest.mark.asyncio + async def test_process_runs_convert_in_executor(self, 
fake_markitdown): + """The sync convert() call must be dispatched through run_in_executor + so it does not block the event loop.""" + import asyncio + + from docfold.engines.markitdown_engine import MarkItDownEngine + + engine = MarkItDownEngine() + + loop = asyncio.get_running_loop() + original_run_in_executor = loop.run_in_executor + call_count = {"n": 0} + + async def spy(*args, **kwargs): + call_count["n"] += 1 + return await original_run_in_executor(*args, **kwargs) + + with patch.object(loop, "run_in_executor", side_effect=spy): + await engine.process("some.pdf", output_format=OutputFormat.MARKDOWN) + + assert call_count["n"] >= 1, "convert() must be dispatched via run_in_executor" + + @pytest.mark.asyncio + async def test_process_text_format_returns_plain_markdown(self, fake_markitdown): + from docfold.engines.markitdown_engine import MarkItDownEngine + + instance = fake_markitdown.return_value + instance.convert.return_value.text_content = "# Title\n\nBody" + + result = await MarkItDownEngine().process("x.pdf", output_format=OutputFormat.TEXT) + + assert result.format == OutputFormat.TEXT + # TEXT format should pass the markdown string through unchanged. 
+ assert "Title" in result.content + assert "Body" in result.content + + @pytest.mark.asyncio + async def test_process_json_format_wraps_markdown(self, fake_markitdown): + import json as _json + + from docfold.engines.markitdown_engine import MarkItDownEngine + + instance = fake_markitdown.return_value + instance.convert.return_value.text_content = "# Doc\n\nHello" + + result = await MarkItDownEngine().process("x.pdf", output_format=OutputFormat.JSON) + + assert result.format == OutputFormat.JSON + parsed = _json.loads(result.content) + assert isinstance(parsed, dict) + assert "markdown" in parsed + assert "Doc" in parsed["markdown"] + + @pytest.mark.asyncio + async def test_process_html_format_wraps_markdown(self, fake_markitdown): + from docfold.engines.markitdown_engine import MarkItDownEngine + + instance = fake_markitdown.return_value + instance.convert.return_value.text_content = "# Doc" + + result = await MarkItDownEngine().process("x.pdf", output_format=OutputFormat.HTML) + + assert result.format == OutputFormat.HTML + # Markdown text must be preserved inside the HTML wrapper. 
+ assert "Doc" in result.content + assert result.content.strip().startswith("<") + + @pytest.mark.asyncio + async def test_process_missing_dependency_raises(self): + """When markitdown isn't installed, process() must raise a clear error.""" + from docfold.engines.markitdown_engine import MarkItDownEngine + + _remove_fake_markitdown() + with patch.dict(sys.modules, {"markitdown": None}): + engine = MarkItDownEngine() + with pytest.raises((RuntimeError, ImportError, ModuleNotFoundError)): + await engine.process("any.pdf", output_format=OutputFormat.MARKDOWN) + + @pytest.mark.asyncio + async def test_process_preserves_unicode(self, fake_markitdown): + """Non-ASCII text (Arabic, CJK, Hebrew) must pass through unchanged.""" + from docfold.engines.markitdown_engine import MarkItDownEngine + + payload = "تقرير سنوي 2024\n\n2024年度报告\n\nדוח שנתי 2024" + instance = fake_markitdown.return_value + instance.convert.return_value.text_content = payload + + result = await MarkItDownEngine().process("i18n.pdf", output_format=OutputFormat.MARKDOWN) + + assert "تقرير" in result.content + assert "年度报告" in result.content + assert "דוח" in result.content