diff --git a/CHANGELOG.md b/CHANGELOG.md
index 344b9a9..29a2b68 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [Unreleased]
+
+### Added
+
+- **OpenDataLoader PDF engine adapter** — wraps the Java-based [`opendataloader-pdf`](https://github.com/opendataloader-project/opendataloader-pdf) tool (via its bundled-JAR Python wheel). Local, deterministic extraction with typed structural elements (heading, paragraph, table, list, header, footer) and per-element bounding boxes. Install: `pip install docfold[opendataloader]` (also requires Java 11+).
+- **Multi-script benchmark coverage** — `benchmark.py` now generates Arabic (RTL + shaping), Hebrew (RTL, no shaping), and Simplified Chinese (CJK) synthetic PDFs alongside the existing English docs. Fonts are bundled under `tests/fixtures/fonts/` (OFL-1.1, subsetted where relevant) so the benchmark is reproducible without system font packages.
+
## [0.6.0] - 2026-02-20
### Added
diff --git a/README.md b/README.md
index 002f161..7c2d8f9 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,7 @@ Docfold is the open-source extraction engine from [Datatera.ai](https://datatera
| [**MinerU**](https://github.com/opendatalab/MinerU) | ✅ | Local | AGPL | ★★★ | ★★★ | ★★★ | — | — | Slow | Free |
| [**Marker**](https://www.datalab.to/) | ✅ | SaaS | Paid | ★★★ | ★★★ | ★★★ | ✅ | — | Fast | $$ |
| [**PyMuPDF**](https://pymupdf.readthedocs.io/) | ✅ | Local | AGPL | ★★★ | ☆☆☆ | ★☆☆ | — | — | Ultra | Free |
+| [**OpenDataLoader**](https://github.com/opendataloader-project/opendataloader-pdf) | ✅ | Local | Apache | ★★★ | ☆☆☆ | ★★☆ | ✅ | — | Fast | Free |
| [**PaddleOCR**](https://github.com/PaddlePaddle/PaddleOCR) | ✅ | Local | Apache | ★☆☆ | ★★★ | ★★☆ | — | ✅ | Medium | Free |
| [**Tesseract**](https://github.com/tesseract-ocr/tesseract) | ✅ | Local | Apache | ★☆☆ | ★★☆ | ★☆☆ | — | — | Medium | Free |
| [**EasyOCR**](https://github.com/JaidedAI/EasyOCR) | ✅ | Local | Apache | ★☆☆ | ★★★ | ☆☆☆ | — | ✅ | Medium | Free |
@@ -94,6 +95,7 @@ for name, res in results.items():
| [**MinerU**](https://github.com/opendatalab/MinerU) | Local | AGPL-3.0 | PDF | Recommended | `pip install docfold[mineru]` |
| [**Marker API**](https://www.datalab.to/) | SaaS | Paid | PDF, Office, images | N/A | `pip install docfold[marker]` |
| [**PyMuPDF**](https://pymupdf.readthedocs.io/) | Local | AGPL-3.0 | PDF | No | `pip install docfold[pymupdf]` |
+| [**OpenDataLoader**](https://github.com/opendataloader-project/opendataloader-pdf) | Local | Apache-2.0 | PDF | No (needs Java 11+) | `pip install docfold[opendataloader]` |
| [**PaddleOCR**](https://github.com/PaddlePaddle/PaddleOCR) | Local | Apache-2.0 | Images, scanned PDFs | Optional | `pip install docfold[paddleocr]` |
| [**Tesseract**](https://github.com/tesseract-ocr/tesseract) | Local | Apache-2.0 | Images, scanned PDFs | No | `pip install docfold[tesseract]` |
| [**EasyOCR**](https://github.com/JaidedAI/EasyOCR) | Local | Apache-2.0 | Images, scanned PDFs | Optional | `pip install docfold[easyocr]` |
diff --git a/benchmark.py b/benchmark.py
index c795e11..287d5a9 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -37,6 +37,90 @@ def create_text_pdf(path: str, pages: list[dict]) -> None:
doc.close()
+# Directory of the font fixtures bundled with the repository. It is resolved
+# relative to this file, so lookups do not depend on the working directory.
+_FIXTURE_FONT_DIR = os.path.join(
+ os.path.dirname(__file__), "tests", "fixtures", "fonts"
+)
+
+
+def _find_bundled_font(preferred: str, fallbacks: list[tuple[str, str]]) -> tuple[str, str] | None:
+ """Return ``(font_dir, ttf_name)`` for a font that exists on disk.
+
+ Prefers the bundled fixture under ``tests/fixtures/fonts/`` (shipped under
+ OFL-1.1) so the benchmark is reproducible on any host; falls back to
+ system paths only as a safety net.
+ """
+ if os.path.exists(os.path.join(_FIXTURE_FONT_DIR, preferred)):
+ return _FIXTURE_FONT_DIR, preferred
+ for d, fname in fallbacks:
+ if os.path.exists(os.path.join(d, fname)):
+ return d, fname
+ return None
+
+
+def _find_arabic_font() -> tuple[str, str] | None:
+ return _find_bundled_font(
+ "NotoNaskhArabic-Regular.ttf",
+ [
+ ("/usr/share/fonts/truetype/noto", "NotoNaskhArabic-Regular.ttf"),
+ ("/usr/share/fonts/noto", "NotoNaskhArabic-Regular.ttf"),
+ ("/usr/share/fonts/truetype/noto", "NotoSansArabic-Regular.ttf"),
+ ],
+ )
+
+
+def _find_script_font(preferred: str) -> tuple[str, str] | None:
+ """Bundled fonts for non-Arabic scripts — no system fallback because the
+ subsetted TTF is what we tested against."""
+ return _find_bundled_font(preferred, [])
+
+
+def _render_html_pdf(path: str, html_body: str, font_info: tuple[str, str]) -> None:
+ """Generic HTML → PDF renderer using PyMuPDF's ``insert_htmlbox`` with
+ a bundled font archive. Handles shaping / bidi via HarfBuzz under the hood.
+ """
+ import fitz
+
+ font_dir, ttf = font_info
+ doc = fitz.open()
+ page = doc.new_page(width=612, height=792)
+ archive = fitz.Archive(font_dir)
+ css = f"@font-face {{ font-family: 'BenchFont'; src: url({ttf}); }}"
+ page.insert_htmlbox(fitz.Rect(36, 36, 576, 756), html_body, css=css, archive=archive)
+ doc.save(path)
+ doc.close()
+
+
+def create_arabic_pdf(path: str, html_body: str) -> None:
+ """Render an Arabic HTML snippet to PDF using Noto Naskh Arabic."""
+ font_info = _find_arabic_font()
+ if font_info is None:
+ raise RuntimeError(
+ "Arabic font fixture missing: "
+ "tests/fixtures/fonts/NotoNaskhArabic-Regular.ttf"
+ )
+ _render_html_pdf(path, html_body, font_info)
+
+
+def create_script_pdf(path: str, html_body: str, font_ttf: str) -> None:
+ """Render an HTML snippet to PDF using a bundled script-specific font."""
+ font_info = _find_script_font(font_ttf)
+ if font_info is None:
+ raise RuntimeError(f"Font fixture missing: tests/fixtures/fonts/{font_ttf}")
+ _render_html_pdf(path, html_body, font_info)
+
+
+def _extract_ground_truth(pdf_path: str) -> str:
+ """Return PyMuPDF's extracted text — used as ground truth for docs whose
+ authoritative form depends on font shaping (e.g. Arabic).
+ """
+ import fitz
+
+ doc = fitz.open(pdf_path)
+ text = "\n".join(p.get_text() for p in doc)
+ doc.close()
+ return text
+
+
def generate_benchmark_documents(tmpdir: str) -> list[dict]:
"""Generate synthetic PDFs and return metadata with ground truth."""
documents = []
@@ -130,6 +214,78 @@ def generate_benchmark_documents(tmpdir: str) -> list[dict]:
"category": "report",
})
+ # --- Doc 5: Arabic (RTL + shaping) ---
+ # PDFs store Arabic in shaped presentation forms and reverse visual order.
+ # We use PyMuPDF's extraction of the generated PDF as ground truth — this
+ # measures whether *other* engines agree on the same text, not whether
+ # they normalize to logical Unicode (a harder task).
+ doc5_path = os.path.join(tmpdir, "arabic_report.pdf")
+ arabic_html = (
+ '
"
+ "
تقرير سنوي 2024
"
+ "
حققت الشركة نموا قياسيا هذا العام بإيرادات تجاوزت التوقعات.
"
+ "
بلغت نسبة رضا العملاء 94 بالمئة.
"
+ "
وصل معدل الاحتفاظ بالموظفين إلى 96 بالمئة.
"
+ "
"
+ )
+ create_arabic_pdf(doc5_path, arabic_html)
+ documents.append({
+ "name": "arabic_report",
+ "path": doc5_path,
+ "ground_truth": _extract_ground_truth(doc5_path),
+ "pages": 1,
+ "category": "rtl",
+ })
+
+ # --- Doc 6: Simplified Chinese (CJK) ---
+ # CJK has no shaping and LTR, but tests that engines don't mangle
+ # multi-byte Unicode. Font is subsetted (60 KB) from Noto Sans CJK SC.
+ doc6_path = os.path.join(tmpdir, "chinese_report.pdf")
+ chinese_html = (
+ '"
+ "
2024年度报告
"
+ "
公司今年实现了创纪录的增长,收入超出预期。
"
+ "
客户满意度达到了94%。
"
+ "
员工保留率达到96%,创公司历史新高。
"
+ "
"
+ )
+ create_script_pdf(doc6_path, chinese_html, "NotoSansCJKsc-Regular-subset.ttf")
+ documents.append({
+ "name": "chinese_report",
+ "path": doc6_path,
+ "ground_truth": _extract_ground_truth(doc6_path),
+ "pages": 1,
+ "category": "cjk",
+ })
+
+ # --- Doc 7: Hebrew (RTL, no shaping) ---
+ # Good contrast to Arabic: same RTL bidi, but no contextual shaping.
+ doc7_path = os.path.join(tmpdir, "hebrew_report.pdf")
+ hebrew_html = (
+ '"
+ "
דוח שנתי 2024
"
+ "
החברה השיגה צמיחה שיא השנה, עם הכנסות שעלו על הציפיות.
"
+ "
שביעות רצון הלקוחות הגיעה ל-94 אחוז.
"
+ "
שיעור שימור העובדים הגיע ל-96 אחוז.
"
+ "
"
+ )
+ create_script_pdf(doc7_path, hebrew_html, "NotoSansHebrew-Regular-subset.ttf")
+ documents.append({
+ "name": "hebrew_report",
+ "path": doc7_path,
+ "ground_truth": _extract_ground_truth(doc7_path),
+ "pages": 1,
+ "category": "rtl",
+ })
+
+ # NOTE: Devanagari and Thai are intentionally omitted. PyMuPDF's
+ # ``insert_htmlbox`` produces PDFs whose ToUnicode maps don't survive
+ # round-trip extraction for those scripts (null bytes, dropped matras).
+ # They need real-world fixture PDFs — see docs/tasks/ for a follow-up.
+
return documents
@@ -197,6 +353,7 @@ async def main():
from docfold.engines.marker_local_engine import MarkerLocalEngine
from docfold.engines.mineru_engine import MinerUEngine
from docfold.engines.nougat_engine import NougatEngine
+ from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
from docfold.engines.paddleocr_engine import PaddleOCREngine
from docfold.engines.pymupdf_engine import PyMuPDFEngine
from docfold.engines.surya_engine import SuryaEngine
@@ -210,6 +367,7 @@ async def main():
candidates = [
(PyMuPDFEngine(), "pip install pymupdf"),
(LiteParseEngine(ocr_enabled=False), "npm i -g @llamaindex/liteparse"),
+ (OpenDataLoaderEngine(), "pip install docfold[opendataloader] (needs Java 11+)"),
(MinerUEngine(), "pip install docfold[mineru]"),
(MarkerLocalEngine(), "pip install marker-pdf"),
(SuryaEngine(), "pip install surya-ocr"),
diff --git a/docs/benchmark_results.json b/docs/benchmark_results.json
index 9899727..631f884 100644
--- a/docs/benchmark_results.json
+++ b/docs/benchmark_results.json
@@ -1,14 +1,8 @@
{
- "benchmark_date": "2026-04-05 22:20:15",
+ "benchmark_date": "2026-04-17 10:17:34",
"engines": [
"pymupdf",
- "mineru",
- "marker_local",
- "surya",
- "docling",
- "paddleocr",
- "tesseract",
- "unstructured"
+ "opendataloader"
],
"documents": [
{
@@ -30,20 +24,35 @@
"name": "mixed_formatting",
"pages": 1,
"category": "report"
+ },
+ {
+ "name": "arabic_report",
+ "pages": 1,
+ "category": "rtl"
+ },
+ {
+ "name": "chinese_report",
+ "pages": 1,
+ "category": "cjk"
+ },
+ {
+ "name": "hebrew_report",
+ "pages": 1,
+ "category": "rtl"
}
],
"summary": {
"pymupdf": {
- "avg_time_ms": 3.5,
+ "avg_time_ms": 6.4,
"avg_cer": 0.0,
"avg_wer": 0.0,
- "avg_bbox_count": 6.2,
+ "avg_bbox_count": 5.3,
"errors": 0,
- "successes": 4,
+ "successes": 7,
"results": [
{
"doc": "simple_text",
- "time_ms": 6,
+ "time_ms": 11,
"cer": 0.0,
"wer": 0.0,
"bbox_count": 5,
@@ -52,7 +61,7 @@
},
{
"doc": "multi_page",
- "time_ms": 2,
+ "time_ms": 4,
"cer": 0.0,
"wer": 0.0,
"bbox_count": 4,
@@ -61,7 +70,7 @@
},
{
"doc": "dense_financial",
- "time_ms": 3,
+ "time_ms": 5,
"cer": 0.0,
"wer": 0.0,
"bbox_count": 10,
@@ -70,334 +79,112 @@
},
{
"doc": "mixed_formatting",
- "time_ms": 3,
+ "time_ms": 4,
"cer": 0.0,
"wer": 0.0,
"bbox_count": 6,
"content_length": 260,
"pages": 1
- }
- ]
- },
- "mineru": {
- "avg_time_ms": 18304.8,
- "avg_cer": 0.0118,
- "avg_wer": 0.0804,
- "avg_bbox_count": 0.0,
- "errors": 0,
- "successes": 4,
- "results": [
- {
- "doc": "simple_text",
- "time_ms": 55848,
- "cer": 0.0083,
- "wer": 0.0556,
- "bbox_count": 0,
- "content_length": 122,
- "pages": null
},
{
- "doc": "multi_page",
- "time_ms": 7511,
- "cer": 0.0,
- "wer": 0.0,
- "bbox_count": 0,
- "content_length": 300,
- "pages": null
- },
- {
- "doc": "dense_financial",
- "time_ms": 4845,
- "cer": 0.0235,
- "wer": 0.2121,
- "bbox_count": 0,
- "content_length": 305,
- "pages": null
- },
- {
- "doc": "mixed_formatting",
- "time_ms": 5015,
- "cer": 0.0154,
- "wer": 0.0541,
- "bbox_count": 0,
- "content_length": 264,
- "pages": null
- }
- ]
- },
- "marker_local": {
- "avg_time_ms": 39090.5,
- "avg_cer": 0.0191,
- "avg_wer": 0.0936,
- "avg_bbox_count": 0.0,
- "errors": 0,
- "successes": 4,
- "results": [
- {
- "doc": "simple_text",
- "time_ms": 13760,
- "cer": 0.0083,
- "wer": 0.0556,
- "bbox_count": 0,
- "content_length": 122,
- "pages": null
- },
- {
- "doc": "multi_page",
- "time_ms": 69190,
- "cer": 0.01,
- "wer": 0.0256,
- "bbox_count": 0,
- "content_length": 303,
- "pages": null
- },
- {
- "doc": "dense_financial",
- "time_ms": 37193,
- "cer": 0.0235,
- "wer": 0.2121,
- "bbox_count": 0,
- "content_length": 305,
- "pages": null
- },
- {
- "doc": "mixed_formatting",
- "time_ms": 36219,
- "cer": 0.0346,
- "wer": 0.0811,
- "bbox_count": 0,
- "content_length": 269,
- "pages": null
- }
- ]
- },
- "surya": {
- "avg_time_ms": 32959.2,
- "avg_cer": 0.0135,
- "avg_wer": 0.027,
- "avg_bbox_count": 0.0,
- "errors": 0,
- "successes": 4,
- "results": [
- {
- "doc": "simple_text",
- "time_ms": 7741,
+ "doc": "arabic_report",
+ "time_ms": 6,
"cer": 0.0,
"wer": 0.0,
- "bbox_count": 0,
- "content_length": 121,
+ "bbox_count": 4,
+ "content_length": 151,
"pages": 1
},
{
- "doc": "multi_page",
- "time_ms": 45426,
+ "doc": "chinese_report",
+ "time_ms": 4,
"cer": 0.0,
"wer": 0.0,
- "bbox_count": 0,
- "content_length": 300,
- "pages": 2
+ "bbox_count": 4,
+ "content_length": 63,
+ "pages": 1
},
{
- "doc": "dense_financial",
- "time_ms": 35792,
+ "doc": "hebrew_report",
+ "time_ms": 11,
"cer": 0.0,
"wer": 0.0,
- "bbox_count": 0,
- "content_length": 298,
- "pages": 1
- },
- {
- "doc": "mixed_formatting",
- "time_ms": 42878,
- "cer": 0.0538,
- "wer": 0.1081,
- "bbox_count": 0,
- "content_length": 274,
+ "bbox_count": 4,
+ "content_length": 143,
"pages": 1
}
]
},
- "docling": {
- "avg_time_ms": 2601.0,
- "avg_cer": 0.0173,
- "avg_wer": 0.0406,
- "avg_bbox_count": 0.0,
+ "opendataloader": {
+ "avg_time_ms": 796.6,
+ "avg_cer": 0.257,
+ "avg_wer": 0.3756,
+ "avg_bbox_count": 3.1,
"errors": 0,
- "successes": 4,
+ "successes": 7,
"results": [
{
"doc": "simple_text",
- "time_ms": 3738,
- "cer": 0.0248,
+ "time_ms": 1083,
+ "cer": 0.0165,
"wer": 0.0556,
- "bbox_count": 0,
- "content_length": 124,
- "pages": null
+ "bbox_count": 2,
+ "content_length": 123,
+ "pages": 1
},
{
"doc": "multi_page",
- "time_ms": 3170,
- "cer": 0.01,
- "wer": 0.0256,
- "bbox_count": 0,
- "content_length": 303,
- "pages": null
+ "time_ms": 756,
+ "cer": 0.0133,
+ "wer": 0.0513,
+ "bbox_count": 2,
+ "content_length": 304,
+ "pages": 2
},
{
"doc": "dense_financial",
- "time_ms": 1715,
+ "time_ms": 725,
"cer": 0.0,
"wer": 0.0,
- "bbox_count": 0,
+ "bbox_count": 1,
"content_length": 298,
- "pages": null
+ "pages": 1
},
{
"doc": "mixed_formatting",
- "time_ms": 1781,
- "cer": 0.0346,
+ "time_ms": 737,
+ "cer": 0.0308,
"wer": 0.0811,
- "bbox_count": 0,
- "content_length": 269,
- "pages": null
- }
- ]
- },
- "paddleocr": {
- "avg_time_ms": 1617.2,
- "avg_cer": 0.0018,
- "avg_wer": 0.0132,
- "avg_bbox_count": 0.0,
- "errors": 0,
- "successes": 4,
- "results": [
- {
- "doc": "simple_text",
- "time_ms": 1304,
- "cer": 0.0,
- "wer": 0.0,
- "bbox_count": 0,
- "content_length": 121,
- "pages": null
- },
- {
- "doc": "multi_page",
- "time_ms": 2226,
- "cer": 0.0033,
- "wer": 0.0256,
- "bbox_count": 0,
- "content_length": 299,
- "pages": null
- },
- {
- "doc": "dense_financial",
- "time_ms": 1391,
- "cer": 0.0,
- "wer": 0.0,
- "bbox_count": 0,
- "content_length": 298,
- "pages": null
- },
- {
- "doc": "mixed_formatting",
- "time_ms": 1548,
- "cer": 0.0038,
- "wer": 0.027,
- "bbox_count": 0,
- "content_length": 261,
- "pages": null
- }
- ]
- },
- "tesseract": {
- "avg_time_ms": 1190.0,
- "avg_cer": 0.0,
- "avg_wer": 0.0,
- "avg_bbox_count": 0.0,
- "errors": 0,
- "successes": 4,
- "results": [
- {
- "doc": "simple_text",
- "time_ms": 881,
- "cer": 0.0,
- "wer": 0.0,
- "bbox_count": 0,
- "content_length": 121,
- "pages": null
- },
- {
- "doc": "multi_page",
- "time_ms": 1718,
- "cer": 0.0,
- "wer": 0.0,
- "bbox_count": 0,
- "content_length": 300,
- "pages": null
- },
- {
- "doc": "dense_financial",
- "time_ms": 1106,
- "cer": 0.0,
- "wer": 0.0,
- "bbox_count": 0,
- "content_length": 298,
- "pages": null
- },
- {
- "doc": "mixed_formatting",
- "time_ms": 1055,
- "cer": 0.0,
- "wer": 0.0,
- "bbox_count": 0,
- "content_length": 260,
- "pages": null
- }
- ]
- },
- "unstructured": {
- "avg_time_ms": 597.0,
- "avg_cer": 0.0357,
- "avg_wer": 0.1353,
- "avg_bbox_count": 0.0,
- "errors": 0,
- "successes": 4,
- "results": [
- {
- "doc": "simple_text",
- "time_ms": 2213,
- "cer": 0.0661,
- "wer": 0.2222,
- "bbox_count": 0,
- "content_length": 129,
- "pages": null
+ "bbox_count": 5,
+ "content_length": 268,
+ "pages": 1
},
{
- "doc": "multi_page",
- "time_ms": 75,
- "cer": 0.0067,
- "wer": 0.0256,
- "bbox_count": 0,
- "content_length": 302,
- "pages": null
+ "doc": "arabic_report",
+ "time_ms": 766,
+ "cer": 0.8675,
+ "wer": 1.08,
+ "bbox_count": 4,
+ "content_length": 151,
+ "pages": 1
},
{
- "doc": "dense_financial",
- "time_ms": 55,
- "cer": 0.047,
- "wer": 0.2121,
- "bbox_count": 0,
- "content_length": 312,
- "pages": null
+ "doc": "chinese_report",
+ "time_ms": 721,
+ "cer": 0.0317,
+ "wer": 0.25,
+ "bbox_count": 4,
+ "content_length": 65,
+ "pages": 1
},
{
- "doc": "mixed_formatting",
- "time_ms": 45,
- "cer": 0.0231,
- "wer": 0.0811,
- "bbox_count": 0,
- "content_length": 266,
- "pages": null
+ "doc": "hebrew_report",
+ "time_ms": 788,
+ "cer": 0.8392,
+ "wer": 1.1111,
+ "bbox_count": 4,
+ "content_length": 149,
+ "pages": 1
}
]
}
diff --git a/docs/tasks/OPENDATALOADER_ENGINE.md b/docs/tasks/OPENDATALOADER_ENGINE.md
new file mode 100644
index 0000000..68a21e1
--- /dev/null
+++ b/docs/tasks/OPENDATALOADER_ENGINE.md
@@ -0,0 +1,95 @@
+---
+purpose: "Add OpenDataLoader PDF as a local, fast PDF structuring engine."
+status: "OPEN"
+priority: "P2"
+created: "2026-04-16"
+---
+
+# Feature: OpenDataLoader PDF Engine
+
+## Problem
+docfold already ships several local PDF engines (PyMuPDF, LiteParse, Docling, MinerU, …),
+but none of them are based on `opendataloader-pdf` — a popular, Apache-2.0 Java tool
+(16.8k stars on GitHub) exposed through a thin Python wrapper on PyPI
+(`opendataloader-pdf`).
+
+Why add it now:
+- Very fast deterministic layout + reading-order extraction on CPU (benchmarked
+ at 100+ pages/sec by upstream) — useful as a reliable, low-latency baseline
+ to compare against heavier OCR / ML engines.
+- Emits richly typed structural elements (`heading`, `paragraph`, `table`,
+ `list`, `header`, `footer`, …) with per-element bounding boxes and page
+ numbers — a good fit for the docfold `EngineResult` contract.
+- Users can opt into a hybrid AI mode later without changing the adapter.
+
+## Proposed Solution
+Implement a new engine adapter `OpenDataLoaderEngine` that wraps the
+`opendataloader-pdf` Python package. The engine will:
+
+1. Write output (`json` + requested format) to a temp directory via
+ `opendataloader_pdf.convert(...)`.
+2. Read back the produced files:
+ - For `MARKDOWN` — read the `.md` file.
+ - For `HTML` — read the `.html` file.
+ - For `JSON`/`TEXT` — re-serialize the parsed JSON document, or read the `.txt` file, respectively.
+3. Parse the JSON output to build a flat list of `BoundingBox` entries by
+ recursively walking the nested `kids` tree. Map upstream types
+ (`heading`, `paragraph`, `table`, `list`, `header`, `footer`, …) to
+ docfold's `BoundingBox.type` names (`SectionHeader`, `Text`, `Table`,
+ `List`, …).
+4. Keep PDF-point coordinates as-is — upstream emits `[x1, y1, x2, y2]` in
+ PDF points; we pass them through without any normalization (same as PyMuPDF).
+
+Capabilities advertised: `bounding_boxes=True`, `reading_order=True`,
+`heading_detection=True`, `table_structure=True`.
+
+## Affected Files
+- `src/docfold/engines/opendataloader_engine.py` — new adapter
+- `tests/engines/test_opendataloader_engine.py` — new tests (mocked `convert()` call)
+- `benchmark.py` — register the new engine alongside the existing ones
+- `pyproject.toml` — add `opendataloader = ["opendataloader-pdf>=2.2"]` extra,
+ include it in `[all]`
+- `README.md` / `CHANGELOG.md` — short mention (optional in this task)
+
+## Test Plan
+
+### Unit / Functional Tests
+- [x] `name` is `"opendataloader"`
+- [x] `supported_extensions` includes `"pdf"`
+- [x] `capabilities` advertises `bounding_boxes`, `reading_order`,
+ `heading_detection`, `table_structure`
+- [x] `is_available()` returns True when `opendataloader_pdf` imports and
+ `java` is on PATH; False otherwise
+- [x] `process()` returns an `EngineResult` with the correct `engine_name`,
+ `format`, non-empty `content`, populated `pages`, non-empty
+ `bounding_boxes`, and `processing_time_ms >= 0` — tested with a
+ mocked `convert()` that materializes a fake output directory
+- [x] JSON walker flattens nested `kids` into one `BoundingBox` per leaf
+ element, preserving page numbers and bbox coords
+- [x] `heading` → `SectionHeader`, `paragraph` → `Text`, `table` → `Table`,
+ `list` → `List` type mapping
+- [x] Errors from the underlying CLI surface as `RuntimeError`
+
+### Integration / E2E Tests
+- [x] `benchmark.py` discovers the engine when `opendataloader-pdf` and
+ Java are installed and reports CER/WER/time/bbox counts alongside the
+ other engines.
+
+### Test Commands
+```bash
+pytest tests/engines/test_opendataloader_engine.py -v
+pytest tests/ # full suite still green
+python benchmark.py # sanity check
+```
+
+## Edge Cases
+- `java` not installed → `is_available()` returns False, no crash.
+- Encrypted PDF without password → surface upstream error as `RuntimeError`.
+- Empty PDF / no elements → `bounding_boxes` is `None`, `content` is `""`.
+- Pages without any `kids` → still counted via `"number of pages"`.
+- Deeply nested `kids` (tables / lists) → recursion handles arbitrary depth.
+
+## Out of Scope
+- Hybrid AI mode (`hybrid=…`) — can be added later via kwargs.
+- Image extraction / annotated-PDF output.
+- Table structure parsing into `tables` list of dicts (bboxes only for now).
diff --git a/pyproject.toml b/pyproject.toml
index e4e0034..29af5f8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,6 +74,10 @@ llamaparse = [
liteparse = [
# No Python deps — requires Node.js 18+ and: npm i -g @llamaindex/liteparse
]
+opendataloader = [
+ # Requires Java 11+ on PATH; JAR is bundled by the Python wheel.
+ "opendataloader-pdf>=2.2",
+]
mistral-ocr = [
"mistralai>=1.0",
]
@@ -112,7 +116,7 @@ evaluation = [
"psutil>=5.9", # Memory measurement
]
all = [
- "docfold[docling,mineru,marker,pymupdf,paddleocr,tesseract,easyocr,unstructured,llamaparse,liteparse,mistral-ocr,textract,google-docai,azure-docint,nougat,chandra,surya,firecrawl,evaluation]",
+ "docfold[docling,mineru,marker,pymupdf,paddleocr,tesseract,easyocr,unstructured,llamaparse,liteparse,opendataloader,mistral-ocr,textract,google-docai,azure-docint,nougat,chandra,surya,firecrawl,evaluation]",
# Note: zerox excluded from [all] — py-zerox requires Python 3.11+
# Install separately: pip install docfold[zerox]
]
diff --git a/src/docfold/engines/opendataloader_engine.py b/src/docfold/engines/opendataloader_engine.py
new file mode 100644
index 0000000..fd5da97
--- /dev/null
+++ b/src/docfold/engines/opendataloader_engine.py
@@ -0,0 +1,272 @@
+"""OpenDataLoader PDF engine adapter.
+
+Wraps the `opendataloader-pdf <https://github.com/opendataloader-project/opendataloader-pdf>`_
+Java tool via its Python package (`opendataloader-pdf` on PyPI). Produces
+Markdown / HTML / JSON output with per-element bounding boxes and PDF page
+numbers, fully local, no API keys.
+
+Install: ``pip install docfold[opendataloader]`` (also requires Java 11+).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import os
+import shutil
+import tempfile
+import time
+from typing import Any
+
+from docfold.engines.base import (
+ BoundingBox,
+ DocumentEngine,
+ EngineCapabilities,
+ EngineResult,
+ OutputFormat,
+)
+
+logger = logging.getLogger(__name__)
+
+# File extensions (without the leading dot) that this engine accepts.
+_SUPPORTED_EXTENSIONS = {"pdf"}
+
+
+# Upstream block type -> docfold canonical type.
+# Keys are lower-case because ``_map_type`` lower-cases the upstream value
+# before the lookup; types missing from this table fall back to
+# ``str.capitalize()`` of the upstream name.
+_TYPE_MAP: dict[str, str] = {
+ "heading": "SectionHeader",
+ "title": "SectionHeader",
+ "paragraph": "Text",
+ "text": "Text",
+ "caption": "Caption",
+ "table": "Table",
+ "table-cell": "TableCell",
+ "list": "List",
+ "list-item": "ListItem",
+ "figure": "Image",
+ "image": "Image",
+ "header": "PageHeader",
+ "footer": "PageFooter",
+ "footnote": "Footnote",
+}
+
+
+def _convert(*args: Any, **kwargs: Any) -> None:
+ """Indirection so tests can monkey-patch the JAR call."""
+ from opendataloader_pdf import convert as _upstream_convert
+
+ _upstream_convert(*args, **kwargs)
+
+
+def _map_type(raw_type: str) -> str:
+ if not raw_type:
+ return "Text"
+ return _TYPE_MAP.get(raw_type.lower(), raw_type.capitalize())
+
+
+def _walk_kids(
+ nodes: list[dict[str, Any]],
+ bboxes: list[dict[str, Any]],
+ counter: dict[str, int],
+) -> None:
+ """Depth-first flatten nested ``kids`` into a list of :class:`BoundingBox` dicts.
+
+ Args:
+ nodes: Sibling nodes of the upstream JSON tree; non-dict entries are skipped.
+ bboxes: Output list, extended in place with ``BoundingBox(...).to_dict()`` results.
+ counter: Mutable running index kept under key ``"n"``; it is passed down the
+ recursion and feeds the ``p<page>-e<idx>`` ids.
+
+ NOTE(review): coordinates are copied through unmodified. In the sample payload of
+ tests/engines/test_opendataloader_engine.py the heading (listed first) has larger
+ y values than the paragraph that follows it, which suggests a bottom-left origin —
+ confirm that ``BoundingBox`` consumers expect that convention.
+ """
+ for node in nodes:
+ if not isinstance(node, dict):
+ continue
+
+ bbox_raw = node.get("bounding box")
+ page = node.get("page number")
+ # ``or ""`` / ``or []`` turn an explicit JSON null into an empty value.
+ text = (node.get("content") or "").strip()
+ children = node.get("kids") or []
+ node_type = _map_type(node.get("type", ""))
+
+ # Emit a bbox for any node that has enough geometry info. We prefer
+ # leaf nodes (no children) but also include parent containers that
+ # carry usable text of their own and geometry — they'll show as a
+ # single block instead of being lost.
+ # A missing/empty bbox or a falsy page number (None, 0) suppresses emission.
+ if bbox_raw and page and (not children or text):
+ try:
+ coords = [float(x) for x in bbox_raw]
+ except (TypeError, ValueError):
+ # Non-numeric or non-iterable geometry: drop the box instead of raising.
+ coords = None
+ if coords and len(coords) == 4:
+ idx = counter["n"]
+ counter["n"] += 1
+ bboxes.append(
+ BoundingBox(
+ type=node_type,
+ bbox=coords,
+ page=int(page),
+ text=text,
+ id=f"p{int(page)}-e{idx}",
+ ).to_dict()
+ )
+
+ if children:
+ _walk_kids(children, bboxes, counter)
+
+
+def _find_output_file(output_dir: str, suffixes: tuple[str, ...]) -> str | None:
+ for name in sorted(os.listdir(output_dir)):
+ for suffix in suffixes:
+ if name.endswith(suffix):
+ return os.path.join(output_dir, name)
+ return None
+
+
+class OpenDataLoaderEngine(DocumentEngine):
+ """Adapter for ``opendataloader-pdf`` (Java CLI via Python wrapper).
+
+ Every constructor option is keyword-only, is stored unchanged, and is
+ forwarded to the upstream ``convert`` call each time :meth:`process` runs.
+ Only ``pdf`` inputs are advertised as supported.
+ """
+
+ def __init__(
+ self,
+ *,
+ reading_order: str | None = None,
+ table_method: str | None = None,
+ include_header_footer: bool = False,
+ keep_line_breaks: bool = False,
+ use_struct_tree: bool = False,
+ password: str | None = None,
+ hybrid: str | None = None,
+ ) -> None:
+ # Options are only stored here; nothing is validated or executed until
+ # ``_run`` hands them to ``_convert``.
+ self._reading_order = reading_order
+ self._table_method = table_method
+ self._include_header_footer = include_header_footer
+ self._keep_line_breaks = keep_line_breaks
+ self._use_struct_tree = use_struct_tree
+ self._password = password
+ self._hybrid = hybrid
+
+ # ------------------------------------------------------------------
+ # Engine metadata
+ # ------------------------------------------------------------------
+
+ @property
+ def name(self) -> str:
+ return "opendataloader"
+
+ @property
+ def supported_extensions(self) -> set[str]:
+ # Returns the shared module-level set itself, not a copy.
+ return _SUPPORTED_EXTENSIONS
+
+ @property
+ def capabilities(self) -> EngineCapabilities:
+ return EngineCapabilities(
+ bounding_boxes=True,
+ reading_order=True,
+ heading_detection=True,
+ table_structure=True,
+ )
+
+ def is_available(self) -> bool:
+ """Return True only if ``java`` is on PATH and ``opendataloader_pdf`` imports."""
+ if shutil.which("java") is None:
+ return False
+ try:
+ import opendataloader_pdf # noqa: F401
+ return True
+ except ImportError:
+ return False
+
+ # ------------------------------------------------------------------
+ # Processing
+ # ------------------------------------------------------------------
+
+ async def process(
+ self,
+ file_path: str,
+ output_format: OutputFormat = OutputFormat.MARKDOWN,
+ **kwargs: Any,
+ ) -> EngineResult:
+ """Convert *file_path* and wrap the outcome in an :class:`EngineResult`.
+
+ The blocking conversion (``_run``) is executed on the event loop's
+ default executor. ``**kwargs`` is accepted but not read in this method.
+ An empty bounding-box list is reported as ``None``.
+
+ Raises:
+ RuntimeError: Propagated from ``_run``.
+ """
+ start = time.perf_counter()
+
+ loop = asyncio.get_running_loop()
+ content, page_count, bboxes = await loop.run_in_executor(
+ None, self._run, file_path, output_format
+ )
+
+ # Measured around the executor call, so it includes any wait for a worker.
+ elapsed_ms = int((time.perf_counter() - start) * 1000)
+
+ return EngineResult(
+ content=content,
+ format=output_format,
+ engine_name=self.name,
+ pages=page_count,
+ processing_time_ms=elapsed_ms,
+ bounding_boxes=bboxes or None,
+ metadata={
+ "reading_order": self._reading_order or "default",
+ "table_method": self._table_method or "default",
+ },
+ )
+
+ # ------------------------------------------------------------------
+ # Internal helpers
+ # ------------------------------------------------------------------
+
+ def _run(
+ self, file_path: str, output_format: OutputFormat,
+ ) -> tuple[str, int, list[dict[str, Any]]]:
+ """Run the converter synchronously and collect its outputs.
+
+ Returns ``(content, page_count, bounding_boxes)``.
+
+ Raises:
+ RuntimeError: If ``_convert`` raises anything, or if no ``.json``
+ file is found in the output directory.
+
+ NOTE(review): failures while reading or parsing the JSON file are
+ not wrapped and propagate with their original exception type.
+ """
+ formats = self._formats_for(output_format)
+
+ with tempfile.TemporaryDirectory() as out_dir:
+ try:
+ _convert(
+ file_path,
+ output_dir=out_dir,
+ format=formats,
+ password=self._password,
+ reading_order=self._reading_order,
+ table_method=self._table_method,
+ include_header_footer=self._include_header_footer,
+ keep_line_breaks=self._keep_line_breaks,
+ use_struct_tree=self._use_struct_tree,
+ hybrid=self._hybrid,
+ quiet=True,
+ )
+ except Exception as exc:
+ raise RuntimeError(f"opendataloader failed: {exc}") from exc
+
+ json_path = _find_output_file(out_dir, (".json",))
+ if not json_path:
+ raise RuntimeError("opendataloader produced no JSON output")
+
+ with open(json_path, encoding="utf-8") as f:
+ data = json.load(f)
+
+ # A missing or null page count is reported as 0.
+ page_count = int(data.get("number of pages", 0) or 0)
+ bboxes: list[dict[str, Any]] = []
+ _walk_kids(data.get("kids") or [], bboxes, {"n": 0})
+
+ content = self._read_primary(out_dir, output_format, data)
+
+ return content, page_count, bboxes
+
+ @staticmethod
+ def _formats_for(output_format: OutputFormat) -> list[str]:
+ """Build the list of output formats to request from the JAR."""
+ # Always include JSON — it carries the bounding-box tree we need.
+ formats = ["json"]
+ if output_format == OutputFormat.MARKDOWN:
+ formats.append("markdown")
+ elif output_format == OutputFormat.HTML:
+ formats.append("html")
+ elif output_format == OutputFormat.TEXT:
+ formats.append("text")
+ return formats
+
+ @staticmethod
+ def _read_primary(
+ out_dir: str, output_format: OutputFormat, json_data: dict[str, Any],
+ ) -> str:
+ """Return the content string for *output_format*.
+
+ JSON output is the already-parsed *json_data* re-serialized; every
+ other format is read from the first matching file in *out_dir*. A
+ missing file yields ``""`` rather than an error.
+ """
+ if output_format == OutputFormat.JSON:
+ return json.dumps(json_data, ensure_ascii=False)
+
+ suffix_map = {
+ OutputFormat.MARKDOWN: (".md",),
+ OutputFormat.HTML: (".html",),
+ OutputFormat.TEXT: (".txt",),
+ }
+ # Formats absent from the table are looked up as Markdown.
+ path = _find_output_file(out_dir, suffix_map.get(output_format, (".md",)))
+ if path is None:
+ return ""
+ with open(path, encoding="utf-8") as f:
+ return f.read()
diff --git a/tests/engines/test_opendataloader_engine.py b/tests/engines/test_opendataloader_engine.py
new file mode 100644
index 0000000..4a1d66d
--- /dev/null
+++ b/tests/engines/test_opendataloader_engine.py
@@ -0,0 +1,337 @@
+"""Tests for OpenDataLoader PDF engine adapter.
+
+These are unit tests that mock the underlying ``opendataloader_pdf.convert``
+call so they run without Java installed.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from unittest.mock import patch
+
+import pytest
+
+from docfold.engines.base import EngineResult, OutputFormat
+
+
+def _odl_json(pages: int = 1, kids: list[dict] | None = None) -> dict:
+    """Build a JSON payload matching the real OpenDataLoader output shape.
+
+    Args:
+        pages: Value stored under ``"number of pages"``.
+        kids: Top-level elements for the ``"kids"`` tree.  ``None`` selects a
+            default heading + paragraph pair; an explicit list -- including an
+            empty one -- is used as given.
+
+    Returns:
+        A newly built document dict.
+    """
+    # Compare against None rather than relying on truthiness so that a test
+    # can model a document without any elements by passing ``kids=[]``.
+    if kids is None:
+        kids = [
+            {
+                "type": "heading",
+                "id": 1,
+                "page number": 1,
+                "bounding box": [72.0, 688.85, 200.0, 705.0],
+                "heading level": 1,
+                "content": "Hello World",
+            },
+            {
+                "type": "paragraph",
+                "id": 2,
+                "page number": 1,
+                "bounding box": [72.0, 659.5, 400.0, 672.2],
+                "content": "Test document for OpenDataLoader",
+            },
+        ]
+    return {
+        "file name": "test.pdf",
+        "number of pages": pages,
+        "author": None,
+        "title": None,
+        "creation date": None,
+        "modification date": None,
+        "kids": kids,
+    }
+
+
+def _fake_convert_factory(
+    json_payload: dict,
+    markdown: str = "# Hello World\n\nTest document for OpenDataLoader",
+    html: str = "<h1>Hello World</h1>\n<p>Test document for OpenDataLoader</p>\n",
+    text: str = "Hello World\nTest document for OpenDataLoader",
+    stem: str = "test",
+):
+    """Produce a fake ``convert`` that writes output files into ``output_dir``.
+
+    Args:
+        json_payload: Object dumped to ``<stem>.json`` when "json" is requested.
+        markdown: Content of ``<stem>.md`` for any of the markdown variants.
+        html: Content of ``<stem>.html``.
+        text: Content of ``<stem>.txt``.
+        stem: Base name shared by all files that are written.
+
+    Returns:
+        A callable accepting ``(input_path, output_dir=None, format=None,
+        **kwargs)``.  Unknown format names are ignored.
+    """
+
+    def _write(output_dir, suffix, payload):
+        # Write as UTF-8 explicitly: the engine reads its output files with
+        # encoding="utf-8", so the platform default must not be used here.
+        with open(os.path.join(output_dir, f"{stem}{suffix}"), "w", encoding="utf-8") as f:
+            f.write(payload)
+
+    # ``format`` shadows the builtin on purpose: the engine passes it by keyword.
+    def fake_convert(input_path, output_dir=None, format=None, **kwargs):
+        assert output_dir is not None, "engine must pass an output_dir"
+        fmts = format if isinstance(format, list) else ([format] if format else [])
+        for fmt in fmts:
+            if fmt == "json":
+                with open(
+                    os.path.join(output_dir, f"{stem}.json"), "w", encoding="utf-8"
+                ) as f:
+                    json.dump(json_payload, f)
+            elif fmt in ("markdown", "markdown-with-html", "markdown-with-images"):
+                _write(output_dir, ".md", markdown)
+            elif fmt == "html":
+                _write(output_dir, ".html", html)
+            elif fmt == "text":
+                _write(output_dir, ".txt", text)
+
+    return fake_convert
+
+
+# ---------------------------------------------------------------------------
+# Metadata / capabilities
+# ---------------------------------------------------------------------------
+
+
+class TestOpenDataLoaderEngineMetadata:
+    """Name, extensions, capability flags, availability and stored options."""
+
+    @staticmethod
+    def _engine(**options):
+        """Build an engine, importing the adapter only when a test runs."""
+        from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
+
+        return OpenDataLoaderEngine(**options)
+
+    def test_name(self):
+        assert self._engine().name == "opendataloader"
+
+    def test_supported_extensions(self):
+        assert "pdf" in self._engine().supported_extensions
+
+    def test_capabilities(self):
+        caps = self._engine().capabilities
+        expected_flags = (
+            "bounding_boxes",
+            "reading_order",
+            "heading_detection",
+            "table_structure",
+        )
+        for flag in expected_flags:
+            assert getattr(caps, flag) is True, flag
+
+    def test_is_available_false_when_package_missing(self):
+        engine = self._engine()
+        with patch.dict("sys.modules", {"opendataloader_pdf": None}):
+            available = engine.is_available()
+        # NOTE(review): despite the test name, only the return type is checked.
+        # Whether ``False`` can be asserted depends on how is_available()
+        # detects the package -- confirm against the engine before tightening.
+        assert isinstance(available, bool)
+
+    def test_is_available_false_when_java_missing(self):
+        engine = self._engine()
+        with patch("shutil.which", return_value=None):
+            available = engine.is_available()
+        assert available is False
+
+    def test_config_stored(self):
+        engine = self._engine(
+            reading_order="xycut",
+            table_method="cluster",
+            include_header_footer=True,
+            password="secret",
+        )
+        assert engine._reading_order == "xycut"
+        assert engine._table_method == "cluster"
+        assert engine._include_header_footer is True
+        assert engine._password == "secret"
+
+
+# ---------------------------------------------------------------------------
+# process()
+# ---------------------------------------------------------------------------
+
+
+class TestOpenDataLoaderEngineProcess:
+    """process() behaviour with ``opendataloader_engine._convert`` replaced by a fake."""
+
+    @staticmethod
+    async def _process(convert, output_format=OutputFormat.MARKDOWN, **engine_options):
+        """Run ``process("test.pdf")`` on a fresh engine with ``_convert`` patched.
+
+        Args:
+            convert: Replacement for the module-level ``_convert`` callable.
+            output_format: Format requested from ``process``.
+            **engine_options: Keyword arguments for ``OpenDataLoaderEngine``.
+
+        Returns:
+            Whatever ``process`` returns.
+        """
+        # Imported lazily so the adapter module is only loaded when a test runs.
+        from docfold.engines import opendataloader_engine
+
+        engine = opendataloader_engine.OpenDataLoaderEngine(**engine_options)
+        with patch.object(opendataloader_engine, "_convert", convert):
+            return await engine.process("test.pdf", output_format=output_format)
+
+    @pytest.mark.asyncio
+    async def test_process_markdown_format(self):
+        result = await self._process(_fake_convert_factory(_odl_json(pages=1)))
+
+        assert isinstance(result, EngineResult)
+        assert result.engine_name == "opendataloader"
+        assert result.format == OutputFormat.MARKDOWN
+        assert "Hello World" in result.content
+        assert result.pages == 1
+        assert result.bounding_boxes is not None
+        assert len(result.bounding_boxes) == 2
+        assert result.processing_time_ms >= 0
+
+    @pytest.mark.asyncio
+    async def test_process_json_format(self):
+        result = await self._process(
+            _fake_convert_factory(_odl_json(pages=1)), OutputFormat.JSON
+        )
+
+        assert result.format == OutputFormat.JSON
+        # Content must be valid JSON
+        parsed = json.loads(result.content)
+        assert parsed["number of pages"] == 1
+
+    @pytest.mark.asyncio
+    async def test_process_html_format(self):
+        # Pass the markup explicitly so the assertion below checks real HTML
+        # content instead of an empty substring, which would always match.
+        fake_convert = _fake_convert_factory(
+            _odl_json(pages=1), html="<h1>Hello World</h1>"
+        )
+        result = await self._process(fake_convert, OutputFormat.HTML)
+
+        assert result.format == OutputFormat.HTML
+        assert "<h1>" in result.content.lower()
+
+    @pytest.mark.asyncio
+    async def test_type_mapping(self):
+        """heading/paragraph/table/list should map to canonical types."""
+        kids = [
+            {
+                "type": "heading",
+                "id": 1,
+                "page number": 1,
+                "bounding box": [0, 0, 100, 20],
+                "content": "H",
+            },
+            {
+                "type": "paragraph",
+                "id": 2,
+                "page number": 1,
+                "bounding box": [0, 30, 100, 50],
+                "content": "P",
+            },
+            {
+                "type": "table",
+                "id": 3,
+                "page number": 1,
+                "bounding box": [0, 60, 100, 200],
+                "content": "T",
+            },
+            {
+                "type": "list",
+                "id": 4,
+                "page number": 1,
+                "bounding box": [0, 210, 100, 300],
+                "content": "L",
+            },
+        ]
+        result = await self._process(
+            _fake_convert_factory(_odl_json(pages=1, kids=kids))
+        )
+
+        types = {b["type"] for b in result.bounding_boxes}
+        assert "SectionHeader" in types
+        assert "Text" in types
+        assert "Table" in types
+        assert "List" in types
+
+    @pytest.mark.asyncio
+    async def test_nested_kids_flattened(self):
+        """Nested kids (e.g. header container) must be flattened into bboxes."""
+        nested_kids = [
+            {
+                "type": "header",
+                "id": 10,
+                "page number": 1,
+                "bounding box": [0, 0, 500, 400],
+                "kids": [
+                    {
+                        "type": "heading",
+                        "id": 1,
+                        "page number": 1,
+                        "bounding box": [0, 0, 100, 20],
+                        "content": "Title",
+                    },
+                    {
+                        "type": "paragraph",
+                        "id": 2,
+                        "page number": 1,
+                        "bounding box": [0, 30, 400, 100],
+                        "content": "Body",
+                    },
+                ],
+            }
+        ]
+        result = await self._process(
+            _fake_convert_factory(_odl_json(pages=1, kids=nested_kids))
+        )
+
+        # At least the two leaves must be emitted as bounding boxes.
+        assert result.bounding_boxes is not None
+        texts = [b.get("text", "") for b in result.bounding_boxes]
+        assert "Title" in texts
+        assert "Body" in texts
+
+    @pytest.mark.asyncio
+    async def test_bbox_coordinates_preserved(self):
+        kids = [
+            {
+                "type": "paragraph",
+                "id": 1,
+                "page number": 2,
+                "bounding box": [12.5, 34.5, 200.0, 90.0],
+                "content": "Hi",
+            }
+        ]
+        result = await self._process(
+            _fake_convert_factory(_odl_json(pages=2, kids=kids))
+        )
+
+        assert result.bounding_boxes is not None
+        bbox = result.bounding_boxes[0]
+        assert bbox["bbox"] == [12.5, 34.5, 200.0, 90.0]
+        assert bbox["page"] == 2
+        assert bbox["text"] == "Hi"
+
+    @pytest.mark.asyncio
+    async def test_process_surfaces_errors_as_runtime_error(self):
+        def boom(*args, **kwargs):
+            raise RuntimeError("java exited 2")
+
+        # The inner message does not contain "opendataloader", so the match
+        # only succeeds when the engine wraps the failure in its own error.
+        with pytest.raises(RuntimeError, match="opendataloader"):
+            await self._process(boom)
+
+    @pytest.mark.asyncio
+    async def test_reading_order_option_passed(self):
+        """reading_order kwarg on the engine must reach the underlying convert call."""
+        captured: dict = {}
+
+        def capture(input_path, output_dir=None, format=None, **kwargs):
+            captured["kwargs"] = kwargs
+            # Still produce files so parsing succeeds
+            _fake_convert_factory(_odl_json(pages=1))(
+                input_path, output_dir=output_dir, format=format, **kwargs
+            )
+
+        await self._process(capture, reading_order="xycut", table_method="cluster")
+
+        assert captured["kwargs"].get("reading_order") == "xycut"
+        assert captured["kwargs"].get("table_method") == "cluster"
diff --git a/tests/fixtures/fonts/LICENSE.txt b/tests/fixtures/fonts/LICENSE.txt
new file mode 100644
index 0000000..dffd6b1
--- /dev/null
+++ b/tests/fixtures/fonts/LICENSE.txt
@@ -0,0 +1,20 @@
+Benchmark font fixtures — all bundled Noto fonts are licensed under the
+SIL Open Font License, Version 1.1 (OFL-1.1).
+
+Files
+-----
+- NotoNaskhArabic-Regular.ttf — full Noto Naskh Arabic Regular
+- NotoSansCJKsc-Regular-subset.ttf — Noto Sans CJK SC Regular, subsetted to
+ the glyphs used in the Chinese benchmark document only
+- NotoSansHebrew-Regular-subset.ttf — Noto Sans Hebrew Regular, subsetted to
+ the glyphs used in the Hebrew benchmark document only
+
+Sources
+-------
+- https://fonts.google.com/noto/specimen/Noto+Naskh+Arabic
+- https://fonts.google.com/noto/specimen/Noto+Sans+Simplified+Chinese
+- https://fonts.google.com/noto/specimen/Noto+Sans+Hebrew
+- OFL license text: https://scripts.sil.org/OFL
+
+Subsetting was done with fontTools (``fontTools.subset.Subsetter``) preserving
+all OpenType layout features so shaping / bidi still work.
diff --git a/tests/fixtures/fonts/NotoNaskhArabic-Regular.ttf b/tests/fixtures/fonts/NotoNaskhArabic-Regular.ttf
new file mode 100644
index 0000000..00a33b3
Binary files /dev/null and b/tests/fixtures/fonts/NotoNaskhArabic-Regular.ttf differ
diff --git a/tests/fixtures/fonts/NotoSansCJKsc-Regular-subset.ttf b/tests/fixtures/fonts/NotoSansCJKsc-Regular-subset.ttf
new file mode 100644
index 0000000..1cdd83d
Binary files /dev/null and b/tests/fixtures/fonts/NotoSansCJKsc-Regular-subset.ttf differ
diff --git a/tests/fixtures/fonts/NotoSansHebrew-Regular-subset.ttf b/tests/fixtures/fonts/NotoSansHebrew-Regular-subset.ttf
new file mode 100644
index 0000000..9330e17
Binary files /dev/null and b/tests/fixtures/fonts/NotoSansHebrew-Regular-subset.ttf differ