diff --git a/CHANGELOG.md b/CHANGELOG.md
index 344b9a9..29a2b68 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased]
+
+### Added
+
+- **OpenDataLoader PDF engine adapter** — wraps the Java-based [`opendataloader-pdf`](https://github.com/opendataloader-project/opendataloader-pdf) tool (via its bundled-JAR Python wheel). Local, deterministic extraction with typed structural elements (heading, paragraph, table, list, header, footer) and per-element bounding boxes. Install: `pip install docfold[opendataloader]` (also requires Java 11+).
+- **Multi-script benchmark coverage** — `benchmark.py` now generates Arabic (RTL + shaping), Hebrew (RTL, no shaping), and Simplified Chinese (CJK) synthetic PDFs alongside the existing English docs. Fonts are bundled under `tests/fixtures/fonts/` (OFL-1.1, subsetted where relevant) so the benchmark is reproducible without system font packages.
+
 ## [0.6.0] - 2026-02-20
 
 ### Added
diff --git a/README.md b/README.md
index 002f161..7c2d8f9 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,7 @@ Docfold is the open-source extraction engine from [Datatera.ai](https://datatera
 | [**MinerU**](https://github.com/opendatalab/MinerU) | ✅ | Local | AGPL | ★★★ | ★★★ | ★★★ | — | — | Slow | Free |
 | [**Marker**](https://www.datalab.to/) | ✅ | SaaS | Paid | ★★★ | ★★★ | ★★★ | ✅ | — | Fast | $$ |
 | [**PyMuPDF**](https://pymupdf.readthedocs.io/) | ✅ | Local | AGPL | ★★★ | ☆☆☆ | ★☆☆ | — | — | Ultra | Free |
+| [**OpenDataLoader**](https://github.com/opendataloader-project/opendataloader-pdf) | ✅ | Local | Apache | ★★★ | ☆☆☆ | ★★☆ | ✅ | — | Fast | Free |
 | [**PaddleOCR**](https://github.com/PaddlePaddle/PaddleOCR) | ✅ | Local | Apache | ★☆☆ | ★★★ | ★★☆ | — | ✅ | Medium | Free |
 | [**Tesseract**](https://github.com/tesseract-ocr/tesseract) | ✅ | Local | Apache | ★☆☆ | ★★☆ | ★☆☆ | — | — | Medium | Free |
 | [**EasyOCR**](https://github.com/JaidedAI/EasyOCR) | ✅ | Local | Apache | ★☆☆ | ★★★ | ☆☆☆ | — | ✅ | Medium | Free |
@@ -94,6 +95,7 @@ for name, res in results.items():
 | [**MinerU**](https://github.com/opendatalab/MinerU) | Local | AGPL-3.0 | PDF | Recommended | `pip install docfold[mineru]` |
 | [**Marker API**](https://www.datalab.to/) | SaaS | Paid | PDF, Office, images | N/A | `pip install docfold[marker]` |
 | [**PyMuPDF**](https://pymupdf.readthedocs.io/) | Local | AGPL-3.0 | PDF | No | `pip install docfold[pymupdf]` |
+| [**OpenDataLoader**](https://github.com/opendataloader-project/opendataloader-pdf) | Local | Apache-2.0 | PDF | No (needs Java 11+) | `pip install docfold[opendataloader]` |
 | [**PaddleOCR**](https://github.com/PaddlePaddle/PaddleOCR) | Local | Apache-2.0 | Images, scanned PDFs | Optional | `pip install docfold[paddleocr]` |
 | [**Tesseract**](https://github.com/tesseract-ocr/tesseract) | Local | Apache-2.0 | Images, scanned PDFs | No | `pip install docfold[tesseract]` |
 | [**EasyOCR**](https://github.com/JaidedAI/EasyOCR) | Local | Apache-2.0 | Images, scanned PDFs | Optional | `pip install docfold[easyocr]` |
diff --git a/benchmark.py b/benchmark.py
index c795e11..287d5a9 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -37,6 +37,90 @@ def create_text_pdf(path: str, pages: list[dict]) -> None:
     doc.close()
 
 
+_FIXTURE_FONT_DIR = os.path.join(  # <directory of this file>/tests/fixtures/fonts
+    os.path.dirname(__file__), "tests", "fixtures", "fonts"
+)
+
+
+def _find_bundled_font(preferred: str, fallbacks: list[tuple[str, str]]) -> tuple[str, str] | None:
+    """Locate a usable font and return it as ``(font_dir, ttf_name)``.
+
+    The bundled fixture ``preferred`` under ``tests/fixtures/fonts/`` is
+    tried first so runs are reproducible across hosts; each
+    ``(directory, filename)`` pair in ``fallbacks`` is then tried in order.
+    Returns ``None`` when none of the candidates exists on disk.
+    """
+    candidates = [(_FIXTURE_FONT_DIR, preferred), *fallbacks]
+    for font_dir, ttf_name in candidates:
+        if os.path.exists(os.path.join(font_dir, ttf_name)):
+            return font_dir, ttf_name
+    return None
+
+
+def _find_arabic_font() -> tuple[str, str] | None:
+    """Locate an Arabic font as ``(font_dir, ttf_name)``, or ``None``.
+
+    Delegates to ``_find_bundled_font``: Noto Naskh Arabic is the preferred
+    fixture, with three system Noto paths (Naskh twice, then Sans) as fallbacks.
+    """
+    naskh, sans = "NotoNaskhArabic-Regular.ttf", "NotoSansArabic-Regular.ttf"
+    tt_dir, alt_dir = "/usr/share/fonts/truetype/noto", "/usr/share/fonts/noto"
+    return _find_bundled_font(naskh, [(tt_dir, naskh), (alt_dir, naskh), (tt_dir, sans)])
+
+
+def _find_script_font(preferred: str) -> tuple[str, str] | None:
+    """Look up ``preferred`` among the bundled fixtures only (no system fallback,
+    since the subsetted TTF is the one the benchmark was tested against)."""
+    return _find_bundled_font(preferred, fallbacks=[])
+
+
+def _render_html_pdf(path: str, html_body: str, font_info: tuple[str, str]) -> None:
+    """Render ``html_body`` on one 612x792 pt page and save it to ``path``;
+    ``font_info`` is ``(font_dir, ttf_name)``, bound to CSS family 'BenchFont'.
+    """
+    import fitz
+
+    font_dir, ttf = font_info
+    archive = fitz.Archive(font_dir)
+    css = f"@font-face {{ font-family: 'BenchFont'; src: url({ttf}); }}"
+    with fitz.open() as doc:  # closes the document even if rendering fails
+        page = doc.new_page(width=612, height=792)
+        box = fitz.Rect(36, 36, 576, 756)
+        page.insert_htmlbox(box, html_body, css=css, archive=archive)
+        doc.save(path)
+
+
+def create_arabic_pdf(path: str, html_body: str) -> None:
+    """Write Arabic ``html_body`` to a PDF at ``path``; raise ``RuntimeError``
+    when ``_find_arabic_font`` cannot locate a usable font."""
+    if (arabic_font := _find_arabic_font()) is None:
+        raise RuntimeError(
+            "Arabic font fixture missing: "
+            "tests/fixtures/fonts/NotoNaskhArabic-Regular.ttf"
+        )
+    _render_html_pdf(path, html_body, arabic_font)
+
+
+def create_script_pdf(path: str, html_body: str, font_ttf: str) -> None:
+    """Write ``html_body`` to a PDF at ``path`` with the bundled font ``font_ttf``;
+    raise ``RuntimeError`` when that fixture is not on disk."""
+    if (script_font := _find_script_font(font_ttf)) is None:
+        raise RuntimeError(f"Font fixture missing: tests/fixtures/fonts/{font_ttf}")
+    _render_html_pdf(path, html_body, script_font)
+
+
+def _extract_ground_truth(pdf_path: str) -> str:
+    """Return PyMuPDF's extracted text — used as ground truth for docs whose
+    authoritative form depends on font shaping (e.g. Arabic).
+
+    Page texts are joined with newlines; the document is closed even on error.
+    """
+    import fitz
+
+    with fitz.open(pdf_path) as doc:
+        return "\n".join(page.get_text() for page in doc)
+
+
 def generate_benchmark_documents(tmpdir: str) -> list[dict]:
     """Generate synthetic PDFs and return metadata with ground truth."""
     documents = []
@@ -130,6 +214,78 @@ def generate_benchmark_documents(tmpdir: str) -> list[dict]:
         "category": "report",
     })
 
+    # --- Doc 5: Arabic (RTL + shaping) ---
+    # PDFs store Arabic in shaped presentation forms and reverse visual order.
+    # We use PyMuPDF's extraction of the generated PDF as ground truth — this
+    # measures whether *other* engines agree on the same text, not whether
+    # they normalize to logical Unicode (a harder task).
+    doc5_path = os.path.join(tmpdir, "arabic_report.pdf")
+    arabic_html = (
+        '<div lang="ar" dir="rtl" '
+        "style=\"font-family:'BenchFont';font-size:14pt;line-height:1.8;\">"
+        "<h1>تقرير سنوي 2024</h1>"
+        "<p>حققت الشركة نموا قياسيا هذا العام بإيرادات تجاوزت التوقعات.</p>"
+        "<p>بلغت نسبة رضا العملاء 94 بالمئة.</p>"
+        "<p>وصل معدل الاحتفاظ بالموظفين إلى 96 بالمئة.</p>"
+        "</div>"
+    )
+    create_arabic_pdf(doc5_path, arabic_html)
+    documents.append({
+        "name": "arabic_report",
+        "path": doc5_path,
+        "ground_truth": _extract_ground_truth(doc5_path),
+        "pages": 1,
+        "category": "rtl",
+    })
+
+    # --- Doc 6: Simplified Chinese (CJK) ---
+    # CJK has no shaping and LTR, but tests that engines don't mangle
+    # multi-byte Unicode. Font is subsetted (60 KB) from Noto Sans CJK SC.
+    doc6_path = os.path.join(tmpdir, "chinese_report.pdf")
+    chinese_html = (
+        '<div lang="zh" dir="ltr" '
+        "style=\"font-family:'BenchFont';font-size:14pt;line-height:1.8;\">"
+        "<h1>2024年度报告</h1>"
+        "<p>公司今年实现了创纪录的增长，收入超出预期。</p>"
+        "<p>客户满意度达到了94%。</p>"
+        "<p>员工保留率达到96%，创公司历史新高。</p>"
+        "</div>"
+    )
+    create_script_pdf(doc6_path, chinese_html, "NotoSansCJKsc-Regular-subset.ttf")
+    documents.append({
+        "name": "chinese_report",
+        "path": doc6_path,
+        "ground_truth": _extract_ground_truth(doc6_path),
+        "pages": 1,
+        "category": "cjk",
+    })
+
+    # --- Doc 7: Hebrew (RTL, no shaping) ---
+    # Good contrast to Arabic: same RTL bidi, but no contextual shaping.
+    doc7_path = os.path.join(tmpdir, "hebrew_report.pdf")
+    hebrew_html = (
+        '<div lang="he" dir="rtl" '
+        "style=\"font-family:'BenchFont';font-size:14pt;line-height:1.8;\">"
+        "<h1>דוח שנתי 2024</h1>"
+        "<p>החברה השיגה צמיחה שיא השנה, עם הכנסות שעלו על הציפיות.</p>"
+        "<p>שביעות רצון הלקוחות הגיעה ל-94 אחוז.</p>"
+        "<p>שיעור שימור העובדים הגיע ל-96 אחוז.</p>"
+        "</div>"
+    )
+    create_script_pdf(doc7_path, hebrew_html, "NotoSansHebrew-Regular-subset.ttf")
+    documents.append({
+        "name": "hebrew_report",
+        "path": doc7_path,
+        "ground_truth": _extract_ground_truth(doc7_path),
+        "pages": 1,
+        "category": "rtl",
+    })
+
+    # NOTE: Devanagari and Thai are intentionally omitted. PyMuPDF's
+    # ``insert_htmlbox`` produces PDFs whose ToUnicode maps don't survive
+    # round-trip extraction for those scripts (null bytes, dropped matras).
+    # They need real-world fixture PDFs — see docs/tasks/ for a follow-up.
+
     return documents
 
 
@@ -197,6 +353,7 @@ async def main():
     from docfold.engines.marker_local_engine import MarkerLocalEngine
     from docfold.engines.mineru_engine import MinerUEngine
     from docfold.engines.nougat_engine import NougatEngine
+    from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
     from docfold.engines.paddleocr_engine import PaddleOCREngine
     from docfold.engines.pymupdf_engine import PyMuPDFEngine
     from docfold.engines.surya_engine import SuryaEngine
@@ -210,6 +367,7 @@ async def main():
     candidates = [
         (PyMuPDFEngine(), "pip install pymupdf"),
         (LiteParseEngine(ocr_enabled=False), "npm i -g @llamaindex/liteparse"),
+        (OpenDataLoaderEngine(), "pip install docfold[opendataloader] (needs Java 11+)"),
         (MinerUEngine(), "pip install docfold[mineru]"),
         (MarkerLocalEngine(), "pip install marker-pdf"),
         (SuryaEngine(), "pip install surya-ocr"),
diff --git a/docs/benchmark_results.json b/docs/benchmark_results.json
index 9899727..631f884 100644
--- a/docs/benchmark_results.json
+++ b/docs/benchmark_results.json
@@ -1,14 +1,8 @@
 {
-  "benchmark_date": "2026-04-05 22:20:15",
+  "benchmark_date": "2026-04-17 10:17:34",
   "engines": [
     "pymupdf",
-    "mineru",
-    "marker_local",
-    "surya",
-    "docling",
-    "paddleocr",
-    "tesseract",
-    "unstructured"
+    "opendataloader"
   ],
   "documents": [
     {
@@ -30,20 +24,35 @@
       "name": "mixed_formatting",
       "pages": 1,
       "category": "report"
+    },
+    {
+      "name": "arabic_report",
+      "pages": 1,
+      "category": "rtl"
+    },
+    {
+      "name": "chinese_report",
+      "pages": 1,
+      "category": "cjk"
+    },
+    {
+      "name": "hebrew_report",
+      "pages": 1,
+      "category": "rtl"
     }
   ],
   "summary": {
     "pymupdf": {
-      "avg_time_ms": 3.5,
+      "avg_time_ms": 6.4,
       "avg_cer": 0.0,
       "avg_wer": 0.0,
-      "avg_bbox_count": 6.2,
+      "avg_bbox_count": 5.3,
       "errors": 0,
-      "successes": 4,
+      "successes": 7,
       "results": [
         {
           "doc": "simple_text",
-          "time_ms": 6,
+          "time_ms": 11,
           "cer": 0.0,
           "wer": 0.0,
           "bbox_count": 5,
@@ -52,7 +61,7 @@
         },
         {
           "doc": "multi_page",
-          "time_ms": 2,
+          "time_ms": 4,
           "cer": 0.0,
           "wer": 0.0,
           "bbox_count": 4,
@@ -61,7 +70,7 @@
         },
         {
           "doc": "dense_financial",
-          "time_ms": 3,
+          "time_ms": 5,
           "cer": 0.0,
           "wer": 0.0,
           "bbox_count": 10,
@@ -70,334 +79,112 @@
         },
         {
           "doc": "mixed_formatting",
-          "time_ms": 3,
+          "time_ms": 4,
           "cer": 0.0,
           "wer": 0.0,
           "bbox_count": 6,
           "content_length": 260,
           "pages": 1
-        }
-      ]
-    },
-    "mineru": {
-      "avg_time_ms": 18304.8,
-      "avg_cer": 0.0118,
-      "avg_wer": 0.0804,
-      "avg_bbox_count": 0.0,
-      "errors": 0,
-      "successes": 4,
-      "results": [
-        {
-          "doc": "simple_text",
-          "time_ms": 55848,
-          "cer": 0.0083,
-          "wer": 0.0556,
-          "bbox_count": 0,
-          "content_length": 122,
-          "pages": null
         },
         {
-          "doc": "multi_page",
-          "time_ms": 7511,
-          "cer": 0.0,
-          "wer": 0.0,
-          "bbox_count": 0,
-          "content_length": 300,
-          "pages": null
-        },
-        {
-          "doc": "dense_financial",
-          "time_ms": 4845,
-          "cer": 0.0235,
-          "wer": 0.2121,
-          "bbox_count": 0,
-          "content_length": 305,
-          "pages": null
-        },
-        {
-          "doc": "mixed_formatting",
-          "time_ms": 5015,
-          "cer": 0.0154,
-          "wer": 0.0541,
-          "bbox_count": 0,
-          "content_length": 264,
-          "pages": null
-        }
-      ]
-    },
-    "marker_local": {
-      "avg_time_ms": 39090.5,
-      "avg_cer": 0.0191,
-      "avg_wer": 0.0936,
-      "avg_bbox_count": 0.0,
-      "errors": 0,
-      "successes": 4,
-      "results": [
-        {
-          "doc": "simple_text",
-          "time_ms": 13760,
-          "cer": 0.0083,
-          "wer": 0.0556,
-          "bbox_count": 0,
-          "content_length": 122,
-          "pages": null
-        },
-        {
-          "doc": "multi_page",
-          "time_ms": 69190,
-          "cer": 0.01,
-          "wer": 0.0256,
-          "bbox_count": 0,
-          "content_length": 303,
-          "pages": null
-        },
-        {
-          "doc": "dense_financial",
-          "time_ms": 37193,
-          "cer": 0.0235,
-          "wer": 0.2121,
-          "bbox_count": 0,
-          "content_length": 305,
-          "pages": null
-        },
-        {
-          "doc": "mixed_formatting",
-          "time_ms": 36219,
-          "cer": 0.0346,
-          "wer": 0.0811,
-          "bbox_count": 0,
-          "content_length": 269,
-          "pages": null
-        }
-      ]
-    },
-    "surya": {
-      "avg_time_ms": 32959.2,
-      "avg_cer": 0.0135,
-      "avg_wer": 0.027,
-      "avg_bbox_count": 0.0,
-      "errors": 0,
-      "successes": 4,
-      "results": [
-        {
-          "doc": "simple_text",
-          "time_ms": 7741,
+          "doc": "arabic_report",
+          "time_ms": 6,
           "cer": 0.0,
           "wer": 0.0,
-          "bbox_count": 0,
-          "content_length": 121,
+          "bbox_count": 4,
+          "content_length": 151,
           "pages": 1
         },
         {
-          "doc": "multi_page",
-          "time_ms": 45426,
+          "doc": "chinese_report",
+          "time_ms": 4,
           "cer": 0.0,
           "wer": 0.0,
-          "bbox_count": 0,
-          "content_length": 300,
-          "pages": 2
+          "bbox_count": 4,
+          "content_length": 63,
+          "pages": 1
         },
         {
-          "doc": "dense_financial",
-          "time_ms": 35792,
+          "doc": "hebrew_report",
+          "time_ms": 11,
           "cer": 0.0,
           "wer": 0.0,
-          "bbox_count": 0,
-          "content_length": 298,
-          "pages": 1
-        },
-        {
-          "doc": "mixed_formatting",
-          "time_ms": 42878,
-          "cer": 0.0538,
-          "wer": 0.1081,
-          "bbox_count": 0,
-          "content_length": 274,
+          "bbox_count": 4,
+          "content_length": 143,
           "pages": 1
         }
       ]
     },
-    "docling": {
-      "avg_time_ms": 2601.0,
-      "avg_cer": 0.0173,
-      "avg_wer": 0.0406,
-      "avg_bbox_count": 0.0,
+    "opendataloader": {
+      "avg_time_ms": 796.6,
+      "avg_cer": 0.257,
+      "avg_wer": 0.3756,
+      "avg_bbox_count": 3.1,
       "errors": 0,
-      "successes": 4,
+      "successes": 7,
       "results": [
         {
           "doc": "simple_text",
-          "time_ms": 3738,
-          "cer": 0.0248,
+          "time_ms": 1083,
+          "cer": 0.0165,
           "wer": 0.0556,
-          "bbox_count": 0,
-          "content_length": 124,
-          "pages": null
+          "bbox_count": 2,
+          "content_length": 123,
+          "pages": 1
         },
         {
           "doc": "multi_page",
-          "time_ms": 3170,
-          "cer": 0.01,
-          "wer": 0.0256,
-          "bbox_count": 0,
-          "content_length": 303,
-          "pages": null
+          "time_ms": 756,
+          "cer": 0.0133,
+          "wer": 0.0513,
+          "bbox_count": 2,
+          "content_length": 304,
+          "pages": 2
         },
         {
           "doc": "dense_financial",
-          "time_ms": 1715,
+          "time_ms": 725,
           "cer": 0.0,
           "wer": 0.0,
-          "bbox_count": 0,
+          "bbox_count": 1,
           "content_length": 298,
-          "pages": null
+          "pages": 1
         },
         {
           "doc": "mixed_formatting",
-          "time_ms": 1781,
-          "cer": 0.0346,
+          "time_ms": 737,
+          "cer": 0.0308,
           "wer": 0.0811,
-          "bbox_count": 0,
-          "content_length": 269,
-          "pages": null
-        }
-      ]
-    },
-    "paddleocr": {
-      "avg_time_ms": 1617.2,
-      "avg_cer": 0.0018,
-      "avg_wer": 0.0132,
-      "avg_bbox_count": 0.0,
-      "errors": 0,
-      "successes": 4,
-      "results": [
-        {
-          "doc": "simple_text",
-          "time_ms": 1304,
-          "cer": 0.0,
-          "wer": 0.0,
-          "bbox_count": 0,
-          "content_length": 121,
-          "pages": null
-        },
-        {
-          "doc": "multi_page",
-          "time_ms": 2226,
-          "cer": 0.0033,
-          "wer": 0.0256,
-          "bbox_count": 0,
-          "content_length": 299,
-          "pages": null
-        },
-        {
-          "doc": "dense_financial",
-          "time_ms": 1391,
-          "cer": 0.0,
-          "wer": 0.0,
-          "bbox_count": 0,
-          "content_length": 298,
-          "pages": null
-        },
-        {
-          "doc": "mixed_formatting",
-          "time_ms": 1548,
-          "cer": 0.0038,
-          "wer": 0.027,
-          "bbox_count": 0,
-          "content_length": 261,
-          "pages": null
-        }
-      ]
-    },
-    "tesseract": {
-      "avg_time_ms": 1190.0,
-      "avg_cer": 0.0,
-      "avg_wer": 0.0,
-      "avg_bbox_count": 0.0,
-      "errors": 0,
-      "successes": 4,
-      "results": [
-        {
-          "doc": "simple_text",
-          "time_ms": 881,
-          "cer": 0.0,
-          "wer": 0.0,
-          "bbox_count": 0,
-          "content_length": 121,
-          "pages": null
-        },
-        {
-          "doc": "multi_page",
-          "time_ms": 1718,
-          "cer": 0.0,
-          "wer": 0.0,
-          "bbox_count": 0,
-          "content_length": 300,
-          "pages": null
-        },
-        {
-          "doc": "dense_financial",
-          "time_ms": 1106,
-          "cer": 0.0,
-          "wer": 0.0,
-          "bbox_count": 0,
-          "content_length": 298,
-          "pages": null
-        },
-        {
-          "doc": "mixed_formatting",
-          "time_ms": 1055,
-          "cer": 0.0,
-          "wer": 0.0,
-          "bbox_count": 0,
-          "content_length": 260,
-          "pages": null
-        }
-      ]
-    },
-    "unstructured": {
-      "avg_time_ms": 597.0,
-      "avg_cer": 0.0357,
-      "avg_wer": 0.1353,
-      "avg_bbox_count": 0.0,
-      "errors": 0,
-      "successes": 4,
-      "results": [
-        {
-          "doc": "simple_text",
-          "time_ms": 2213,
-          "cer": 0.0661,
-          "wer": 0.2222,
-          "bbox_count": 0,
-          "content_length": 129,
-          "pages": null
+          "bbox_count": 5,
+          "content_length": 268,
+          "pages": 1
         },
         {
-          "doc": "multi_page",
-          "time_ms": 75,
-          "cer": 0.0067,
-          "wer": 0.0256,
-          "bbox_count": 0,
-          "content_length": 302,
-          "pages": null
+          "doc": "arabic_report",
+          "time_ms": 766,
+          "cer": 0.8675,
+          "wer": 1.08,
+          "bbox_count": 4,
+          "content_length": 151,
+          "pages": 1
         },
         {
-          "doc": "dense_financial",
-          "time_ms": 55,
-          "cer": 0.047,
-          "wer": 0.2121,
-          "bbox_count": 0,
-          "content_length": 312,
-          "pages": null
+          "doc": "chinese_report",
+          "time_ms": 721,
+          "cer": 0.0317,
+          "wer": 0.25,
+          "bbox_count": 4,
+          "content_length": 65,
+          "pages": 1
         },
         {
-          "doc": "mixed_formatting",
-          "time_ms": 45,
-          "cer": 0.0231,
-          "wer": 0.0811,
-          "bbox_count": 0,
-          "content_length": 266,
-          "pages": null
+          "doc": "hebrew_report",
+          "time_ms": 788,
+          "cer": 0.8392,
+          "wer": 1.1111,
+          "bbox_count": 4,
+          "content_length": 149,
+          "pages": 1
         }
       ]
     }
diff --git a/docs/tasks/OPENDATALOADER_ENGINE.md b/docs/tasks/OPENDATALOADER_ENGINE.md
new file mode 100644
index 0000000..68a21e1
--- /dev/null
+++ b/docs/tasks/OPENDATALOADER_ENGINE.md
@@ -0,0 +1,95 @@
+---
+purpose: "Add OpenDataLoader PDF as a local, fast PDF structuring engine."
+status: "OPEN"
+priority: "P2"
+created: "2026-04-16"
+---
+
+# Feature: OpenDataLoader PDF Engine
+
+## Problem
+docfold already ships several local PDF engines (PyMuPDF, LiteParse, Docling, MinerU, …),
+but none of them are based on `opendataloader-pdf` — a popular, Apache-2.0 Java tool
+(16.8k stars on GitHub) exposed through a thin Python wrapper on PyPI
+(`opendataloader-pdf`).
+
+Why add it now:
+- Very fast deterministic layout + reading-order extraction on CPU (benchmarked
+  at 100+ pages/sec by upstream) — useful as a reliable, low-latency baseline
+  to compare against heavier OCR / ML engines.
+- Emits richly typed structural elements (`heading`, `paragraph`, `table`,
+  `list`, `header`, `footer`, …) with per-element bounding boxes and page
+  numbers — a good fit for the docfold `EngineResult` contract.
+- Users can opt into a hybrid AI mode later without changing the adapter.
+
+## Proposed Solution
+Implement a new engine adapter `OpenDataLoaderEngine` that wraps the
+`opendataloader-pdf` Python package. The engine will:
+
+1. Write output (`json` + requested format) to a temp directory via
+   `opendataloader_pdf.convert(...)`.
+2. Read back the produced files:
+   - For `MARKDOWN` — read the `.md` file (or `text` output).
+   - For `HTML` — read the `.html` file.
+   - For `JSON`/`TEXT` — use the JSON/text file directly.
+3. Parse the JSON output to build a flat list of `BoundingBox` entries by
+   recursively walking the nested `kids` tree. Map upstream types
+   (`heading`, `paragraph`, `table`, `list`, `header`, `footer`, …) to
+   docfold's `BoundingBox.type` names (`SectionHeader`, `Text`, `Table`,
+   `List`, …).
+4. Leave coordinates as-is — upstream emits `[x1, y1, x2, y2]` in PDF
+   points, and we pass them through unchanged (same as PyMuPDF).
+
+Capabilities advertised: `bounding_boxes=True`, `reading_order=True`,
+`heading_detection=True`, `table_structure=True`.
+
+## Affected Files
+- `src/docfold/engines/opendataloader_engine.py` — new adapter
+- `tests/engines/test_opendataloader_engine.py` — new tests (mocked `convert()`)
+- `benchmark.py` — register the new engine alongside the existing ones
+- `pyproject.toml` — add `opendataloader = ["opendataloader-pdf>=2.2"]` extra,
+  include it in `[all]`
+- `README.md` / `CHANGELOG.md` — short mention (optional in this task)
+
+## Test Plan
+
+### Unit / Functional Tests
+- [x] `name` is `"opendataloader"`
+- [x] `supported_extensions` includes `"pdf"`
+- [x] `capabilities` advertises `bounding_boxes`, `reading_order`,
+      `heading_detection`, `table_structure`
+- [x] `is_available()` returns True when `opendataloader_pdf` imports and
+      `java` is on PATH; False otherwise
+- [x] `process()` returns an `EngineResult` with the correct `engine_name`,
+      `format`, non-empty `content`, populated `pages`, non-empty
+      `bounding_boxes`, and `processing_time_ms >= 0` — tested with a
+      mocked `convert()` that materializes a fake output directory
+- [x] JSON walker flattens nested `kids` into one `BoundingBox` per leaf
+      element, preserving page numbers and bbox coords
+- [x] `heading` → `SectionHeader`, `paragraph` → `Text`, `table` → `Table`,
+      `list` → `List` type mapping
+- [x] Errors from the underlying CLI surface as `RuntimeError`
+
+### Integration / E2E Tests
+- [x] `benchmark.py` discovers the engine when `opendataloader-pdf` and
+      Java are installed and reports CER/WER/time/bbox counts alongside the
+      other engines.
+
+### Test Commands
+```bash
+pytest tests/engines/test_opendataloader_engine.py -v
+pytest tests/                      # full suite still green
+python benchmark.py                # sanity check
+```
+
+## Edge Cases
+- `java` not installed → `is_available()` returns False, no crash.
+- Encrypted PDF without password → surface upstream error as `RuntimeError`.
+- Empty PDF / no elements → `bounding_boxes` is `None`, `content` is `""`.
+- Pages without any `kids` → still counted via `"number of pages"`.
+- Deeply nested `kids` (tables / lists) → recursion handles arbitrary depth.
+
+## Out of Scope
+- Hybrid AI mode (`hybrid=…`) — can be added later via kwargs.
+- Image extraction / annotated-PDF output.
+- Table structure parsing into `tables` list of dicts (bboxes only for now).
diff --git a/pyproject.toml b/pyproject.toml
index e4e0034..29af5f8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,6 +74,10 @@ llamaparse = [
 liteparse = [
     # No Python deps — requires Node.js 18+ and: npm i -g @llamaindex/liteparse
 ]
+opendataloader = [
+    # Requires Java 11+ on PATH; JAR is bundled by the Python wheel.
+    "opendataloader-pdf>=2.2",
+]
 mistral-ocr = [
     "mistralai>=1.0",
 ]
@@ -112,7 +116,7 @@ evaluation = [
     "psutil>=5.9",         # Memory measurement
 ]
 all = [
-    "docfold[docling,mineru,marker,pymupdf,paddleocr,tesseract,easyocr,unstructured,llamaparse,liteparse,mistral-ocr,textract,google-docai,azure-docint,nougat,chandra,surya,firecrawl,evaluation]",
+    "docfold[docling,mineru,marker,pymupdf,paddleocr,tesseract,easyocr,unstructured,llamaparse,liteparse,opendataloader,mistral-ocr,textract,google-docai,azure-docint,nougat,chandra,surya,firecrawl,evaluation]",
     # Note: zerox excluded from [all] — py-zerox requires Python 3.11+
     # Install separately: pip install docfold[zerox]
 ]
diff --git a/src/docfold/engines/opendataloader_engine.py b/src/docfold/engines/opendataloader_engine.py
new file mode 100644
index 0000000..fd5da97
--- /dev/null
+++ b/src/docfold/engines/opendataloader_engine.py
@@ -0,0 +1,272 @@
+"""OpenDataLoader PDF engine adapter.
+
+Wraps the `opendataloader-pdf <https://github.com/opendataloader-project/opendataloader-pdf>`_
+Java tool via its Python package (`opendataloader-pdf` on PyPI).  Produces
+Markdown / HTML / plain-text / JSON output with per-element bounding boxes and
+PDF page numbers, fully local, no API keys.
+
+Install: ``pip install docfold[opendataloader]`` (also requires Java 11+).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import os
+import shutil
+import tempfile
+import time
+from typing import Any
+
+from docfold.engines.base import (
+    BoundingBox,
+    DocumentEngine,
+    EngineCapabilities,
+    EngineResult,
+    OutputFormat,
+)
+
+logger = logging.getLogger(__name__)
+
+_SUPPORTED_EXTENSIONS = {"pdf"}
+
+
+# Upstream block type -> docfold canonical type.
+_TYPE_MAP: dict[str, str] = {
+    "heading": "SectionHeader",
+    "title": "SectionHeader",
+    "paragraph": "Text",
+    "text": "Text",
+    "caption": "Caption",
+    "table": "Table",
+    "table-cell": "TableCell",
+    "list": "List",
+    "list-item": "ListItem",
+    "figure": "Image",
+    "image": "Image",
+    "header": "PageHeader",
+    "footer": "PageFooter",
+    "footnote": "Footnote",
+}
+
+
+def _convert(*args: Any, **kwargs: Any) -> None:
+    """Indirection so tests can monkey-patch the JAR call."""
+    from opendataloader_pdf import convert as _upstream_convert
+
+    _upstream_convert(*args, **kwargs)
+
+
+def _map_type(raw_type: str) -> str:
+    if not raw_type:
+        return "Text"
+    return _TYPE_MAP.get(raw_type.lower(), raw_type.capitalize())
+
+
+def _walk_kids(
+    nodes: list[dict[str, Any]],
+    bboxes: list[dict[str, Any]],
+    counter: dict[str, int],
+) -> None:
+    """Depth-first flatten nested ``kids`` into a list of :class:`BoundingBox` dicts."""
+    for node in nodes:
+        if not isinstance(node, dict):
+            continue
+
+        bbox_raw = node.get("bounding box")
+        page = node.get("page number")
+        text = (node.get("content") or "").strip()
+        children = node.get("kids") or []
+        node_type = _map_type(node.get("type", ""))
+
+        # Emit a bbox for any node that has enough geometry info.  We prefer
+        # leaf nodes (no children) but also include parent containers that
+        # carry usable text of their own and geometry — they'll show as a
+        # single block instead of being lost.
+        if bbox_raw and page and (not children or text):
+            try:
+                coords = [float(x) for x in bbox_raw]
+            except (TypeError, ValueError):
+                coords = None
+            if coords and len(coords) == 4:
+                idx = counter["n"]
+                counter["n"] += 1
+                bboxes.append(
+                    BoundingBox(
+                        type=node_type,
+                        bbox=coords,
+                        page=int(page),
+                        text=text,
+                        id=f"p{int(page)}-e{idx}",
+                    ).to_dict()
+                )
+
+        if children:
+            _walk_kids(children, bboxes, counter)
+
+
+def _find_output_file(output_dir: str, suffixes: tuple[str, ...]) -> str | None:
+    """Return the first sorted entry of *output_dir* with a given suffix, or None."""
+    for entry in sorted(os.listdir(output_dir)):
+        if entry.endswith(suffixes):
+            return os.path.join(output_dir, entry)
+    return None
+
+
+class OpenDataLoaderEngine(DocumentEngine):
+    """Adapter for ``opendataloader-pdf`` (Java CLI via Python wrapper)."""
+
+    def __init__(
+        self,
+        *,
+        reading_order: str | None = None,
+        table_method: str | None = None,
+        include_header_footer: bool = False,
+        keep_line_breaks: bool = False,
+        use_struct_tree: bool = False,
+        password: str | None = None,
+        hybrid: str | None = None,
+    ) -> None:
+        self._reading_order = reading_order
+        self._table_method = table_method
+        self._include_header_footer = include_header_footer
+        self._keep_line_breaks = keep_line_breaks
+        self._use_struct_tree = use_struct_tree
+        self._password = password
+        self._hybrid = hybrid
+
+    # ------------------------------------------------------------------
+    # Engine metadata
+    # ------------------------------------------------------------------
+
+    @property
+    def name(self) -> str:
+        return "opendataloader"
+
+    @property
+    def supported_extensions(self) -> set[str]:
+        return _SUPPORTED_EXTENSIONS
+
+    @property
+    def capabilities(self) -> EngineCapabilities:
+        return EngineCapabilities(
+            bounding_boxes=True,
+            reading_order=True,
+            heading_detection=True,
+            table_structure=True,
+        )
+
+    def is_available(self) -> bool:
+        if shutil.which("java") is None:
+            return False
+        try:
+            import opendataloader_pdf  # noqa: F401
+            return True
+        except ImportError:
+            return False
+
+    # ------------------------------------------------------------------
+    # Processing
+    # ------------------------------------------------------------------
+
+    async def process(
+        self,
+        file_path: str,
+        output_format: OutputFormat = OutputFormat.MARKDOWN,
+        **kwargs: Any,
+    ) -> EngineResult:
+        start = time.perf_counter()
+
+        loop = asyncio.get_running_loop()
+        content, page_count, bboxes = await loop.run_in_executor(
+            None, self._run, file_path, output_format
+        )
+
+        elapsed_ms = int((time.perf_counter() - start) * 1000)
+
+        return EngineResult(
+            content=content,
+            format=output_format,
+            engine_name=self.name,
+            pages=page_count,
+            processing_time_ms=elapsed_ms,
+            bounding_boxes=bboxes or None,
+            metadata={
+                "reading_order": self._reading_order or "default",
+                "table_method": self._table_method or "default",
+            },
+        )
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    def _run(
+        self, file_path: str, output_format: OutputFormat,
+    ) -> tuple[str, int, list[dict[str, Any]]]:
+        formats = self._formats_for(output_format)
+
+        with tempfile.TemporaryDirectory() as out_dir:
+            try:
+                _convert(
+                    file_path,
+                    output_dir=out_dir,
+                    format=formats,
+                    password=self._password,
+                    reading_order=self._reading_order,
+                    table_method=self._table_method,
+                    include_header_footer=self._include_header_footer,
+                    keep_line_breaks=self._keep_line_breaks,
+                    use_struct_tree=self._use_struct_tree,
+                    hybrid=self._hybrid,
+                    quiet=True,
+                )
+            except Exception as exc:
+                raise RuntimeError(f"opendataloader failed: {exc}") from exc
+
+            json_path = _find_output_file(out_dir, (".json",))
+            if not json_path:
+                raise RuntimeError("opendataloader produced no JSON output")
+
+            with open(json_path, encoding="utf-8") as f:
+                data = json.load(f)
+
+            page_count = int(data.get("number of pages", 0) or 0)
+            bboxes: list[dict[str, Any]] = []
+            _walk_kids(data.get("kids") or [], bboxes, {"n": 0})
+
+            content = self._read_primary(out_dir, output_format, data)
+
+        return content, page_count, bboxes
+
+    @staticmethod
+    def _formats_for(output_format: OutputFormat) -> list[str]:
+        """Build the list of output formats to request from the JAR."""
+        # Always include JSON — it carries the bounding-box tree we need.
+        formats = ["json"]
+        if output_format == OutputFormat.MARKDOWN:
+            formats.append("markdown")
+        elif output_format == OutputFormat.HTML:
+            formats.append("html")
+        elif output_format == OutputFormat.TEXT:
+            formats.append("text")
+        return formats
+
+    @staticmethod
+    def _read_primary(
+        out_dir: str, output_format: OutputFormat, json_data: dict[str, Any],
+    ) -> str:
+        if output_format == OutputFormat.JSON:
+            return json.dumps(json_data, ensure_ascii=False)
+
+        suffix_map = {
+            OutputFormat.MARKDOWN: (".md",),
+            OutputFormat.HTML: (".html",),
+            OutputFormat.TEXT: (".txt",),
+        }
+        path = _find_output_file(out_dir, suffix_map.get(output_format, (".md",)))
+        if path is None:
+            return ""
+        with open(path, encoding="utf-8") as f:
+            return f.read()
diff --git a/tests/engines/test_opendataloader_engine.py b/tests/engines/test_opendataloader_engine.py
new file mode 100644
index 0000000..4a1d66d
--- /dev/null
+++ b/tests/engines/test_opendataloader_engine.py
@@ -0,0 +1,337 @@
+"""Tests for OpenDataLoader PDF engine adapter.
+
+These are unit tests that mock the underlying ``opendataloader_pdf.convert``
+call so they run without Java installed.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from unittest.mock import patch
+
+import pytest
+
+from docfold.engines.base import EngineResult, OutputFormat
+
+
+def _odl_json(pages: int = 1, kids: list[dict] | None = None) -> dict:
+    """Build an OpenDataLoader-style payload; ``kids=None`` uses sample nodes."""
+    return {
+        "file name": "test.pdf",
+        "number of pages": pages,
+        "author": None,
+        "title": None,
+        "creation date": None,
+        "modification date": None,
+        "kids": kids if kids is not None else [
+            {
+                "type": "heading",
+                "id": 1,
+                "page number": 1,
+                "bounding box": [72.0, 688.85, 200.0, 705.0],
+                "heading level": 1,
+                "content": "Hello World",
+            },
+            {
+                "type": "paragraph",
+                "id": 2,
+                "page number": 1,
+                "bounding box": [72.0, 659.5, 400.0, 672.2],
+                "content": "Test document for OpenDataLoader",
+            },
+        ],
+    }
+
+
+def _fake_convert_factory(
+    json_payload: dict,
+    markdown: str = "# Hello World\n\nTest document for OpenDataLoader",
+    html: str = "<h1>Hello World</h1><p>Test document for OpenDataLoader</p>",
+    text: str = "Hello World\nTest document for OpenDataLoader",
+    stem: str = "test",
+):
+    """Produce a fake ``convert`` that writes output files into ``output_dir``."""
+
+    def fake_convert(input_path, output_dir=None, format=None, **kwargs):
+        assert output_dir is not None, "engine must pass an output_dir"
+        fmts = format if isinstance(format, list) else ([format] if format else [])
+        for fmt in fmts:
+            if fmt == "json":
+                with open(os.path.join(output_dir, f"{stem}.json"), "w") as f:
+                    json.dump(json_payload, f)
+            elif fmt in ("markdown", "markdown-with-html", "markdown-with-images"):
+                with open(os.path.join(output_dir, f"{stem}.md"), "w") as f:
+                    f.write(markdown)
+            elif fmt == "html":
+                with open(os.path.join(output_dir, f"{stem}.html"), "w") as f:
+                    f.write(html)
+            elif fmt == "text":
+                with open(os.path.join(output_dir, f"{stem}.txt"), "w") as f:
+                    f.write(text)
+
+    return fake_convert
+
+
+# ---------------------------------------------------------------------------
+# Metadata / capabilities
+# ---------------------------------------------------------------------------
+
+
+class TestOpenDataLoaderEngineMetadata:
+    def test_name(self):
+        from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
+
+        e = OpenDataLoaderEngine()
+        assert e.name == "opendataloader"
+
+    def test_supported_extensions(self):
+        from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
+
+        e = OpenDataLoaderEngine()
+        assert "pdf" in e.supported_extensions
+
+    def test_capabilities(self):
+        from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
+
+        e = OpenDataLoaderEngine()
+        caps = e.capabilities
+        assert caps.bounding_boxes is True
+        assert caps.reading_order is True
+        assert caps.heading_detection is True
+        assert caps.table_structure is True
+
+    def test_is_available_false_when_package_missing(self):
+        from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
+
+        with patch("shutil.which", return_value="/usr/bin/java"):  # Java "present"
+            with patch.dict("sys.modules", {"opendataloader_pdf": None}):
+                assert OpenDataLoaderEngine().is_available() is False
+
+    def test_is_available_false_when_java_missing(self):
+        from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
+
+        e = OpenDataLoaderEngine()
+        with patch("shutil.which", return_value=None):
+            assert e.is_available() is False
+
+    def test_config_stored(self):
+        from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
+
+        e = OpenDataLoaderEngine(
+            reading_order="xycut",
+            table_method="cluster",
+            include_header_footer=True,
+            password="secret",
+        )
+        assert e._reading_order == "xycut"
+        assert e._table_method == "cluster"
+        assert e._include_header_footer is True
+        assert e._password == "secret"
+
+
+# ---------------------------------------------------------------------------
+# process()
+# ---------------------------------------------------------------------------
+
+
+class TestOpenDataLoaderEngineProcess:
+    @pytest.mark.asyncio
+    async def test_process_markdown_format(self):
+        from docfold.engines import opendataloader_engine
+        from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
+
+        e = OpenDataLoaderEngine()
+        fake_convert = _fake_convert_factory(_odl_json(pages=1))
+
+        with patch.object(opendataloader_engine, "_convert", fake_convert):
+            result = await e.process("test.pdf", output_format=OutputFormat.MARKDOWN)
+
+        assert isinstance(result, EngineResult)
+        assert result.engine_name == "opendataloader"
+        assert result.format == OutputFormat.MARKDOWN
+        assert "Hello World" in result.content
+        assert result.pages == 1
+        assert result.bounding_boxes is not None
+        assert len(result.bounding_boxes) == 2
+        assert result.processing_time_ms >= 0
+
+    @pytest.mark.asyncio
+    async def test_process_json_format(self):
+        from docfold.engines import opendataloader_engine
+        from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
+
+        e = OpenDataLoaderEngine()
+        fake_convert = _fake_convert_factory(_odl_json(pages=1))
+
+        with patch.object(opendataloader_engine, "_convert", fake_convert):
+            result = await e.process("test.pdf", output_format=OutputFormat.JSON)
+
+        assert result.format == OutputFormat.JSON
+        # Content must be valid JSON
+        parsed = json.loads(result.content)
+        assert parsed["number of pages"] == 1
+
+    @pytest.mark.asyncio
+    async def test_process_html_format(self):
+        from docfold.engines import opendataloader_engine
+        from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
+
+        e = OpenDataLoaderEngine()
+        fake_convert = _fake_convert_factory(_odl_json(pages=1))
+
+        with patch.object(opendataloader_engine, "_convert", fake_convert):
+            result = await e.process("test.pdf", output_format=OutputFormat.HTML)
+
+        assert result.format == OutputFormat.HTML
+        assert "<h1>" in result.content.lower()
+
+    @pytest.mark.asyncio
+    async def test_type_mapping(self):
+        """heading/paragraph/table/list should map to canonical types."""
+        from docfold.engines import opendataloader_engine
+        from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
+
+        kids = [
+            {
+                "type": "heading",
+                "id": 1,
+                "page number": 1,
+                "bounding box": [0, 0, 100, 20],
+                "content": "H",
+            },
+            {
+                "type": "paragraph",
+                "id": 2,
+                "page number": 1,
+                "bounding box": [0, 30, 100, 50],
+                "content": "P",
+            },
+            {
+                "type": "table",
+                "id": 3,
+                "page number": 1,
+                "bounding box": [0, 60, 100, 200],
+                "content": "T",
+            },
+            {
+                "type": "list",
+                "id": 4,
+                "page number": 1,
+                "bounding box": [0, 210, 100, 300],
+                "content": "L",
+            },
+        ]
+        e = OpenDataLoaderEngine()
+        fake_convert = _fake_convert_factory(_odl_json(pages=1, kids=kids))
+        with patch.object(opendataloader_engine, "_convert", fake_convert):
+            result = await e.process("test.pdf", output_format=OutputFormat.MARKDOWN)
+
+        types = {b["type"] for b in result.bounding_boxes}
+        assert "SectionHeader" in types
+        assert "Text" in types
+        assert "Table" in types
+        assert "List" in types
+
+    @pytest.mark.asyncio
+    async def test_nested_kids_flattened(self):
+        """Nested kids (e.g. header container) must be flattened into bboxes."""
+        from docfold.engines import opendataloader_engine
+        from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
+
+        nested_kids = [
+            {
+                "type": "header",
+                "id": 10,
+                "page number": 1,
+                "bounding box": [0, 0, 500, 400],
+                "kids": [
+                    {
+                        "type": "heading",
+                        "id": 1,
+                        "page number": 1,
+                        "bounding box": [0, 0, 100, 20],
+                        "content": "Title",
+                    },
+                    {
+                        "type": "paragraph",
+                        "id": 2,
+                        "page number": 1,
+                        "bounding box": [0, 30, 400, 100],
+                        "content": "Body",
+                    },
+                ],
+            }
+        ]
+        e = OpenDataLoaderEngine()
+        fake_convert = _fake_convert_factory(_odl_json(pages=1, kids=nested_kids))
+
+        with patch.object(opendataloader_engine, "_convert", fake_convert):
+            result = await e.process("test.pdf", output_format=OutputFormat.MARKDOWN)
+
+        # At least the two leaves must be emitted as bounding boxes.
+        assert result.bounding_boxes is not None
+        texts = [b.get("text", "") for b in result.bounding_boxes]
+        assert "Title" in texts
+        assert "Body" in texts
+
+    @pytest.mark.asyncio
+    async def test_bbox_coordinates_preserved(self):
+        from docfold.engines import opendataloader_engine
+        from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
+
+        kids = [
+            {
+                "type": "paragraph",
+                "id": 1,
+                "page number": 2,
+                "bounding box": [12.5, 34.5, 200.0, 90.0],
+                "content": "Hi",
+            }
+        ]
+        e = OpenDataLoaderEngine()
+        fake_convert = _fake_convert_factory(_odl_json(pages=2, kids=kids))
+
+        with patch.object(opendataloader_engine, "_convert", fake_convert):
+            result = await e.process("test.pdf", output_format=OutputFormat.MARKDOWN)
+
+        assert result.bounding_boxes is not None
+        bbox = result.bounding_boxes[0]
+        assert bbox["bbox"] == [12.5, 34.5, 200.0, 90.0]
+        assert bbox["page"] == 2
+        assert bbox["text"] == "Hi"
+
+    @pytest.mark.asyncio
+    async def test_process_surfaces_errors_as_runtime_error(self):
+        from docfold.engines import opendataloader_engine
+        from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
+
+        def boom(*args, **kwargs):
+            raise RuntimeError("java exited 2")
+
+        e = OpenDataLoaderEngine()
+        with patch.object(opendataloader_engine, "_convert", boom):
+            with pytest.raises(RuntimeError, match="opendataloader"):
+                await e.process("test.pdf", output_format=OutputFormat.MARKDOWN)
+
+    @pytest.mark.asyncio
+    async def test_reading_order_option_passed(self):
+        """reading_order and table_method must reach the underlying convert call."""
+        from docfold.engines import opendataloader_engine
+        from docfold.engines.opendataloader_engine import OpenDataLoaderEngine
+
+        captured: dict = {}
+
+        def capture(input_path, output_dir=None, format=None, **kwargs):
+            captured["kwargs"] = kwargs
+            # Still produce files so parsing succeeds
+            _fake_convert_factory(_odl_json(pages=1))(
+                input_path, output_dir=output_dir, format=format, **kwargs
+            )
+
+        e = OpenDataLoaderEngine(reading_order="xycut", table_method="cluster")
+        with patch.object(opendataloader_engine, "_convert", capture):
+            await e.process("test.pdf", output_format=OutputFormat.MARKDOWN)
+
+        assert captured["kwargs"].get("reading_order") == "xycut"
+        assert captured["kwargs"].get("table_method") == "cluster"
diff --git a/tests/fixtures/fonts/LICENSE.txt b/tests/fixtures/fonts/LICENSE.txt
new file mode 100644
index 0000000..dffd6b1
--- /dev/null
+++ b/tests/fixtures/fonts/LICENSE.txt
@@ -0,0 +1,20 @@
+Benchmark font fixtures — all bundled Noto fonts are licensed under the
+SIL Open Font License, Version 1.1 (OFL-1.1).
+
+Files
+-----
+- NotoNaskhArabic-Regular.ttf — full Noto Naskh Arabic Regular
+- NotoSansCJKsc-Regular-subset.ttf — Noto Sans CJK SC Regular, subsetted to
+  the glyphs used in the Chinese benchmark document only
+- NotoSansHebrew-Regular-subset.ttf — Noto Sans Hebrew Regular, subsetted to
+  the glyphs used in the Hebrew benchmark document only
+
+Sources
+-------
+- https://fonts.google.com/noto/specimen/Noto+Naskh+Arabic
+- https://fonts.google.com/noto/specimen/Noto+Sans+Simplified+Chinese
+- https://fonts.google.com/noto/specimen/Noto+Sans+Hebrew
+- OFL license text: https://scripts.sil.org/OFL
+
+Subsetting was done with fontTools (``fontTools.subset.Subsetter``) preserving
+all OpenType layout features so shaping / bidi still work.
diff --git a/tests/fixtures/fonts/NotoNaskhArabic-Regular.ttf b/tests/fixtures/fonts/NotoNaskhArabic-Regular.ttf
new file mode 100644
index 0000000..00a33b3
Binary files /dev/null and b/tests/fixtures/fonts/NotoNaskhArabic-Regular.ttf differ
diff --git a/tests/fixtures/fonts/NotoSansCJKsc-Regular-subset.ttf b/tests/fixtures/fonts/NotoSansCJKsc-Regular-subset.ttf
new file mode 100644
index 0000000..1cdd83d
Binary files /dev/null and b/tests/fixtures/fonts/NotoSansCJKsc-Regular-subset.ttf differ
diff --git a/tests/fixtures/fonts/NotoSansHebrew-Regular-subset.ttf b/tests/fixtures/fonts/NotoSansHebrew-Regular-subset.ttf
new file mode 100644
index 0000000..9330e17
Binary files /dev/null and b/tests/fixtures/fonts/NotoSansHebrew-Regular-subset.ttf differ