diff --git a/CHANGELOG.md b/CHANGELOG.md index 344b9a9..29a2b68 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Added + +- **OpenDataLoader PDF engine adapter** — wraps the Java-based [`opendataloader-pdf`](https://github.com/opendataloader-project/opendataloader-pdf) tool (via its bundled-JAR Python wheel). Local, deterministic extraction with typed structural elements (heading, paragraph, table, list, header, footer) and per-element bounding boxes. Install: `pip install docfold[opendataloader]` (also requires Java 11+). +- **Multi-script benchmark coverage** — `benchmark.py` now generates Arabic (RTL + shaping), Hebrew (RTL, no shaping), and Simplified Chinese (CJK) synthetic PDFs alongside the existing English docs. Fonts are bundled under `tests/fixtures/fonts/` (OFL-1.1, subsetted where relevant) so the benchmark is reproducible without system font packages. 
+ ## [0.6.0] - 2026-02-20 ### Added diff --git a/README.md b/README.md index 002f161..7c2d8f9 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ Docfold is the open-source extraction engine from [Datatera.ai](https://datatera | [**MinerU**](https://github.com/opendatalab/MinerU) | ✅ | Local | AGPL | ★★★ | ★★★ | ★★★ | — | — | Slow | Free | | [**Marker**](https://www.datalab.to/) | ✅ | SaaS | Paid | ★★★ | ★★★ | ★★★ | ✅ | — | Fast | $$ | | [**PyMuPDF**](https://pymupdf.readthedocs.io/) | ✅ | Local | AGPL | ★★★ | ☆☆☆ | ★☆☆ | — | — | Ultra | Free | +| [**OpenDataLoader**](https://github.com/opendataloader-project/opendataloader-pdf) | ✅ | Local | Apache | ★★★ | ☆☆☆ | ★★☆ | ✅ | — | Fast | Free | | [**PaddleOCR**](https://github.com/PaddlePaddle/PaddleOCR) | ✅ | Local | Apache | ★☆☆ | ★★★ | ★★☆ | — | ✅ | Medium | Free | | [**Tesseract**](https://github.com/tesseract-ocr/tesseract) | ✅ | Local | Apache | ★☆☆ | ★★☆ | ★☆☆ | — | — | Medium | Free | | [**EasyOCR**](https://github.com/JaidedAI/EasyOCR) | ✅ | Local | Apache | ★☆☆ | ★★★ | ☆☆☆ | — | ✅ | Medium | Free | @@ -94,6 +95,7 @@ for name, res in results.items(): | [**MinerU**](https://github.com/opendatalab/MinerU) | Local | AGPL-3.0 | PDF | Recommended | `pip install docfold[mineru]` | | [**Marker API**](https://www.datalab.to/) | SaaS | Paid | PDF, Office, images | N/A | `pip install docfold[marker]` | | [**PyMuPDF**](https://pymupdf.readthedocs.io/) | Local | AGPL-3.0 | PDF | No | `pip install docfold[pymupdf]` | +| [**OpenDataLoader**](https://github.com/opendataloader-project/opendataloader-pdf) | Local | Apache-2.0 | PDF | No (needs Java 11+) | `pip install docfold[opendataloader]` | | [**PaddleOCR**](https://github.com/PaddlePaddle/PaddleOCR) | Local | Apache-2.0 | Images, scanned PDFs | Optional | `pip install docfold[paddleocr]` | | [**Tesseract**](https://github.com/tesseract-ocr/tesseract) | Local | Apache-2.0 | Images, scanned PDFs | No | `pip install docfold[tesseract]` | | 
[**EasyOCR**](https://github.com/JaidedAI/EasyOCR) | Local | Apache-2.0 | Images, scanned PDFs | Optional | `pip install docfold[easyocr]` | diff --git a/benchmark.py b/benchmark.py index c795e11..287d5a9 100644 --- a/benchmark.py +++ b/benchmark.py @@ -37,6 +37,90 @@ def create_text_pdf(path: str, pages: list[dict]) -> None: doc.close() +_FIXTURE_FONT_DIR = os.path.join( + os.path.dirname(__file__), "tests", "fixtures", "fonts" +) + + +def _find_bundled_font(preferred: str, fallbacks: list[tuple[str, str]]) -> tuple[str, str] | None: + """Return ``(font_dir, ttf_name)`` for a font that exists on disk. + + Prefers the bundled fixture under ``tests/fixtures/fonts/`` (shipped under + OFL-1.1) so the benchmark is reproducible on any host; falls back to + system paths only as a safety net. + """ + if os.path.exists(os.path.join(_FIXTURE_FONT_DIR, preferred)): + return _FIXTURE_FONT_DIR, preferred + for d, fname in fallbacks: + if os.path.exists(os.path.join(d, fname)): + return d, fname + return None + + +def _find_arabic_font() -> tuple[str, str] | None: + return _find_bundled_font( + "NotoNaskhArabic-Regular.ttf", + [ + ("/usr/share/fonts/truetype/noto", "NotoNaskhArabic-Regular.ttf"), + ("/usr/share/fonts/noto", "NotoNaskhArabic-Regular.ttf"), + ("/usr/share/fonts/truetype/noto", "NotoSansArabic-Regular.ttf"), + ], + ) + + +def _find_script_font(preferred: str) -> tuple[str, str] | None: + """Bundled fonts for non-Arabic scripts — no system fallback because the + subsetted TTF is what we tested against.""" + return _find_bundled_font(preferred, []) + + +def _render_html_pdf(path: str, html_body: str, font_info: tuple[str, str]) -> None: + """Generic HTML → PDF renderer using PyMuPDF's ``insert_htmlbox`` with + a bundled font archive. Handles shaping / bidi via HarfBuzz under the hood. 
+    """
+    import fitz
+
+    font_dir, ttf = font_info
+    archive = fitz.Archive(font_dir)
+    css = f"@font-face {{ font-family: 'BenchFont'; src: url({ttf}); }}"
+    # Context manager closes the document even if rendering or saving raises.
+    with fitz.open() as doc:
+        page = doc.new_page(width=612, height=792)
+        page.insert_htmlbox(fitz.Rect(36, 36, 576, 756), html_body, css=css, archive=archive)
+        doc.save(path)
+
+
+def create_arabic_pdf(path: str, html_body: str) -> None:
+    """Render an Arabic HTML snippet to PDF using Noto Naskh Arabic."""
+    font_info = _find_arabic_font()
+    if font_info is None:
+        raise RuntimeError(
+            "Arabic font fixture missing: "
+            "tests/fixtures/fonts/NotoNaskhArabic-Regular.ttf"
+        )
+    _render_html_pdf(path, html_body, font_info)
+
+
+def create_script_pdf(path: str, html_body: str, font_ttf: str) -> None:
+    """Render an HTML snippet to PDF using a bundled script-specific font."""
+    font_info = _find_script_font(font_ttf)
+    if font_info is None:
+        raise RuntimeError(f"Font fixture missing: tests/fixtures/fonts/{font_ttf}")
+    _render_html_pdf(path, html_body, font_info)
+
+
+def _extract_ground_truth(pdf_path: str) -> str:
+    """Return PyMuPDF's extracted text — used as ground truth for docs whose
+    authoritative form depends on font shaping (e.g. Arabic).
+    """
+    import fitz
+
+    # Context manager closes the document even if text extraction raises.
+    with fitz.open(pdf_path) as doc:
+        page_texts = [page.get_text() for page in doc]
+    return "\n".join(page_texts)
+
+
 def generate_benchmark_documents(tmpdir: str) -> list[dict]:
     """Generate synthetic PDFs and return metadata with ground truth."""
     documents = []
@@ -130,6 +214,78 @@ def generate_benchmark_documents(tmpdir: str) -> list[dict]:
         "category": "report",
     })

+    # --- Doc 5: Arabic (RTL + shaping) ---
+    # PDFs store Arabic in shaped presentation forms and reverse visual order.
+    # We use PyMuPDF's extraction of the generated PDF as ground truth — this
+    # measures whether *other* engines agree on the same text, not whether
+    # they normalize to logical Unicode (a harder task).
+ doc5_path = os.path.join(tmpdir, "arabic_report.pdf") + arabic_html = ( + '
" + "

تقرير سنوي 2024

" + "

حققت الشركة نموا قياسيا هذا العام بإيرادات تجاوزت التوقعات.

" + "

بلغت نسبة رضا العملاء 94 بالمئة.

" + "

وصل معدل الاحتفاظ بالموظفين إلى 96 بالمئة.

" + "
" + ) + create_arabic_pdf(doc5_path, arabic_html) + documents.append({ + "name": "arabic_report", + "path": doc5_path, + "ground_truth": _extract_ground_truth(doc5_path), + "pages": 1, + "category": "rtl", + }) + + # --- Doc 6: Simplified Chinese (CJK) --- + # CJK has no shaping and LTR, but tests that engines don't mangle + # multi-byte Unicode. Font is subsetted (60 KB) from Noto Sans CJK SC. + doc6_path = os.path.join(tmpdir, "chinese_report.pdf") + chinese_html = ( + '
" + "

2024年度报告

" + "

公司今年实现了创纪录的增长,收入超出预期。

" + "

客户满意度达到了94%。

" + "

员工保留率达到96%,创公司历史新高。

" + "
" + ) + create_script_pdf(doc6_path, chinese_html, "NotoSansCJKsc-Regular-subset.ttf") + documents.append({ + "name": "chinese_report", + "path": doc6_path, + "ground_truth": _extract_ground_truth(doc6_path), + "pages": 1, + "category": "cjk", + }) + + # --- Doc 7: Hebrew (RTL, no shaping) --- + # Good contrast to Arabic: same RTL bidi, but no contextual shaping. + doc7_path = os.path.join(tmpdir, "hebrew_report.pdf") + hebrew_html = ( + '
" + "

דוח שנתי 2024

" + "

החברה השיגה צמיחה שיא השנה, עם הכנסות שעלו על הציפיות.

" + "

שביעות רצון הלקוחות הגיעה ל-94 אחוז.

" + "

שיעור שימור העובדים הגיע ל-96 אחוז.

" + "
" + ) + create_script_pdf(doc7_path, hebrew_html, "NotoSansHebrew-Regular-subset.ttf") + documents.append({ + "name": "hebrew_report", + "path": doc7_path, + "ground_truth": _extract_ground_truth(doc7_path), + "pages": 1, + "category": "rtl", + }) + + # NOTE: Devanagari and Thai are intentionally omitted. PyMuPDF's + # ``insert_htmlbox`` produces PDFs whose ToUnicode maps don't survive + # round-trip extraction for those scripts (null bytes, dropped matras). + # They need real-world fixture PDFs — see docs/tasks/ for a follow-up. + return documents @@ -197,6 +353,7 @@ async def main(): from docfold.engines.marker_local_engine import MarkerLocalEngine from docfold.engines.mineru_engine import MinerUEngine from docfold.engines.nougat_engine import NougatEngine + from docfold.engines.opendataloader_engine import OpenDataLoaderEngine from docfold.engines.paddleocr_engine import PaddleOCREngine from docfold.engines.pymupdf_engine import PyMuPDFEngine from docfold.engines.surya_engine import SuryaEngine @@ -210,6 +367,7 @@ async def main(): candidates = [ (PyMuPDFEngine(), "pip install pymupdf"), (LiteParseEngine(ocr_enabled=False), "npm i -g @llamaindex/liteparse"), + (OpenDataLoaderEngine(), "pip install docfold[opendataloader] (needs Java 11+)"), (MinerUEngine(), "pip install docfold[mineru]"), (MarkerLocalEngine(), "pip install marker-pdf"), (SuryaEngine(), "pip install surya-ocr"), diff --git a/docs/benchmark_results.json b/docs/benchmark_results.json index 9899727..631f884 100644 --- a/docs/benchmark_results.json +++ b/docs/benchmark_results.json @@ -1,14 +1,8 @@ { - "benchmark_date": "2026-04-05 22:20:15", + "benchmark_date": "2026-04-17 10:17:34", "engines": [ "pymupdf", - "mineru", - "marker_local", - "surya", - "docling", - "paddleocr", - "tesseract", - "unstructured" + "opendataloader" ], "documents": [ { @@ -30,20 +24,35 @@ "name": "mixed_formatting", "pages": 1, "category": "report" + }, + { + "name": "arabic_report", + "pages": 1, + "category": "rtl" + }, + 
{ + "name": "chinese_report", + "pages": 1, + "category": "cjk" + }, + { + "name": "hebrew_report", + "pages": 1, + "category": "rtl" } ], "summary": { "pymupdf": { - "avg_time_ms": 3.5, + "avg_time_ms": 6.4, "avg_cer": 0.0, "avg_wer": 0.0, - "avg_bbox_count": 6.2, + "avg_bbox_count": 5.3, "errors": 0, - "successes": 4, + "successes": 7, "results": [ { "doc": "simple_text", - "time_ms": 6, + "time_ms": 11, "cer": 0.0, "wer": 0.0, "bbox_count": 5, @@ -52,7 +61,7 @@ }, { "doc": "multi_page", - "time_ms": 2, + "time_ms": 4, "cer": 0.0, "wer": 0.0, "bbox_count": 4, @@ -61,7 +70,7 @@ }, { "doc": "dense_financial", - "time_ms": 3, + "time_ms": 5, "cer": 0.0, "wer": 0.0, "bbox_count": 10, @@ -70,334 +79,112 @@ }, { "doc": "mixed_formatting", - "time_ms": 3, + "time_ms": 4, "cer": 0.0, "wer": 0.0, "bbox_count": 6, "content_length": 260, "pages": 1 - } - ] - }, - "mineru": { - "avg_time_ms": 18304.8, - "avg_cer": 0.0118, - "avg_wer": 0.0804, - "avg_bbox_count": 0.0, - "errors": 0, - "successes": 4, - "results": [ - { - "doc": "simple_text", - "time_ms": 55848, - "cer": 0.0083, - "wer": 0.0556, - "bbox_count": 0, - "content_length": 122, - "pages": null }, { - "doc": "multi_page", - "time_ms": 7511, - "cer": 0.0, - "wer": 0.0, - "bbox_count": 0, - "content_length": 300, - "pages": null - }, - { - "doc": "dense_financial", - "time_ms": 4845, - "cer": 0.0235, - "wer": 0.2121, - "bbox_count": 0, - "content_length": 305, - "pages": null - }, - { - "doc": "mixed_formatting", - "time_ms": 5015, - "cer": 0.0154, - "wer": 0.0541, - "bbox_count": 0, - "content_length": 264, - "pages": null - } - ] - }, - "marker_local": { - "avg_time_ms": 39090.5, - "avg_cer": 0.0191, - "avg_wer": 0.0936, - "avg_bbox_count": 0.0, - "errors": 0, - "successes": 4, - "results": [ - { - "doc": "simple_text", - "time_ms": 13760, - "cer": 0.0083, - "wer": 0.0556, - "bbox_count": 0, - "content_length": 122, - "pages": null - }, - { - "doc": "multi_page", - "time_ms": 69190, - "cer": 0.01, - "wer": 0.0256, - 
"bbox_count": 0, - "content_length": 303, - "pages": null - }, - { - "doc": "dense_financial", - "time_ms": 37193, - "cer": 0.0235, - "wer": 0.2121, - "bbox_count": 0, - "content_length": 305, - "pages": null - }, - { - "doc": "mixed_formatting", - "time_ms": 36219, - "cer": 0.0346, - "wer": 0.0811, - "bbox_count": 0, - "content_length": 269, - "pages": null - } - ] - }, - "surya": { - "avg_time_ms": 32959.2, - "avg_cer": 0.0135, - "avg_wer": 0.027, - "avg_bbox_count": 0.0, - "errors": 0, - "successes": 4, - "results": [ - { - "doc": "simple_text", - "time_ms": 7741, + "doc": "arabic_report", + "time_ms": 6, "cer": 0.0, "wer": 0.0, - "bbox_count": 0, - "content_length": 121, + "bbox_count": 4, + "content_length": 151, "pages": 1 }, { - "doc": "multi_page", - "time_ms": 45426, + "doc": "chinese_report", + "time_ms": 4, "cer": 0.0, "wer": 0.0, - "bbox_count": 0, - "content_length": 300, - "pages": 2 + "bbox_count": 4, + "content_length": 63, + "pages": 1 }, { - "doc": "dense_financial", - "time_ms": 35792, + "doc": "hebrew_report", + "time_ms": 11, "cer": 0.0, "wer": 0.0, - "bbox_count": 0, - "content_length": 298, - "pages": 1 - }, - { - "doc": "mixed_formatting", - "time_ms": 42878, - "cer": 0.0538, - "wer": 0.1081, - "bbox_count": 0, - "content_length": 274, + "bbox_count": 4, + "content_length": 143, "pages": 1 } ] }, - "docling": { - "avg_time_ms": 2601.0, - "avg_cer": 0.0173, - "avg_wer": 0.0406, - "avg_bbox_count": 0.0, + "opendataloader": { + "avg_time_ms": 796.6, + "avg_cer": 0.257, + "avg_wer": 0.3756, + "avg_bbox_count": 3.1, "errors": 0, - "successes": 4, + "successes": 7, "results": [ { "doc": "simple_text", - "time_ms": 3738, - "cer": 0.0248, + "time_ms": 1083, + "cer": 0.0165, "wer": 0.0556, - "bbox_count": 0, - "content_length": 124, - "pages": null + "bbox_count": 2, + "content_length": 123, + "pages": 1 }, { "doc": "multi_page", - "time_ms": 3170, - "cer": 0.01, - "wer": 0.0256, - "bbox_count": 0, - "content_length": 303, - "pages": null + 
"time_ms": 756, + "cer": 0.0133, + "wer": 0.0513, + "bbox_count": 2, + "content_length": 304, + "pages": 2 }, { "doc": "dense_financial", - "time_ms": 1715, + "time_ms": 725, "cer": 0.0, "wer": 0.0, - "bbox_count": 0, + "bbox_count": 1, "content_length": 298, - "pages": null + "pages": 1 }, { "doc": "mixed_formatting", - "time_ms": 1781, - "cer": 0.0346, + "time_ms": 737, + "cer": 0.0308, "wer": 0.0811, - "bbox_count": 0, - "content_length": 269, - "pages": null - } - ] - }, - "paddleocr": { - "avg_time_ms": 1617.2, - "avg_cer": 0.0018, - "avg_wer": 0.0132, - "avg_bbox_count": 0.0, - "errors": 0, - "successes": 4, - "results": [ - { - "doc": "simple_text", - "time_ms": 1304, - "cer": 0.0, - "wer": 0.0, - "bbox_count": 0, - "content_length": 121, - "pages": null - }, - { - "doc": "multi_page", - "time_ms": 2226, - "cer": 0.0033, - "wer": 0.0256, - "bbox_count": 0, - "content_length": 299, - "pages": null - }, - { - "doc": "dense_financial", - "time_ms": 1391, - "cer": 0.0, - "wer": 0.0, - "bbox_count": 0, - "content_length": 298, - "pages": null - }, - { - "doc": "mixed_formatting", - "time_ms": 1548, - "cer": 0.0038, - "wer": 0.027, - "bbox_count": 0, - "content_length": 261, - "pages": null - } - ] - }, - "tesseract": { - "avg_time_ms": 1190.0, - "avg_cer": 0.0, - "avg_wer": 0.0, - "avg_bbox_count": 0.0, - "errors": 0, - "successes": 4, - "results": [ - { - "doc": "simple_text", - "time_ms": 881, - "cer": 0.0, - "wer": 0.0, - "bbox_count": 0, - "content_length": 121, - "pages": null - }, - { - "doc": "multi_page", - "time_ms": 1718, - "cer": 0.0, - "wer": 0.0, - "bbox_count": 0, - "content_length": 300, - "pages": null - }, - { - "doc": "dense_financial", - "time_ms": 1106, - "cer": 0.0, - "wer": 0.0, - "bbox_count": 0, - "content_length": 298, - "pages": null - }, - { - "doc": "mixed_formatting", - "time_ms": 1055, - "cer": 0.0, - "wer": 0.0, - "bbox_count": 0, - "content_length": 260, - "pages": null - } - ] - }, - "unstructured": { - "avg_time_ms": 597.0, - 
"avg_cer": 0.0357, - "avg_wer": 0.1353, - "avg_bbox_count": 0.0, - "errors": 0, - "successes": 4, - "results": [ - { - "doc": "simple_text", - "time_ms": 2213, - "cer": 0.0661, - "wer": 0.2222, - "bbox_count": 0, - "content_length": 129, - "pages": null + "bbox_count": 5, + "content_length": 268, + "pages": 1 }, { - "doc": "multi_page", - "time_ms": 75, - "cer": 0.0067, - "wer": 0.0256, - "bbox_count": 0, - "content_length": 302, - "pages": null + "doc": "arabic_report", + "time_ms": 766, + "cer": 0.8675, + "wer": 1.08, + "bbox_count": 4, + "content_length": 151, + "pages": 1 }, { - "doc": "dense_financial", - "time_ms": 55, - "cer": 0.047, - "wer": 0.2121, - "bbox_count": 0, - "content_length": 312, - "pages": null + "doc": "chinese_report", + "time_ms": 721, + "cer": 0.0317, + "wer": 0.25, + "bbox_count": 4, + "content_length": 65, + "pages": 1 }, { - "doc": "mixed_formatting", - "time_ms": 45, - "cer": 0.0231, - "wer": 0.0811, - "bbox_count": 0, - "content_length": 266, - "pages": null + "doc": "hebrew_report", + "time_ms": 788, + "cer": 0.8392, + "wer": 1.1111, + "bbox_count": 4, + "content_length": 149, + "pages": 1 } ] } diff --git a/docs/tasks/OPENDATALOADER_ENGINE.md b/docs/tasks/OPENDATALOADER_ENGINE.md new file mode 100644 index 0000000..68a21e1 --- /dev/null +++ b/docs/tasks/OPENDATALOADER_ENGINE.md @@ -0,0 +1,95 @@ +--- +purpose: "Add OpenDataLoader PDF as a local, fast PDF structuring engine." +status: "OPEN" +priority: "P2" +created: "2026-04-16" +--- + +# Feature: OpenDataLoader PDF Engine + +## Problem +docfold already ships several local PDF engines (PyMuPDF, LiteParse, Docling, MinerU, …), +but none of them are based on `opendataloader-pdf` — a popular, Apache-2.0 Java tool +(16.8k stars on GitHub) exposed through a thin Python wrapper on PyPI +(`opendataloader-pdf`). 
+ +Why add it now: +- Very fast deterministic layout + reading-order extraction on CPU (benchmarked + at 100+ pages/sec by upstream) — useful as a reliable, low-latency baseline + to compare against heavier OCR / ML engines. +- Emits richly typed structural elements (`heading`, `paragraph`, `table`, + `list`, `header`, `footer`, …) with per-element bounding boxes and page + numbers — a good fit for the docfold `EngineResult` contract. +- Users can opt into a hybrid AI mode later without changing the adapter. + +## Proposed Solution +Implement a new engine adapter `OpenDataLoaderEngine` that wraps the +`opendataloader-pdf` Python package. The engine will: + +1. Write output (`json` + requested format) to a temp directory via + `opendataloader_pdf.convert(...)`. +2. Read back the produced files: + - For `MARKDOWN` — read the `.md` file (or `text` output). + - For `HTML` — read the `.html` file. + - For `JSON`/`TEXT` — use the JSON/text file directly. +3. Parse the JSON output to build a flat list of `BoundingBox` entries by + recursively walking the nested `kids` tree. Map upstream types + (`heading`, `paragraph`, `table`, `list`, `header`, `footer`, …) to + docfold's `BoundingBox.type` names (`SectionHeader`, `Text`, `Table`, + `List`, …). +4. Normalize PDF-point coordinates — upstream emits `[x1, y1, x2, y2]` in + PDF points; we pass them through unchanged (same as PyMuPDF). + +Capabilities advertised: `bounding_boxes=True`, `reading_order=True`, +`heading_detection=True`, `table_structure=True`. 
+ +## Affected Files +- `src/docfold/engines/opendataloader_engine.py` — new adapter +- `tests/engines/test_opendataloader_engine.py` — new tests (mocked subprocess) +- `benchmark.py` — register the new engine alongside the existing ones +- `pyproject.toml` — add `opendataloader = ["opendataloader-pdf>=2.2"]` extra, + include it in `[all]` +- `README.md` / `CHANGELOG.md` — short mention (optional in this task) + +## Test Plan + +### Unit / Functional Tests +- [x] `name` is `"opendataloader"` +- [x] `supported_extensions` includes `"pdf"` +- [x] `capabilities` advertises `bounding_boxes`, `reading_order`, + `heading_detection`, `table_structure` +- [x] `is_available()` returns True when `opendataloader_pdf` imports and + `java` is on PATH; False otherwise +- [x] `process()` returns an `EngineResult` with the correct `engine_name`, + `format`, non-empty `content`, populated `pages`, non-empty + `bounding_boxes`, and `processing_time_ms >= 0` — tested with a + mocked `convert()` that materializes a fake output directory +- [x] JSON walker flattens nested `kids` into one `BoundingBox` per leaf + element, preserving page numbers and bbox coords +- [x] `heading` → `SectionHeader`, `paragraph` → `Text`, `table` → `Table`, + `list` → `List` type mapping +- [x] Errors from the underlying CLI surface as `RuntimeError` + +### Integration / E2E Tests +- [x] `benchmark.py` discovers the engine when `opendataloader-pdf` and + Java are installed and reports CER/WER/time/bbox counts alongside the + other engines. + +### Test Commands +```bash +pytest tests/engines/test_opendataloader_engine.py -v +pytest tests/ # full suite still green +python benchmark.py # sanity check +``` + +## Edge Cases +- `java` not installed → `is_available()` returns False, no crash. +- Encrypted PDF without password → surface upstream error as `RuntimeError`. +- Empty PDF / no elements → `bounding_boxes` is `None`, `content` is `""`. +- Pages without any `kids` → still counted via `"number of pages"`. 
+- Deeply nested `kids` (tables / lists) → recursion handles arbitrary depth. + +## Out of Scope +- Hybrid AI mode (`hybrid=…`) — can be added later via kwargs. +- Image extraction / annotated-PDF output. +- Table structure parsing into `tables` list of dicts (bboxes only for now). diff --git a/pyproject.toml b/pyproject.toml index e4e0034..29af5f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,6 +74,10 @@ llamaparse = [ liteparse = [ # No Python deps — requires Node.js 18+ and: npm i -g @llamaindex/liteparse ] +opendataloader = [ + # Requires Java 11+ on PATH; JAR is bundled by the Python wheel. + "opendataloader-pdf>=2.2", +] mistral-ocr = [ "mistralai>=1.0", ] @@ -112,7 +116,7 @@ evaluation = [ "psutil>=5.9", # Memory measurement ] all = [ - "docfold[docling,mineru,marker,pymupdf,paddleocr,tesseract,easyocr,unstructured,llamaparse,liteparse,mistral-ocr,textract,google-docai,azure-docint,nougat,chandra,surya,firecrawl,evaluation]", + "docfold[docling,mineru,marker,pymupdf,paddleocr,tesseract,easyocr,unstructured,llamaparse,liteparse,opendataloader,mistral-ocr,textract,google-docai,azure-docint,nougat,chandra,surya,firecrawl,evaluation]", # Note: zerox excluded from [all] — py-zerox requires Python 3.11+ # Install separately: pip install docfold[zerox] ] diff --git a/src/docfold/engines/opendataloader_engine.py b/src/docfold/engines/opendataloader_engine.py new file mode 100644 index 0000000..fd5da97 --- /dev/null +++ b/src/docfold/engines/opendataloader_engine.py @@ -0,0 +1,272 @@ +"""OpenDataLoader PDF engine adapter. + +Wraps the `opendataloader-pdf `_ +Java tool via its Python package (`opendataloader-pdf` on PyPI). Produces +Markdown / HTML / JSON output with per-element bounding boxes and PDF page +numbers, fully local, no API keys. + +Install: ``pip install docfold[opendataloader]`` (also requires Java 11+). 
+""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +import shutil +import tempfile +import time +from typing import Any + +from docfold.engines.base import ( + BoundingBox, + DocumentEngine, + EngineCapabilities, + EngineResult, + OutputFormat, +) + +logger = logging.getLogger(__name__) + +_SUPPORTED_EXTENSIONS = {"pdf"} + + +# Upstream block type -> docfold canonical type. +_TYPE_MAP: dict[str, str] = { + "heading": "SectionHeader", + "title": "SectionHeader", + "paragraph": "Text", + "text": "Text", + "caption": "Caption", + "table": "Table", + "table-cell": "TableCell", + "list": "List", + "list-item": "ListItem", + "figure": "Image", + "image": "Image", + "header": "PageHeader", + "footer": "PageFooter", + "footnote": "Footnote", +} + + +def _convert(*args: Any, **kwargs: Any) -> None: + """Indirection so tests can monkey-patch the JAR call.""" + from opendataloader_pdf import convert as _upstream_convert + + _upstream_convert(*args, **kwargs) + + +def _map_type(raw_type: str) -> str: + if not raw_type: + return "Text" + return _TYPE_MAP.get(raw_type.lower(), raw_type.capitalize()) + + +def _walk_kids( + nodes: list[dict[str, Any]], + bboxes: list[dict[str, Any]], + counter: dict[str, int], +) -> None: + """Depth-first flatten nested ``kids`` into a list of :class:`BoundingBox` dicts.""" + for node in nodes: + if not isinstance(node, dict): + continue + + bbox_raw = node.get("bounding box") + page = node.get("page number") + text = (node.get("content") or "").strip() + children = node.get("kids") or [] + node_type = _map_type(node.get("type", "")) + + # Emit a bbox for any node that has enough geometry info. We prefer + # leaf nodes (no children) but also include parent containers that + # carry usable text of their own and geometry — they'll show as a + # single block instead of being lost. 
+ if bbox_raw and page and (not children or text): + try: + coords = [float(x) for x in bbox_raw] + except (TypeError, ValueError): + coords = None + if coords and len(coords) == 4: + idx = counter["n"] + counter["n"] += 1 + bboxes.append( + BoundingBox( + type=node_type, + bbox=coords, + page=int(page), + text=text, + id=f"p{int(page)}-e{idx}", + ).to_dict() + ) + + if children: + _walk_kids(children, bboxes, counter) + + +def _find_output_file(output_dir: str, suffixes: tuple[str, ...]) -> str | None: + for name in sorted(os.listdir(output_dir)): + for suffix in suffixes: + if name.endswith(suffix): + return os.path.join(output_dir, name) + return None + + +class OpenDataLoaderEngine(DocumentEngine): + """Adapter for ``opendataloader-pdf`` (Java CLI via Python wrapper).""" + + def __init__( + self, + *, + reading_order: str | None = None, + table_method: str | None = None, + include_header_footer: bool = False, + keep_line_breaks: bool = False, + use_struct_tree: bool = False, + password: str | None = None, + hybrid: str | None = None, + ) -> None: + self._reading_order = reading_order + self._table_method = table_method + self._include_header_footer = include_header_footer + self._keep_line_breaks = keep_line_breaks + self._use_struct_tree = use_struct_tree + self._password = password + self._hybrid = hybrid + + # ------------------------------------------------------------------ + # Engine metadata + # ------------------------------------------------------------------ + + @property + def name(self) -> str: + return "opendataloader" + + @property + def supported_extensions(self) -> set[str]: + return _SUPPORTED_EXTENSIONS + + @property + def capabilities(self) -> EngineCapabilities: + return EngineCapabilities( + bounding_boxes=True, + reading_order=True, + heading_detection=True, + table_structure=True, + ) + + def is_available(self) -> bool: + if shutil.which("java") is None: + return False + try: + import opendataloader_pdf # noqa: F401 + return True + except 
ImportError: + return False + + # ------------------------------------------------------------------ + # Processing + # ------------------------------------------------------------------ + + async def process( + self, + file_path: str, + output_format: OutputFormat = OutputFormat.MARKDOWN, + **kwargs: Any, + ) -> EngineResult: + start = time.perf_counter() + + loop = asyncio.get_running_loop() + content, page_count, bboxes = await loop.run_in_executor( + None, self._run, file_path, output_format + ) + + elapsed_ms = int((time.perf_counter() - start) * 1000) + + return EngineResult( + content=content, + format=output_format, + engine_name=self.name, + pages=page_count, + processing_time_ms=elapsed_ms, + bounding_boxes=bboxes or None, + metadata={ + "reading_order": self._reading_order or "default", + "table_method": self._table_method or "default", + }, + ) + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _run( + self, file_path: str, output_format: OutputFormat, + ) -> tuple[str, int, list[dict[str, Any]]]: + formats = self._formats_for(output_format) + + with tempfile.TemporaryDirectory() as out_dir: + try: + _convert( + file_path, + output_dir=out_dir, + format=formats, + password=self._password, + reading_order=self._reading_order, + table_method=self._table_method, + include_header_footer=self._include_header_footer, + keep_line_breaks=self._keep_line_breaks, + use_struct_tree=self._use_struct_tree, + hybrid=self._hybrid, + quiet=True, + ) + except Exception as exc: + raise RuntimeError(f"opendataloader failed: {exc}") from exc + + json_path = _find_output_file(out_dir, (".json",)) + if not json_path: + raise RuntimeError("opendataloader produced no JSON output") + + with open(json_path, encoding="utf-8") as f: + data = json.load(f) + + page_count = int(data.get("number of pages", 0) or 0) + bboxes: list[dict[str, Any]] = [] + 
_walk_kids(data.get("kids") or [], bboxes, {"n": 0}) + + content = self._read_primary(out_dir, output_format, data) + + return content, page_count, bboxes + + @staticmethod + def _formats_for(output_format: OutputFormat) -> list[str]: + """Build the list of output formats to request from the JAR.""" + # Always include JSON — it carries the bounding-box tree we need. + formats = ["json"] + if output_format == OutputFormat.MARKDOWN: + formats.append("markdown") + elif output_format == OutputFormat.HTML: + formats.append("html") + elif output_format == OutputFormat.TEXT: + formats.append("text") + return formats + + @staticmethod + def _read_primary( + out_dir: str, output_format: OutputFormat, json_data: dict[str, Any], + ) -> str: + if output_format == OutputFormat.JSON: + return json.dumps(json_data, ensure_ascii=False) + + suffix_map = { + OutputFormat.MARKDOWN: (".md",), + OutputFormat.HTML: (".html",), + OutputFormat.TEXT: (".txt",), + } + path = _find_output_file(out_dir, suffix_map.get(output_format, (".md",))) + if path is None: + return "" + with open(path, encoding="utf-8") as f: + return f.read() diff --git a/tests/engines/test_opendataloader_engine.py b/tests/engines/test_opendataloader_engine.py new file mode 100644 index 0000000..4a1d66d --- /dev/null +++ b/tests/engines/test_opendataloader_engine.py @@ -0,0 +1,337 @@ +"""Tests for OpenDataLoader PDF engine adapter. + +These are unit tests that mock the underlying ``opendataloader_pdf.convert`` +call so they run without Java installed. 
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from unittest.mock import patch
+
+import pytest
+
+from docfold.engines.base import EngineResult, OutputFormat
+
+
+def _odl_json(pages: int = 1, kids: list[dict] | None = None) -> dict:
+    """Build a JSON payload matching the real OpenDataLoader output shape."""
+    return {
+        "file name": "test.pdf",
+        "number of pages": pages,
+        "author": None,
+        "title": None,
+        "creation date": None,
+        "modification date": None,
+        "kids": kids if kids is not None else [  # an explicit [] must stay empty
+            {
+                "type": "heading",
+                "id": 1,
+                "page number": 1,
+                "bounding box": [72.0, 688.85, 200.0, 705.0],
+                "heading level": 1,
+                "content": "Hello World",
+            },
+            {
+                "type": "paragraph",
+                "id": 2,
+                "page number": 1,
+                "bounding box": [72.0, 659.5, 400.0, 672.2],
+                "content": "Test document for OpenDataLoader",
+            },
+        ],
+    }
+
+
+def _fake_convert_factory(
+    json_payload: dict,
+    markdown: str = "# Hello World\n\nTest document for OpenDataLoader",
+    html: str = "

Hello World

Test document for OpenDataLoader

", + text: str = "Hello World\nTest document for OpenDataLoader", + stem: str = "test", +): + """Produce a fake ``convert`` that writes output files into ``output_dir``.""" + + def fake_convert(input_path, output_dir=None, format=None, **kwargs): + assert output_dir is not None, "engine must pass an output_dir" + fmts = format if isinstance(format, list) else ([format] if format else []) + for fmt in fmts: + if fmt == "json": + with open(os.path.join(output_dir, f"{stem}.json"), "w") as f: + json.dump(json_payload, f) + elif fmt in ("markdown", "markdown-with-html", "markdown-with-images"): + with open(os.path.join(output_dir, f"{stem}.md"), "w") as f: + f.write(markdown) + elif fmt == "html": + with open(os.path.join(output_dir, f"{stem}.html"), "w") as f: + f.write(html) + elif fmt == "text": + with open(os.path.join(output_dir, f"{stem}.txt"), "w") as f: + f.write(text) + + return fake_convert + + +# --------------------------------------------------------------------------- +# Metadata / capabilities +# --------------------------------------------------------------------------- + + +class TestOpenDataLoaderEngineMetadata: + def test_name(self): + from docfold.engines.opendataloader_engine import OpenDataLoaderEngine + + e = OpenDataLoaderEngine() + assert e.name == "opendataloader" + + def test_supported_extensions(self): + from docfold.engines.opendataloader_engine import OpenDataLoaderEngine + + e = OpenDataLoaderEngine() + assert "pdf" in e.supported_extensions + + def test_capabilities(self): + from docfold.engines.opendataloader_engine import OpenDataLoaderEngine + + e = OpenDataLoaderEngine() + caps = e.capabilities + assert caps.bounding_boxes is True + assert caps.reading_order is True + assert caps.heading_detection is True + assert caps.table_structure is True + + def test_is_available_false_when_package_missing(self): + from docfold.engines.opendataloader_engine import OpenDataLoaderEngine + + e = OpenDataLoaderEngine() + with 
patch.dict("sys.modules", {"opendataloader_pdf": None}): + assert isinstance(e.is_available(), bool) + + def test_is_available_false_when_java_missing(self): + from docfold.engines.opendataloader_engine import OpenDataLoaderEngine + + e = OpenDataLoaderEngine() + with patch("shutil.which", return_value=None): + assert e.is_available() is False + + def test_config_stored(self): + from docfold.engines.opendataloader_engine import OpenDataLoaderEngine + + e = OpenDataLoaderEngine( + reading_order="xycut", + table_method="cluster", + include_header_footer=True, + password="secret", + ) + assert e._reading_order == "xycut" + assert e._table_method == "cluster" + assert e._include_header_footer is True + assert e._password == "secret" + + +# --------------------------------------------------------------------------- +# process() +# --------------------------------------------------------------------------- + + +class TestOpenDataLoaderEngineProcess: + @pytest.mark.asyncio + async def test_process_markdown_format(self): + from docfold.engines import opendataloader_engine + from docfold.engines.opendataloader_engine import OpenDataLoaderEngine + + e = OpenDataLoaderEngine() + fake_convert = _fake_convert_factory(_odl_json(pages=1)) + + with patch.object(opendataloader_engine, "_convert", fake_convert): + result = await e.process("test.pdf", output_format=OutputFormat.MARKDOWN) + + assert isinstance(result, EngineResult) + assert result.engine_name == "opendataloader" + assert result.format == OutputFormat.MARKDOWN + assert "Hello World" in result.content + assert result.pages == 1 + assert result.bounding_boxes is not None + assert len(result.bounding_boxes) == 2 + assert result.processing_time_ms >= 0 + + @pytest.mark.asyncio + async def test_process_json_format(self): + from docfold.engines import opendataloader_engine + from docfold.engines.opendataloader_engine import OpenDataLoaderEngine + + e = OpenDataLoaderEngine() + fake_convert = 
_fake_convert_factory(_odl_json(pages=1)) + + with patch.object(opendataloader_engine, "_convert", fake_convert): + result = await e.process("test.pdf", output_format=OutputFormat.JSON) + + assert result.format == OutputFormat.JSON + # Content must be valid JSON + parsed = json.loads(result.content) + assert parsed["number of pages"] == 1 + + @pytest.mark.asyncio + async def test_process_html_format(self): + from docfold.engines import opendataloader_engine + from docfold.engines.opendataloader_engine import OpenDataLoaderEngine + + e = OpenDataLoaderEngine() + fake_convert = _fake_convert_factory(_odl_json(pages=1)) + + with patch.object(opendataloader_engine, "_convert", fake_convert): + result = await e.process("test.pdf", output_format=OutputFormat.HTML) + + assert result.format == OutputFormat.HTML + assert "

" in result.content.lower() + + @pytest.mark.asyncio + async def test_type_mapping(self): + """heading/paragraph/table/list/list-item should map to canonical types.""" + from docfold.engines import opendataloader_engine + from docfold.engines.opendataloader_engine import OpenDataLoaderEngine + + kids = [ + { + "type": "heading", + "id": 1, + "page number": 1, + "bounding box": [0, 0, 100, 20], + "content": "H", + }, + { + "type": "paragraph", + "id": 2, + "page number": 1, + "bounding box": [0, 30, 100, 50], + "content": "P", + }, + { + "type": "table", + "id": 3, + "page number": 1, + "bounding box": [0, 60, 100, 200], + "content": "T", + }, + { + "type": "list", + "id": 4, + "page number": 1, + "bounding box": [0, 210, 100, 300], + "content": "L", + }, + ] + e = OpenDataLoaderEngine() + fake_convert = _fake_convert_factory(_odl_json(pages=1, kids=kids)) + with patch.object(opendataloader_engine, "_convert", fake_convert): + result = await e.process("test.pdf", output_format=OutputFormat.MARKDOWN) + + types = {b["type"] for b in result.bounding_boxes} + assert "SectionHeader" in types + assert "Text" in types + assert "Table" in types + assert "List" in types + + @pytest.mark.asyncio + async def test_nested_kids_flattened(self): + """Nested kids (e.g. 
header container) must be flattened into bboxes.""" + from docfold.engines import opendataloader_engine + from docfold.engines.opendataloader_engine import OpenDataLoaderEngine + + nested_kids = [ + { + "type": "header", + "id": 10, + "page number": 1, + "bounding box": [0, 0, 500, 400], + "kids": [ + { + "type": "heading", + "id": 1, + "page number": 1, + "bounding box": [0, 0, 100, 20], + "content": "Title", + }, + { + "type": "paragraph", + "id": 2, + "page number": 1, + "bounding box": [0, 30, 400, 100], + "content": "Body", + }, + ], + } + ] + e = OpenDataLoaderEngine() + fake_convert = _fake_convert_factory(_odl_json(pages=1, kids=nested_kids)) + + with patch.object(opendataloader_engine, "_convert", fake_convert): + result = await e.process("test.pdf", output_format=OutputFormat.MARKDOWN) + + # At least the two leaves must be emitted as bounding boxes. + assert result.bounding_boxes is not None + texts = [b.get("text", "") for b in result.bounding_boxes] + assert "Title" in texts + assert "Body" in texts + + @pytest.mark.asyncio + async def test_bbox_coordinates_preserved(self): + from docfold.engines import opendataloader_engine + from docfold.engines.opendataloader_engine import OpenDataLoaderEngine + + kids = [ + { + "type": "paragraph", + "id": 1, + "page number": 2, + "bounding box": [12.5, 34.5, 200.0, 90.0], + "content": "Hi", + } + ] + e = OpenDataLoaderEngine() + fake_convert = _fake_convert_factory(_odl_json(pages=2, kids=kids)) + + with patch.object(opendataloader_engine, "_convert", fake_convert): + result = await e.process("test.pdf", output_format=OutputFormat.MARKDOWN) + + assert result.bounding_boxes is not None + bbox = result.bounding_boxes[0] + assert bbox["bbox"] == [12.5, 34.5, 200.0, 90.0] + assert bbox["page"] == 2 + assert bbox["text"] == "Hi" + + @pytest.mark.asyncio + async def test_process_surfaces_errors_as_runtime_error(self): + from docfold.engines import opendataloader_engine + from docfold.engines.opendataloader_engine import 
OpenDataLoaderEngine + + def boom(*args, **kwargs): + raise RuntimeError("java exited 2") + + e = OpenDataLoaderEngine() + with patch.object(opendataloader_engine, "_convert", boom): + with pytest.raises(RuntimeError, match="opendataloader"): + await e.process("test.pdf", output_format=OutputFormat.MARKDOWN) + + @pytest.mark.asyncio + async def test_reading_order_option_passed(self): + """reading_order kwarg on the engine must reach the underlying convert call.""" + from docfold.engines import opendataloader_engine + from docfold.engines.opendataloader_engine import OpenDataLoaderEngine + + captured: dict = {} + + def capture(input_path, output_dir=None, format=None, **kwargs): + captured["kwargs"] = kwargs + # Still produce files so parsing succeeds + _fake_convert_factory(_odl_json(pages=1))( + input_path, output_dir=output_dir, format=format, **kwargs + ) + + e = OpenDataLoaderEngine(reading_order="xycut", table_method="cluster") + with patch.object(opendataloader_engine, "_convert", capture): + await e.process("test.pdf", output_format=OutputFormat.MARKDOWN) + + assert captured["kwargs"].get("reading_order") == "xycut" + assert captured["kwargs"].get("table_method") == "cluster" diff --git a/tests/fixtures/fonts/LICENSE.txt b/tests/fixtures/fonts/LICENSE.txt new file mode 100644 index 0000000..dffd6b1 --- /dev/null +++ b/tests/fixtures/fonts/LICENSE.txt @@ -0,0 +1,20 @@ +Benchmark font fixtures — all bundled Noto fonts are licensed under the +SIL Open Font License, Version 1.1 (OFL-1.1). 
+ +Files +----- +- NotoNaskhArabic-Regular.ttf — full Noto Naskh Arabic Regular +- NotoSansCJKsc-Regular-subset.ttf — Noto Sans CJK SC Regular, subsetted to + the glyphs used in the Chinese benchmark document only +- NotoSansHebrew-Regular-subset.ttf — Noto Sans Hebrew Regular, subsetted to + the glyphs used in the Hebrew benchmark document only + +Sources +------- +- https://fonts.google.com/noto/specimen/Noto+Naskh+Arabic +- https://fonts.google.com/noto/specimen/Noto+Sans+Simplified+Chinese +- https://fonts.google.com/noto/specimen/Noto+Sans+Hebrew +- OFL license text: https://scripts.sil.org/OFL + +Subsetting was done with fontTools (``fontTools.subset.Subsetter``) preserving +all OpenType layout features so shaping / bidi still work. diff --git a/tests/fixtures/fonts/NotoNaskhArabic-Regular.ttf b/tests/fixtures/fonts/NotoNaskhArabic-Regular.ttf new file mode 100644 index 0000000..00a33b3 Binary files /dev/null and b/tests/fixtures/fonts/NotoNaskhArabic-Regular.ttf differ diff --git a/tests/fixtures/fonts/NotoSansCJKsc-Regular-subset.ttf b/tests/fixtures/fonts/NotoSansCJKsc-Regular-subset.ttf new file mode 100644 index 0000000..1cdd83d Binary files /dev/null and b/tests/fixtures/fonts/NotoSansCJKsc-Regular-subset.ttf differ diff --git a/tests/fixtures/fonts/NotoSansHebrew-Regular-subset.ttf b/tests/fixtures/fonts/NotoSansHebrew-Regular-subset.ttf new file mode 100644 index 0000000..9330e17 Binary files /dev/null and b/tests/fixtures/fonts/NotoSansHebrew-Regular-subset.ttf differ