From d5314bfa63066baa8af7148d8bfae1788af349dd Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 31 Mar 2026 19:55:47 +0000 Subject: [PATCH 1/2] docs: add Chandra OCR 2 research and integration plan Analyze Chandra OCR 2 (Datalab) as a candidate engine for docfold: - RESEARCH_CHANDRA_OCR.md: model details, benchmarks (85.9% olmOCR SOTA), capabilities, API usage, confidence scoring, and comparison with existing engines - TASK_CHANDRA_ENGINE.md: step-by-step integration plan following TDD approach, covering engine adapter, router, CLI, tests, and documentation updates Sources: GitHub repo, HuggingFace model card, Datalab blog, confidence scoring docs https://claude.ai/code/session_014XarzdnTKLSQmJW7VPVNcW --- docs/RESEARCH_CHANDRA_OCR.md | 255 +++++++++++++++++++++++++++++++++++ docs/TASK_CHANDRA_ENGINE.md | 169 +++++++++++++++++++++++ 2 files changed, 424 insertions(+) create mode 100644 docs/RESEARCH_CHANDRA_OCR.md create mode 100644 docs/TASK_CHANDRA_ENGINE.md diff --git a/docs/RESEARCH_CHANDRA_OCR.md b/docs/RESEARCH_CHANDRA_OCR.md new file mode 100644 index 0000000..8e80b53 --- /dev/null +++ b/docs/RESEARCH_CHANDRA_OCR.md @@ -0,0 +1,255 @@ +# Research: Chandra OCR 2 — State-of-the-Art Document OCR + +A comprehensive analysis of Chandra OCR 2 by Datalab, evaluating it as a candidate engine for docfold. 
+ +--- + +## Overview + +| Attribute | Details | +|---|---| +| **Source** | [GitHub](https://github.com/datalab-to/chandra), [HuggingFace](https://huggingface.co/datalab-to/chandra-ocr-2) | +| **Core capability** | Convert images/PDFs to structured Markdown, HTML, or JSON with layout preservation | +| **Model size** | ~5B parameters (4B advertised, BF16) | +| **Architecture** | Vision Language Model based on Qwen 3.5 (Image-Text-to-Text) | +| **Code license** | Apache 2.0 | +| **Model license** | Modified OpenRAIL-M (free for research, personal use, startups <$2M funding/revenue) | +| **Install** | `pip install chandra-ocr` (extras: `[hf]`, `[all]`) | +| **Inference** | vLLM (recommended) or HuggingFace Transformers | +| **Downloads** | ~31K/month on HuggingFace | + +--- + +## Key Capabilities + +- **Output formats:** Markdown, HTML, JSON with detailed layout information +- **Handwriting:** Excellent handwriting recognition (cursive, notes, forms) +- **Forms:** Accurate checkbox and filled-field reconstruction +- **Tables:** Strong table extraction and structure preservation +- **Math:** LaTeX formula extraction from printed and handwritten equations +- **Images/diagrams:** Extraction with captions and structured data +- **Languages:** 90+ languages with strong multilingual performance +- **Complex layouts:** Multi-column, headers/footers, nested structures + +--- + +## Benchmark Results + +### olmOCR Benchmark (Overall Score %) + +| Model | Score | +|---|---| +| **Datalab API** | **86.7** | +| **Chandra 2** | **85.9** | +| dots.ocr 1.5 | 83.9 | +| Chandra 1 | 83.1 | +| olmOCR 2 | 82.4 | +| dots.ocr | 79.1 | +| olmOCR v0.3.0 | 78.5 | +| Marker v1.10.0 | 76.5 | +| Deepseek OCR | 75.4 | +| Mistral OCR | 72.0 | +| GPT-4o | 69.9 | +| Qwen 3 VL 8B | 64.6 | +| Gemini Flash 2 | 63.8 | + +### olmOCR Benchmark Breakdown + +| Model | ArXiv | Old Scans Math | Tables | Headers/Footers | Overall | +|---|---|---|---|---|---| +| **Chandra 2** | 90.2 | 89.3 | 89.9 | 92.5 | 85.9 | 
+| Datalab API | 90.4 | 90.2 | 90.7 | 91.6 | 86.7 | +| dots.ocr 1.5 | 85.9 | 85.5 | 90.7 | 94.0 | 83.9 | +| Chandra 1 | 82.2 | 80.3 | 88.0 | 90.8 | 83.1 | +| GPT-4o | 53.5 | 74.5 | 70.0 | 93.8 | 69.9 | +| Gemini Flash 2 | 54.5 | 56.1 | 72.1 | 64.7 | 63.8 | + +### Multilingual Benchmark (43 languages) + +- **Average:** 77.8% (+12% improvement over Chandra 1) +- **Top performers:** Portuguese (95.2%), German (94.8%), Italian (94.1%), French (93.7%), Swedish (92.8%) +- **90-language eval:** Chandra 2 averages 72.7% vs Gemini 2.5 Flash at 60.8% + +### Chandra 2 vs Existing docfold Engines + +| Criterion | Chandra 2 | Marker | Mistral OCR | Surya | Nougat | +|---|---|---|---|---|---| +| **olmOCR Score** | 85.9% | 76.5% | 72.0% | N/A | N/A | +| **Multi-lang** | 90+ langs | Good | Good | 90+ langs | English-centric | +| **Tables** | ★★★ | ★★★ | ★★★ | ★★☆ | ★★☆ | +| **Math/Formulas** | ★★★ | ★★☆ | ★★★ | ★☆☆ | ★★★ | +| **Handwriting** | ★★★ | ★☆☆ | ★★☆ | ★★☆ | ☆☆☆ | +| **Speed (H100)** | ~1.44 pp/s | Fast | Fast (SaaS) | Medium | Slow | +| **License** | OpenRAIL-M* | Paid SaaS | Paid SaaS | GPL-3.0 | MIT | +| **GPU required** | Yes (rec.) | No | No (SaaS) | Optional | Yes | +| **Local inference** | Yes | No | No | Yes | Yes | + +*\*Free for research/personal/startups <$2M. 
Requires commercial license otherwise.* + +--- + +## Performance & Throughput + +**vLLM on NVIDIA H100 80GB:** + +| Metric | Value | +|---|---| +| Pages/second | 1.44 | +| Average latency | 60s | +| P95 latency | 156s | +| Failure rate | 0% | +| Real-world estimate | ~2 pages/s | + +--- + +## Usage + +### Installation + +```bash +pip install chandra-ocr # base +pip install chandra-ocr[hf] # with HuggingFace (requires torch) +pip install chandra-ocr[all] # all extras +``` + +### CLI + +```bash +# With vLLM server (start first with: chandra_vllm) +chandra input.pdf ./output + +# With HuggingFace +chandra input.pdf ./output --method hf +``` + +### Python API — vLLM (recommended) + +```python +from chandra.model import InferenceManager +from chandra.model.schema import BatchInputItem +from PIL import Image + +manager = InferenceManager(method="vllm") +batch = [ + BatchInputItem( + image=Image.open("document.png"), + prompt_type="ocr_layout" + ) +] +result = manager.generate(batch)[0] +print(result.markdown) +``` + +### Python API — HuggingFace + +```python +from transformers import AutoModelForImageTextToText, AutoProcessor +from chandra.model.hf import generate_hf +from chandra.model.schema import BatchInputItem +from chandra.output import parse_markdown +from PIL import Image +import torch + +model = AutoModelForImageTextToText.from_pretrained( + "datalab-to/chandra-ocr-2", + dtype=torch.bfloat16, + device_map="auto", +) +model.eval() +model.processor = AutoProcessor.from_pretrained("datalab-to/chandra-ocr-2") +model.processor.tokenizer.padding_side = "left" + +batch = [ + BatchInputItem( + image=Image.open("document.png"), + prompt_type="ocr_layout" + ) +] +result = generate_hf(batch, model)[0] +markdown = parse_markdown(result.raw) +print(markdown) +``` + +--- + +## Datalab API: Structured Extraction with Confidence Scoring + +Beyond OCR, Datalab offers a hosted API with **per-field confidence scoring** for structured data extraction. 
+ +### Confidence Scoring + +Each extracted field gets a score from 1–5 with reasoning: + +| Score | Meaning | +|---|---| +| 5 | High confidence — clear match with strong citation | +| 4 | Good confidence — match found with minor ambiguity | +| 3 | Moderate confidence — partial or uncertain evidence | +| 2 | Low confidence — inferred or weakly supported | +| 1 | Very low confidence — no clear evidence found | + +### Two Scoring Modes + +**Async (recommended):** +1. Extract with `save_checkpoint=true` → `POST /api/v1/extract` +2. Poll `request_check_url` until complete +3. Submit `checkpoint_id` to `POST /api/v1/extract/score` +4. Poll for scoring results + +**Sync:** +- Pass `include_scores=true` to `POST /api/v1/extract` — returns extraction + scores in one call + +### Response Fields + +```json +{ + "field_name": "value", + "field_name_citations": ["block_id"], + "field_name_score": { + "score": 4, + "reasoning": "Clear match found in paragraph 2" + }, + "extraction_score_average": 4.2 +} +``` + +Use `extraction_score_average` for quick quality checks. Route fields scoring ≤2 to human review. + +--- + +## Relevance to docfold + +### Why Add Chandra + +1. **Best-in-class OCR accuracy** — 85.9% on olmOCR, significantly beating Marker (76.5%) and Mistral OCR (72.0%) +2. **True local inference** — unlike Marker/LlamaParse/Mistral OCR which are SaaS-only, Chandra runs locally via vLLM or HuggingFace +3. **Strongest multilingual support** — 90+ languages supported (77.8% average on the 43-language benchmark, 72.7% on the 90-language eval), filling a gap for non-English documents +4. **Handwriting + forms** — a unique strength among local engines; none of the current 16 engines excels at handwriting +5. **Structured output** — native Markdown/HTML/JSON output aligns perfectly with docfold's `EngineResult` model +6. **Confidence scoring** — the Datalab API's per-field scoring maps well to docfold's quality assessment utilities + +### Concerns + +1.
**GPU requirement** — the 5B VLM needs significant GPU memory (~16GB+ VRAM), limiting CPU-only environments +2. **License restrictions** — OpenRAIL-M is not fully permissive; commercial use above $2M threshold requires paid license +3. **Speed** — at ~1.44 pp/s on H100, significantly slower than PyMuPDF or text-based engines +4. **Dual mode complexity** — supporting both vLLM and HuggingFace backends adds adapter complexity +5. **Heavy dependencies** — `torch`, `transformers`, `vllm` are large packages + +### Integration Approach + +The engine should be implemented similarly to other VLM-based engines (like `zerox_engine.py` / `nougat_engine.py`): +- Lazy model loading on first `process()` call +- Support both vLLM (remote server) and HuggingFace (local) backends +- Map Chandra's markdown/HTML output to `EngineResult` +- Optional dependency via `pip install docfold[chandra]` + +--- + +## References + +- GitHub: https://github.com/datalab-to/chandra +- HuggingFace: https://huggingface.co/datalab-to/chandra-ocr-2 +- Datalab API: https://www.datalab.to/ +- Confidence Scoring Docs: https://documentation.datalab.to/docs/recipes/structured-extraction/confidence-scoring +- Playground: https://www.datalab.to/playground diff --git a/docs/TASK_CHANDRA_ENGINE.md b/docs/TASK_CHANDRA_ENGINE.md new file mode 100644 index 0000000..7884adb --- /dev/null +++ b/docs/TASK_CHANDRA_ENGINE.md @@ -0,0 +1,169 @@ +# Task: Add Chandra OCR 2 Engine to Docfold + +## Overview + +Add **Chandra OCR 2** (by Datalab) as a new document processing engine in docfold. Chandra 2 is a 5B-parameter Vision Language Model that achieves **85.9% on the olmOCR benchmark** (state of the art), significantly outperforming existing docfold engines like Marker (76.5%) and Mistral OCR (72.0%). It converts images and PDFs to structured Markdown, HTML, or JSON with layout preservation, supports 90+ languages, and excels at handwriting, tables, math, and complex layouts. 
+ +Chandra supports two inference backends: **vLLM** (recommended, remote server) and **HuggingFace Transformers** (local). The docfold adapter should support both. + +**Research document:** `docs/RESEARCH_CHANDRA_OCR.md` + +--- + +## Files to Create / Modify + +### 1. NEW: `src/docfold/engines/chandra_engine.py` + +Create the engine adapter implementing `DocumentEngine` ABC. + +**Class: `ChandraEngine`** + +- **Constructor parameters:** + - `method: str = "vllm"` — Inference backend (`"vllm"` or `"hf"`) + - `model: str = "datalab-to/chandra-ocr-2"` — HuggingFace model name + - `prompt_type: str = "ocr_layout"` — Chandra prompt type (`"ocr_layout"`, etc.) + - `vllm_url: str = "http://localhost:8000"` — vLLM server URL (only used when `method="vllm"`) + - `torch_dtype: str = "bfloat16"` — Dtype for HF inference + - `device_map: str = "auto"` — Device map for HF inference + +- **`name` property:** returns `"chandra"` + +- **`supported_extensions` property:** returns `{"pdf", "png", "jpg", "jpeg", "tiff", "bmp", "webp"}` + +- **`capabilities` property:** returns: + ```python + EngineCapabilities( + table_structure=True, + heading_detection=True, + reading_order=True, + ) + ``` + +- **`is_available()` method:** + - For `method="vllm"`: check `import chandra` succeeds + - For `method="hf"`: check `import transformers, torch, chandra` succeed + - No model loading or network calls + +- **`process()` method:** + 1. Convert input file to PIL Image (per page for PDFs) + 2. Build `BatchInputItem(image=img, prompt_type=self._prompt_type)` + 3. For `method="vllm"`: use `InferenceManager(method="vllm")` + `manager.generate(batch)` + 4. For `method="hf"`: lazy-load model, use `generate_hf(batch, model)` + 5. Parse output with `parse_markdown(result.raw)` for Markdown format + 6. 
Return `EngineResult` with: + - `content`: parsed markdown/HTML/JSON string + - `format`: `OutputFormat.MARKDOWN` (default) + - `engine_name`: `"chandra"` + - `metadata`: `{"model": self._model, "method": self._method}` + - `processing_time_ms`: measured wall-clock time + +- **No model loading at:** + - Module import time + - Engine construction time + - `is_available()` check + - Router registration time + +--- + +### 2. MODIFY: `src/docfold/engines/router.py` + +- Add `"chandra"` to `_EXTENSION_PRIORITY` for image and PDF extensions. Given its SOTA accuracy, place it high in priority for scanned/image documents: + - `"pdf"`: insert after `"mineru"` (or early in list, given superior benchmark scores) + - `"png"`, `"jpg"`, `"jpeg"`, `"tiff"`, `"bmp"`, `"webp"`: add to image priority lists + +- Add `"chandra"` to `_DEFAULT_FALLBACK` list + +--- + +### 3. MODIFY: `src/docfold/cli.py` + +- Add Chandra registration block in `_build_router()`: + ```python + try: + from docfold.engines.chandra_engine import ChandraEngine + router.register(ChandraEngine()) + except Exception: + pass + ``` + +--- + +### 4. MODIFY: `pyproject.toml` + +- Add optional dependency group: + ```toml + chandra = [ + "chandra-ocr>=0.1", + ] + ``` + - The `chandra-ocr` package already manages its own torch/transformers/vllm dependencies + - For HF-only usage, users can install `chandra-ocr[hf]` separately + +- Add `chandra` to the `all` extra + +--- + +### 5. 
MODIFY: `tests/engines/test_adapters.py` + +Add `TestChandraEngine` class following the existing test pattern: + +- `test_name()` — assert `name == "chandra"` +- `test_supported_extensions()` — assert key extensions present (`pdf`, `png`, `jpg`) +- `test_is_available_when_missing()` — mock `sys.modules` to verify `is_available()` returns `False` +- `test_config_stored()` — verify constructor params stored correctly (`_method`, `_model`, `_prompt_type`, `_vllm_url`) +- `test_default_method_is_vllm()` — verify `_method` defaults to `"vllm"` +- `test_capabilities()` — assert `table_structure=True`, `heading_detection=True`, `reading_order=True` + +Also update `TestAllEnginesImplementInterface`: +- Add `"docfold.engines.chandra_engine.ChandraEngine"` to the parametrized list + +--- + +### 6. MODIFY: `docs/benchmarks.md` + +Add Chandra to the Quick Comparison table: + +```markdown +| **Chandra** | Local/VLM | OpenRAIL-M* | ★★★ | ★★★ | ★★★ | ★★★ | ★★★ (90+) | Slow | Free* | +``` + +Add Engine Profile section for Chandra. + +--- + +## Design Decisions + +1. **Dual backend support (vLLM + HF).** vLLM is recommended for production throughput; HF is simpler for development/testing. The `method` parameter switches between them, similar to how Chandra's own CLI works. + +2. **vLLM as default.** Production users will run a vLLM server for throughput. The adapter defaults to connecting to `localhost:8000` where `chandra_vllm` runs. + +3. **High router priority for scanned documents.** With 85.9% olmOCR score, Chandra should be preferred over lower-scoring engines for scanned PDFs and images. However, for text-based PDFs, faster engines like PyMuPDF should still take priority. + +4. **Delegate to chandra-ocr package.** Rather than reimplementing inference logic, depend on the `chandra-ocr` pip package which handles model loading, prompting, and output parsing. This simplifies the adapter and tracks upstream improvements. + +5. 
**Image-first processing.** Chandra is a VLM — it processes images, not text. For PDFs, each page is rendered as an image before being passed to the model. Multi-page PDFs are processed page-by-page. + +6. **License awareness.** The OpenRAIL-M model license restricts commercial use above the $2M threshold. Document this clearly in the engine docstring and benchmarks. The code (Apache 2.0) and docfold adapter (MIT) have no restrictions. + +--- + +## Implementation Order + +Following TDD (per CLAUDE.md): + +1. **Write tests** in `tests/engines/test_adapters.py` for `ChandraEngine` +2. **Create** `src/docfold/engines/chandra_engine.py` — implement to pass tests +3. **Update** `pyproject.toml` — add `chandra` optional dependency +4. **Update** `src/docfold/engines/router.py` — add to priority maps +5. **Update** `src/docfold/cli.py` — register engine +6. **Update** `docs/benchmarks.md` — add Chandra profile +7. **Run** full test suite to verify no regressions + +--- + +## Future Enhancements + +- **Datalab API engine** — Add a separate `DatalabEngine` for the hosted API (86.7% olmOCR score, confidence scoring, structured extraction) +- **Confidence scoring integration** — Map Datalab API's per-field confidence scores (1–5) to docfold's quality assessment utilities +- **Batch processing** — Leverage Chandra's native batch inference for multi-page documents +- **Streamlit app integration** — Add Chandra to docfold's comparison UI From 2e4b54a6197458da49921a8d914ba993a7e5a156 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 31 Mar 2026 20:23:59 +0000 Subject: [PATCH 2/2] feat: add Chandra OCR 2 engine adapter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Datalab Chandra OCR 2 as the 19th engine in docfold — a 5B VLM achieving 85.9% on olmOCR benchmark (SOTA). Supports 90+ languages, handwriting, tables, math, and complex layouts via vLLM or HuggingFace. 
- New ChandraEngine adapter with dual backend (vllm/hf), lazy model loading - Tests: 6 unit tests + interface compliance test - Router: chandra added to PDF, image, and default fallback priorities - CLI: registered in _build_router() - pyproject.toml: chandra optional dependency group, added to [all] - benchmarks.md: quick comparison, engine profile, feature/format/hw/cost matrices All 279 tests pass, ruff clean. https://claude.ai/code/session_014XarzdnTKLSQmJW7VPVNcW --- docs/benchmarks.md | 16 ++ pyproject.toml | 5 +- src/docfold/cli.py | 6 + src/docfold/engines/chandra_engine.py | 217 ++++++++++++++++++++++++++ src/docfold/engines/router.py | 6 +- tests/engines/test_adapters.py | 54 +++++++ 6 files changed, 300 insertions(+), 4 deletions(-) create mode 100644 src/docfold/engines/chandra_engine.py diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 82abecb..86988b3 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -10,6 +10,7 @@ This guide helps you choose the right document processing engine for your use ca |--------|------|---------|----------|----------|--------|----------|------------|-------|------| | **Docling** | Local | MIT | ★★★ | ★★☆ | ★★★ | ★★☆ | ★★★ | Medium | Free | | **MinerU** | Local | AGPL-3.0 | ★★★ | ★★★ | ★★★ | ★★★ | ★★☆ | Slow | Free | +| **Chandra** | Local/VLM | OpenRAIL-M* | ★★★ | ★★★ | ★★★ | ★★★ | ★★★ (90+) | Slow | Free* | | **Marker** | SaaS | Paid | ★★★ | ★★★ | ★★★ | ★★☆ | ★★★ | Fast | ~$1/1K pages | | **PyMuPDF** | Local | AGPL-3.0 | ★★★ | ☆☆☆ | ★☆☆ | ☆☆☆ | ★★★ | Ultra-fast | Free | | **PaddleOCR** | Local | Apache-2.0 | ★☆☆ | ★★★ | ★★☆ | ☆☆☆ | ★★★ (80+) | Medium | Free | @@ -43,6 +44,17 @@ This guide helps you choose the right document processing engine for your use ca - **Install:** `pip install docfold[docling]` - **Links:** [GitHub](https://github.com/docling-project/docling) | [Paper](https://arxiv.org/abs/2408.09869) +### Chandra OCR 2 (Datalab) + +**Best for:** Highest-accuracy document OCR — handwriting, forms, 
math, tables, multilingual. + +- **Strengths:** State-of-the-art 85.9% olmOCR benchmark score. Excellent handwriting recognition (cursive, notes, forms). 90+ languages supported (77.8% average on the 43-language multilingual benchmark). Strong table, math/LaTeX, and complex layout handling. Native Markdown/HTML/JSON output. Runs locally via vLLM or HuggingFace. +- **Weaknesses:** 5B VLM requires significant GPU memory (~16 GB+ VRAM). Slower than rule-based engines (~1.44 pp/s on H100). OpenRAIL-M license restricts commercial use above $2M revenue/funding threshold. Heavy dependencies (torch, transformers, vllm). +- **GPU:** Required (CUDA). HuggingFace needs ~16 GB VRAM; vLLM benefits from H100/A100. +- **Cost:** Free for research/personal/startups <$2M. Commercial license required otherwise. +- **Install:** `pip install docfold[chandra]` +- **Links:** [GitHub](https://github.com/datalab-to/chandra) | [HuggingFace](https://huggingface.co/datalab-to/chandra-ocr-2) | [Playground](https://www.datalab.to/playground) + ### MinerU / PDF-Extract-Kit (OpenDataLab) **Best for:** Academic papers, technical documents with formulas and complex layouts.
@@ -235,6 +247,7 @@ Capabilities each engine can populate in `EngineResult`: | Engine | BBox | Confidence | Images | Tables | Headings | Reading Order | |--------|:----:|:----------:|:------:|:------:|:--------:|:-------------:| | Docling | ✅ | — | ✅ | ✅ | ✅ | ✅ | +| Chandra | — | — | — | ✅ | ✅ | ✅ | | MinerU | — | — | — | ✅ | ✅ | ✅ | | Marker | ✅ | — | ✅ | ✅ | ✅ | — | | PyMuPDF | — | — | — | — | — | — | @@ -268,6 +281,7 @@ Capabilities each engine can populate in `EngineResult`: | Engine | PDF | DOCX | PPTX | XLSX | HTML | Images | Email | Audio | ePub | |--------|-----|------|------|------|------|--------|-------|-------|------| | Docling | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | — | ✅ | — | +| Chandra | ✅* | — | — | — | — | ✅ | — | — | — | | MinerU | ✅ | — | — | — | — | — | — | — | — | | Marker | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | — | — | ✅ | | PyMuPDF | ✅ | — | — | — | — | — | — | — | — | @@ -294,6 +308,7 @@ Capabilities each engine can populate in `EngineResult`: | Engine | Min RAM | Recommended RAM | GPU | Disk (models) | |--------|---------|-----------------|-----|----------------| | Docling | 4 GB | 8 GB | Optional | ~2 GB | +| Chandra | 16 GB | 32 GB | CUDA 16+ GB | ~10 GB | | MinerU | 8 GB | 16 GB | CUDA 8+ GB | ~5 GB | | Marker (local) | 4 GB | 8 GB | Optional | ~2 GB | | PyMuPDF | 512 MB | 1 GB | — | — | @@ -320,6 +335,7 @@ Capabilities each engine can populate in `EngineResult`: | Docling | Free | $0 (compute only) | | Nougat | Free | $0 (compute + GPU) | | Surya | Free | $0 (compute only) | +| Chandra | Free* | $0 (compute + GPU)* | | MinerU | Free | $0 (compute + GPU) | | Unstructured | Free / API | $0 local / ~$10 API | | Marker API | SaaS | ~$1 | diff --git a/pyproject.toml b/pyproject.toml index 978784a..e4e0034 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,6 +93,9 @@ nougat = [ "nougat-ocr>=0.1.17", "torch>=2.0", ] +chandra = [ + "chandra-ocr>=0.1", +] surya = [ "surya-ocr>=0.6", "torch>=2.0", @@ -109,7 +112,7 @@ evaluation = [ "psutil>=5.9", # Memory measurement 
] all = [ - "docfold[docling,mineru,marker,pymupdf,paddleocr,tesseract,easyocr,unstructured,llamaparse,liteparse,mistral-ocr,textract,google-docai,azure-docint,nougat,surya,firecrawl,evaluation]", + "docfold[docling,mineru,marker,pymupdf,paddleocr,tesseract,easyocr,unstructured,llamaparse,liteparse,mistral-ocr,textract,google-docai,azure-docint,nougat,chandra,surya,firecrawl,evaluation]", # Note: zerox excluded from [all] — py-zerox requires Python 3.11+ # Install separately: pip install docfold[zerox] ] diff --git a/src/docfold/cli.py b/src/docfold/cli.py index ac32dc3..8671289 100644 --- a/src/docfold/cli.py +++ b/src/docfold/cli.py @@ -172,6 +172,12 @@ def _build_router(): except Exception: pass + try: + from docfold.engines.chandra_engine import ChandraEngine + router.register(ChandraEngine()) + except Exception: + pass + try: from docfold.engines.nougat_engine import NougatEngine router.register(NougatEngine()) diff --git a/src/docfold/engines/chandra_engine.py b/src/docfold/engines/chandra_engine.py new file mode 100644 index 0000000..0e256d3 --- /dev/null +++ b/src/docfold/engines/chandra_engine.py @@ -0,0 +1,217 @@ +"""Chandra OCR 2 engine adapter — Datalab's state-of-the-art document OCR model. + +Install: ``pip install docfold[chandra]`` + +Chandra OCR 2 is a 5B-parameter Vision Language Model that converts images and +PDFs to structured Markdown, HTML, or JSON with layout preservation. Supports +90+ languages, handwriting, tables, math, and complex layouts. + +Supports two inference backends: +- **vLLM** (recommended): connect to a running ``chandra_vllm`` server +- **HuggingFace**: load the model locally via transformers + +Model license: Modified OpenRAIL-M (free for research, personal use, and +startups <$2M funding/revenue). 
+""" + +from __future__ import annotations + +import logging +import time +from typing import Any + +from docfold.engines.base import DocumentEngine, EngineCapabilities, EngineResult, OutputFormat + +logger = logging.getLogger(__name__) + +_SUPPORTED_EXTENSIONS = {"pdf", "png", "jpg", "jpeg", "tiff", "tif", "bmp", "webp"} + + +class ChandraEngine(DocumentEngine): + """Adapter for Datalab Chandra OCR 2 (document → Markdown/HTML/JSON). + + Achieves 85.9% on the olmOCR benchmark — state of the art for open models. + Excels at handwriting, forms, tables, math, and 90+ languages. + + See https://github.com/datalab-to/chandra + """ + + def __init__( + self, + method: str = "vllm", + model: str = "datalab-to/chandra-ocr-2", + prompt_type: str = "ocr_layout", + vllm_url: str = "http://localhost:8000", + torch_dtype: str = "bfloat16", + device_map: str = "auto", + ) -> None: + self._method = method + self._model = model + self._prompt_type = prompt_type + self._vllm_url = vllm_url + self._torch_dtype = torch_dtype + self._device_map = device_map + self._hf_model: Any = None + + @property + def name(self) -> str: + return "chandra" + + @property + def supported_extensions(self) -> set[str]: + return _SUPPORTED_EXTENSIONS + + @property + def capabilities(self) -> EngineCapabilities: + return EngineCapabilities( + table_structure=True, + heading_detection=True, + reading_order=True, + ) + + def is_available(self) -> bool: + try: + import chandra # noqa: F401 + if self._method == "hf": + import torch # noqa: F401 + import transformers # noqa: F401 + return True + except Exception: + return False + + async def process( + self, + file_path: str, + output_format: OutputFormat = OutputFormat.MARKDOWN, + **kwargs: Any, + ) -> EngineResult: + import asyncio + + start = time.perf_counter() + + loop = asyncio.get_running_loop() + content, page_count = await loop.run_in_executor( + None, self._do_process, file_path, output_format, + ) + + elapsed_ms = int((time.perf_counter() - start) * 
1000) + + return EngineResult( + content=content, + format=output_format, + engine_name=self.name, + pages=page_count, + processing_time_ms=elapsed_ms, + metadata={"model": self._model, "method": self._method}, + ) + + def _do_process( + self, file_path: str, output_format: OutputFormat, + ) -> tuple[str, int]: + from pathlib import Path + + from chandra.model.schema import BatchInputItem + from chandra.output import parse_markdown + from PIL import Image + + ext = Path(file_path).suffix.lstrip(".").lower() + + # Convert input to list of PIL images (one per page) + images: list[Image.Image] = [] + if ext == "pdf": + images = self._pdf_to_images(file_path) + else: + images = [Image.open(file_path)] + + # Build batch + batch = [ + BatchInputItem(image=img, prompt_type=self._prompt_type) + for img in images + ] + + # Run inference + if self._method == "vllm": + results = self._infer_vllm(batch) + else: + results = self._infer_hf(batch) + + # Parse results + pages_text: list[str] = [] + for result in results: + md = parse_markdown(result.raw) + pages_text.append(md) + + page_count = len(pages_text) + full_text = "\n\n".join(pages_text) + + if output_format == OutputFormat.JSON: + import json + content = json.dumps( + {"pages": [{"page": i + 1, "text": t} for i, t in enumerate(pages_text)]}, + ensure_ascii=False, + ) + elif output_format == OutputFormat.HTML: + html_parts = [ + f"

{t}

" + for i, t in enumerate(pages_text) + ] + content = "" + "\n".join(html_parts) + "" + else: + content = full_text + + return content, page_count + + def _pdf_to_images(self, file_path: str) -> list: + """Convert PDF pages to PIL images.""" + from PIL import Image + + try: + import fitz + + doc = fitz.open(file_path) + images = [] + for page in doc: + pix = page.get_pixmap(dpi=300) + img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) + images.append(img) + doc.close() + return images + except ImportError: + pass + + from pdf2image import convert_from_path + + return convert_from_path(file_path, dpi=300) + + def _infer_vllm(self, batch: list) -> list: + """Run inference via vLLM server.""" + from chandra.model import InferenceManager + + manager = InferenceManager(method="vllm", vllm_url=self._vllm_url) + return manager.generate(batch) + + def _infer_hf(self, batch: list) -> list: + """Run inference via HuggingFace transformers (lazy model loading).""" + import torch + from chandra.model.hf import generate_hf + from transformers import AutoModelForImageTextToText, AutoProcessor + + if self._hf_model is None: + dtype_map = { + "bfloat16": torch.bfloat16, + "float16": torch.float16, + "float32": torch.float32, + } + dtype = dtype_map.get(self._torch_dtype, torch.bfloat16) + + model = AutoModelForImageTextToText.from_pretrained( + self._model, + dtype=dtype, + device_map=self._device_map, + ) + model.eval() + model.processor = AutoProcessor.from_pretrained(self._model) + model.processor.tokenizer.padding_side = "left" + self._hf_model = model + + return generate_hf(batch, self._hf_model) diff --git a/src/docfold/engines/router.py b/src/docfold/engines/router.py index 5096f6d..d23fe57 100644 --- a/src/docfold/engines/router.py +++ b/src/docfold/engines/router.py @@ -23,14 +23,14 @@ # the first *available* engine that supports the extension. 
_IMAGE_PRIORITY = [ - "surya", "paddleocr", "tesseract", "easyocr", "docling", "liteparse", + "chandra", "surya", "paddleocr", "tesseract", "easyocr", "docling", "liteparse", "mistral_ocr", "google_docai", "textract", "azure_docint", "zerox", "marker", ] _EXTENSION_PRIORITY: dict[str, list[str]] = { # --- PDF --- "pdf": [ - "docling", "mineru", "unstructured", "marker", + "docling", "mineru", "chandra", "unstructured", "marker", "llamaparse", "liteparse", "mistral_ocr", "firecrawl", "google_docai", "azure_docint", "textract", "zerox", "nougat", "surya", "pymupdf", "paddleocr", "tesseract", "easyocr", @@ -76,7 +76,7 @@ # Ultimate fallback when extension is unknown or missing from the map. _DEFAULT_FALLBACK = [ - "docling", "mineru", "unstructured", "marker", + "docling", "mineru", "chandra", "unstructured", "marker", "llamaparse", "liteparse", "mistral_ocr", "google_docai", "azure_docint", "textract", "zerox", "nougat", "surya", "pymupdf", "paddleocr", "tesseract", "easyocr", diff --git a/tests/engines/test_adapters.py b/tests/engines/test_adapters.py index 361a99d..b66aed4 100644 --- a/tests/engines/test_adapters.py +++ b/tests/engines/test_adapters.py @@ -943,6 +943,59 @@ async def test_process_api_error(self): os.unlink(tmp_path) +class TestChandraEngine: + def test_name(self): + from docfold.engines.chandra_engine import ChandraEngine + e = ChandraEngine() + assert e.name == "chandra" + + def test_supported_extensions(self): + from docfold.engines.chandra_engine import ChandraEngine + e = ChandraEngine() + exts = e.supported_extensions + assert "pdf" in exts + assert "png" in exts + assert "jpg" in exts + assert "jpeg" in exts + assert "tiff" in exts + assert "bmp" in exts + assert "webp" in exts + + def test_is_available_when_missing(self): + from docfold.engines.chandra_engine import ChandraEngine + e = ChandraEngine() + with patch.dict("sys.modules", {"chandra": None}): + result = e.is_available() + assert result is False  # chandra import is blocked above, so the engine must report unavailable + + def
test_config_stored(self): + from docfold.engines.chandra_engine import ChandraEngine + e = ChandraEngine( + method="hf", + model="datalab-to/chandra-ocr-2", + prompt_type="ocr_with_layout", + vllm_url="http://localhost:9000", + ) + assert e._method == "hf" + assert e._model == "datalab-to/chandra-ocr-2" + assert e._prompt_type == "ocr_with_layout" + assert e._vllm_url == "http://localhost:9000" + + def test_default_method_is_vllm(self): + from docfold.engines.chandra_engine import ChandraEngine + e = ChandraEngine() + assert e._method == "vllm" + + def test_capabilities(self): + from docfold.engines.chandra_engine import ChandraEngine + caps = ChandraEngine().capabilities + assert caps.table_structure is True + assert caps.heading_detection is True + assert caps.reading_order is True + assert caps.bounding_boxes is False + assert caps.confidence is False + + class TestAllEnginesImplementInterface: """Verify every adapter satisfies the DocumentEngine ABC.""" @@ -965,6 +1018,7 @@ class TestAllEnginesImplementInterface: "docfold.engines.surya_engine.SuryaEngine", "docfold.engines.docling_serve_engine.DoclingServeEngine", "docfold.engines.firecrawl_engine.FirecrawlEngine", + "docfold.engines.chandra_engine.ChandraEngine", ]) def test_has_required_attributes(self, engine_cls_path): module_path, cls_name = engine_cls_path.rsplit(".", 1)