"""Benchmark script — compares LiteParse vs PyMuPDF on synthetic PDF documents.

Generates test PDFs with known content, runs both engines, and measures:
- Processing speed (ms)
- Text extraction quality (CER, WER)
- Bounding box coverage
"""

from __future__ import annotations

import asyncio
import json
import os
import re
import sys
import tempfile
import time

# Ensure docfold is importable
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))

# Compiled once; used by normalize_text() for every engine/document pair.
_WHITESPACE_RE = re.compile(r"\s+")


def create_text_pdf(path: str, pages: list[dict]) -> None:
    """Create a PDF with known text content using PyMuPDF.

    Args:
        path: Destination file path of the generated PDF.
        pages: One dict per page. Each dict may carry a ``"blocks"`` list
            whose items have a required ``"text"`` and optional
            ``"fontsize"`` (default 11) and ``"font"`` (default ``"helv"``).

    Each block is written as a single line starting at x=72; the y cursor
    advances by ``fontsize * 1.5 + 8`` after every block.
    """
    import fitz

    doc = fitz.open()
    try:
        for page_data in pages:
            page = doc.new_page(width=612, height=792)
            y = 72
            for block in page_data.get("blocks", []):
                fontsize = block.get("fontsize", 11)
                page.insert_text(
                    (72, y),
                    block["text"],
                    fontsize=fontsize,
                    fontname=block.get("font", "helv"),
                )
                y += fontsize * 1.5 + 8
        doc.save(path)
    finally:
        # Release the document even when inserting or saving fails.
        doc.close()


def _build_document(
    tmpdir: str,
    name: str,
    filename: str,
    category: str,
    pages: list[dict],
) -> dict:
    """Write one synthetic PDF and return its benchmark metadata.

    The ground truth is derived from the very blocks that are rendered, so
    the reference text can never drift from the PDF content.
    """
    path = os.path.join(tmpdir, filename)
    create_text_pdf(path, pages)
    ground_truth = "\n".join(
        block["text"] for page in pages for block in page.get("blocks", [])
    )
    return {
        "name": name,
        "path": path,
        "ground_truth": ground_truth,
        "pages": len(pages),
        "category": category,
    }


def generate_benchmark_documents(tmpdir: str) -> list[dict]:
    """Generate synthetic PDFs and return metadata with ground truth.

    Args:
        tmpdir: Directory in which the PDF files are created.

    Returns:
        A list of dicts with the keys ``name``, ``path``, ``ground_truth``,
        ``pages`` and ``category`` — one per generated document.
    """
    # --- Doc 1: Simple single-page text ---
    invoice_blocks = [
        {"text": "Invoice Number: INV-2024-001", "fontsize": 14},
        {"text": "Date: January 15, 2024", "fontsize": 11},
        {"text": "Bill To: Acme Corporation", "fontsize": 11},
        {"text": "Amount Due: $1,250.00", "fontsize": 11},
        {"text": "Payment Terms: Net 30", "fontsize": 11},
    ]

    # --- Doc 2: Multi-page document ---
    paragraphs = [
        "Chapter 1: Introduction to Document Processing",
        "Document processing is the task of converting unstructured documents into structured data formats.",
        "This involves text extraction, layout analysis, and semantic understanding of content.",
        "Modern approaches use deep learning models for accurate extraction.",
    ]
    page1_blocks = [{"text": p, "fontsize": 12} for p in paragraphs[:2]]
    page2_blocks = [{"text": p, "fontsize": 12} for p in paragraphs[2:]]

    # --- Doc 3: Dense text ---
    dense_lines = [
        "Financial Summary Report Q4 2024",
        "Total Revenue: $4,523,891.00",
        "Operating Expenses: $2,187,432.50",
        "Net Income: $2,336,458.50",
        "Gross Margin: 51.7%",
        "Year-over-Year Growth: 23.4%",
        "Accounts Receivable: $892,100.00",
        "Accounts Payable: $445,200.00",
        "Cash and Equivalents: $3,112,750.00",
        "Total Assets: $12,445,890.00",
    ]

    # --- Doc 4: Mixed font sizes (headings + body) ---
    mixed_blocks = [
        {"text": "Annual Report 2024", "fontsize": 18},
        {"text": "Executive Summary", "fontsize": 14},
        {"text": "Our company achieved record growth this fiscal year with revenue exceeding expectations.", "fontsize": 10},
        {"text": "Key Metrics", "fontsize": 14},
        {"text": "Customer satisfaction score improved from 87% to 94%.", "fontsize": 10},
        {"text": "Employee retention rate reached 96%, the highest in company history.", "fontsize": 10},
    ]

    return [
        _build_document(
            tmpdir, "simple_text", "simple_text.pdf", "invoice",
            [{"blocks": invoice_blocks}],
        ),
        _build_document(
            tmpdir, "multi_page", "multi_page.pdf", "report",
            [{"blocks": page1_blocks}, {"blocks": page2_blocks}],
        ),
        _build_document(
            tmpdir, "dense_financial", "dense_text.pdf", "financial",
            [{"blocks": [{"text": line, "fontsize": 10} for line in dense_lines]}],
        ),
        _build_document(
            tmpdir, "mixed_formatting", "mixed_formatting.pdf", "report",
            [{"blocks": mixed_blocks}],
        ),
    ]


def _edit_distance(reference, predicted) -> int:
    """Return the Levenshtein distance between two sequences.

    Works on any indexable sequences of comparable items (characters of a
    string, or a list of words). Only two DP rows are kept alive.
    """
    prev_row = list(range(len(reference) + 1))
    for j, pred_item in enumerate(predicted, start=1):
        row = [j]
        for i, ref_item in enumerate(reference, start=1):
            cost = 0 if ref_item == pred_item else 1
            row.append(min(prev_row[i] + 1, row[i - 1] + 1, prev_row[i - 1] + cost))
        prev_row = row
    return prev_row[-1]


def compute_cer(predicted: str, reference: str) -> float:
    """Character Error Rate — Levenshtein distance / reference length.

    An empty reference yields 0.0 when the prediction is empty too, and 1.0
    otherwise (the ratio would be undefined).
    """
    if not reference:
        return 0.0 if not predicted else 1.0
    return _edit_distance(reference, predicted) / len(reference)


def compute_wer(predicted: str, reference: str) -> float:
    """Word Error Rate — word-level Levenshtein distance / reference words.

    Both texts are split on whitespace. An empty reference yields 0.0 when
    the prediction has no words either, and 1.0 otherwise.
    """
    ref_words = reference.split()
    pred_words = predicted.split()
    if not ref_words:
        return 0.0 if not pred_words else 1.0
    return _edit_distance(ref_words, pred_words) / len(ref_words)


def normalize_text(text: str) -> str:
    """Normalize whitespace for fair comparison.

    Leading/trailing whitespace is stripped and every internal run of
    whitespace is collapsed to a single space.
    """
    return _WHITESPACE_RE.sub(" ", text.strip())


async def run_engine(engine, file_path: str, fmt):
    """Run an engine and return (result, error).

    Exactly one element of the returned pair is ``None``: on success the
    error is ``None``; on any exception the result is ``None`` and the error
    is the exception message, so one failing engine does not abort the run.
    """
    try:
        result = await engine.process(file_path, output_format=fmt)
        return result, None
    except Exception as exc:
        return None, str(exc)


def _average(rows: list[dict], key: str) -> float:
    """Return the mean of ``row[key]`` over *rows*, or 0 for no rows."""
    if not rows:
        return 0
    return sum(row[key] for row in rows) / len(rows)


def _write_report(report_path: str, report: dict) -> None:
    """Write *report* as pretty-printed UTF-8 JSON, creating the folder."""
    report_dir = os.path.dirname(report_path)
    if report_dir:
        # A fresh checkout may not have the docs/ folder yet.
        os.makedirs(report_dir, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be explicit instead of depending on the platform default.
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)


async def main():
    """Run every available engine on the synthetic corpus and report.

    Prints per-document and summary tables and writes the detailed results
    to ``docs/benchmark_results.json`` next to this script.
    """
    from docfold.engines.base import OutputFormat
    from docfold.engines.liteparse_engine import LiteParseEngine
    from docfold.engines.pymupdf_engine import PyMuPDFEngine

    # Use --no-ocr for digital PDFs (Tesseract.js may not work in all envs)
    liteparse = LiteParseEngine(ocr_enabled=False)
    pymupdf = PyMuPDFEngine()

    engines = []
    if pymupdf.is_available():
        engines.append(pymupdf)
    else:
        print("WARNING: PyMuPDF not available, skipping")
    if liteparse.is_available():
        engines.append(liteparse)
    else:
        print("WARNING: LiteParse not available (install: npm i -g @llamaindex/liteparse)")

    if not engines:
        print("ERROR: No engines available for benchmarking")
        return

    print(f"Engines: {[e.name for e in engines]}")
    print()

    # Collect all results
    all_results: dict[str, list[dict]] = {e.name: [] for e in engines}

    with tempfile.TemporaryDirectory() as tmpdir:
        documents = generate_benchmark_documents(tmpdir)
        print(f"Generated {len(documents)} benchmark documents")
        print("=" * 90)

        for doc in documents:
            print(f"\n{'─' * 90}")
            print(f"Document: {doc['name']} | Pages: {doc['pages']} | Category: {doc['category']}")
            print(f"{'─' * 90}")

            # The reference does not depend on the engine: normalize it once.
            gt_norm = normalize_text(doc["ground_truth"])

            for engine in engines:
                result, error = await run_engine(
                    engine, doc["path"], OutputFormat.MARKDOWN
                )

                if error:
                    print(f" {engine.name:<14} ERROR: {error}")
                    all_results[engine.name].append({
                        "doc": doc["name"],
                        "error": error,
                    })
                    continue

                extracted = normalize_text(result.content)
                cer = compute_cer(extracted, gt_norm)
                wer = compute_wer(extracted, gt_norm)
                bbox_count = len(result.bounding_boxes) if result.bounding_boxes else 0
                time_ms = result.processing_time_ms

                all_results[engine.name].append({
                    "doc": doc["name"],
                    "time_ms": time_ms,
                    "cer": round(cer, 4),
                    "wer": round(wer, 4),
                    "bbox_count": bbox_count,
                    "content_length": len(extracted),
                    "pages": result.pages,
                })

                print(
                    f" {engine.name:<14} "
                    f"time={time_ms:>6}ms "
                    f"CER={cer:.4f} "
                    f"WER={wer:.4f} "
                    f"BBoxes={bbox_count:>3} "
                    f"len={len(extracted):>5}"
                )

    # Summary
    print(f"\n{'=' * 90}")
    print("BENCHMARK SUMMARY")
    print(f"{'=' * 90}")
    print(
        f" {'Engine':<14} {'Avg Time':>10} {'Avg CER':>10} {'Avg WER':>10} "
        f"{'Avg BBoxes':>11} {'Errors':>8}"
    )
    print(f" {'─' * 68}")

    summary = {}
    for engine_name, results in all_results.items():
        successes = [r for r in results if "error" not in r]
        error_count = len(results) - len(successes)

        avg_time = _average(successes, "time_ms")
        avg_cer = _average(successes, "cer")
        avg_wer = _average(successes, "wer")
        avg_bbox = _average(successes, "bbox_count")

        summary[engine_name] = {
            "avg_time_ms": round(avg_time, 1),
            "avg_cer": round(avg_cer, 4),
            "avg_wer": round(avg_wer, 4),
            "avg_bbox_count": round(avg_bbox, 1),
            "errors": error_count,
            "successes": len(successes),
            "results": results,
        }

        print(
            f" {engine_name:<14} {avg_time:>9.1f}ms {avg_cer:>10.4f} {avg_wer:>10.4f} "
            f"{avg_bbox:>11.1f} {error_count:>8}"
        )

    # Write JSON report
    report_path = os.path.join(
        os.path.dirname(__file__), "docs", "benchmark_results.json"
    )
    report = {
        "benchmark_date": time.strftime("%Y-%m-%d %H:%M:%S"),
        "engines": list(all_results.keys()),
        "documents": [
            {"name": d["name"], "pages": d["pages"], "category": d["category"]}
            for d in documents
        ],
        "summary": summary,
    }
    _write_report(report_path, report)
    print(f"\nDetailed report saved to: {report_path}")


if __name__ == "__main__":
    asyncio.run(main())
"engines": [ + "pymupdf", + "liteparse" + ], + "documents": [ + { + "name": "simple_text", + "pages": 1, + "category": "invoice" + }, + { + "name": "multi_page", + "pages": 2, + "category": "report" + }, + { + "name": "dense_financial", + "pages": 1, + "category": "financial" + }, + { + "name": "mixed_formatting", + "pages": 1, + "category": "report" + } + ], + "summary": { + "pymupdf": { + "avg_time_ms": 4.5, + "avg_cer": 0.0, + "avg_wer": 0.0, + "avg_bbox_count": 6.2, + "errors": 0, + "successes": 4, + "results": [ + { + "doc": "simple_text", + "time_ms": 10, + "cer": 0.0, + "wer": 0.0, + "bbox_count": 5, + "content_length": 121, + "pages": 1 + }, + { + "doc": "multi_page", + "time_ms": 3, + "cer": 0.0, + "wer": 0.0, + "bbox_count": 4, + "content_length": 300, + "pages": 2 + }, + { + "doc": "dense_financial", + "time_ms": 3, + "cer": 0.0, + "wer": 0.0, + "bbox_count": 10, + "content_length": 298, + "pages": 1 + }, + { + "doc": "mixed_formatting", + "time_ms": 2, + "cer": 0.0, + "wer": 0.0, + "bbox_count": 6, + "content_length": 260, + "pages": 1 + } + ] + }, + "liteparse": { + "avg_time_ms": 382.0, + "avg_cer": 0.0, + "avg_wer": 0.0, + "avg_bbox_count": 31.8, + "errors": 0, + "successes": 4, + "results": [ + { + "doc": "simple_text", + "time_ms": 426, + "cer": 0.0, + "wer": 0.0, + "bbox_count": 18, + "content_length": 121, + "pages": 1 + }, + { + "doc": "multi_page", + "time_ms": 359, + "cer": 0.0, + "wer": 0.0, + "bbox_count": 39, + "content_length": 300, + "pages": 2 + }, + { + "doc": "dense_financial", + "time_ms": 364, + "cer": 0.0, + "wer": 0.0, + "bbox_count": 33, + "content_length": 298, + "pages": 1 + }, + { + "doc": "mixed_formatting", + "time_ms": 379, + "cer": 0.0, + "wer": 0.0, + "bbox_count": 37, + "content_length": 260, + "pages": 1 + } + ] + } + } +} \ No newline at end of file diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 6ae98fd..82abecb 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -17,6 +17,7 @@ This guide helps you 
choose the right document processing engine for your use ca | **EasyOCR** | Local | Apache-2.0 | ★☆☆ | ★★★ | ☆☆☆ | ☆☆☆ | ★★★ (80+) | Medium | Free | | **Unstructured** | Local/SaaS | Apache-2.0 | ★★☆ | ★★☆ | ★★☆ | ★☆☆ | ★★☆ | Medium | Free / Paid API | | **LlamaParse** | SaaS | Paid | ★★★ | ★★★ | ★★★ | ★★★ | ★★☆ | Fast | ~$3/1K pages | +| **LiteParse** | Local | Apache-2.0 | ★★★ | ★★☆ | ★★☆ | ☆☆☆ | ★★☆ | Fast | Free | | **Mistral OCR** | SaaS | Paid | ★★★ | ★★★ | ★★★ | ★★★ | ★★★ | Fast | ~$1/1K pages | | **Zerox** | VLM | MIT | ★★★ | ★★★ | ★★☆ | ★★☆ | ★★★ | Slow | VLM API cost | | **Nougat** | Local | MIT | ★★★ | ★★☆ | ★★☆ | ★★★ | ★☆☆ | Slow | Free | @@ -123,6 +124,16 @@ This guide helps you choose the right document processing engine for your use ca - **Install:** `pip install docfold[llamaparse]` - **Links:** [Docs](https://docs.llamaindex.ai/en/stable/llama_cloud/llama_parse/) +### LiteParse (LlamaIndex) + +**Best for:** Fast local PDF parsing with bounding boxes, no cloud dependencies. + +- **Strengths:** Fast, lightweight local parser built on PDF.js. Bounding boxes with confidence scores. Wide format support (PDF, Office, images) via LibreOffice conversion. Flexible OCR integration (Tesseract.js built-in, or connect PaddleOCR/EasyOCR servers). Apache 2.0 license. No API key required. +- **Weaknesses:** Requires Node.js 18+. No formula recognition. Table extraction is basic (no cell-level structure). Non-Python — runs as subprocess. Needs LibreOffice for non-PDF formats. +- **GPU:** Not needed. +- **Install:** `npm i -g @llamaindex/liteparse` then `pip install docfold[liteparse]` +- **Links:** [GitHub](https://github.com/run-llama/liteparse) + ### Mistral OCR **Best for:** High-accuracy document understanding with strong multilingual support. 
@@ -231,6 +242,7 @@ Capabilities each engine can populate in `EngineResult`: | Tesseract | — | — | — | — | — | — | | Unstructured | — | — | — | ✅ | ✅ | — | | LlamaParse | — | — | — | ✅ | ✅ | — | +| LiteParse | ✅ | ✅ | — | — | — | — | | Mistral OCR | — | — | — | ✅ | ✅ | — | | Zerox | — | — | — | — | — | — | | **Textract** | ✅ | ✅ | — | ✅ | — | ✅ | @@ -263,6 +275,7 @@ Capabilities each engine can populate in `EngineResult`: | Tesseract | ✅* | — | — | — | — | ✅ | — | — | — | | Unstructured | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | — | ✅ | | LlamaParse | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | — | — | ✅ | +| LiteParse | ✅ | ✅ | ✅ | ✅ | — | ✅ | — | — | — | | Mistral OCR | ✅ | — | — | — | — | ✅ | — | — | — | | Zerox | ✅ | — | — | — | — | ✅ | — | — | — | | **Textract** | ✅ | — | — | — | — | ✅ | — | — | — | @@ -290,6 +303,7 @@ Capabilities each engine can populate in `EngineResult`: | Unstructured (hi_res) | 8 GB | 16 GB | Optional | ~2 GB | | Nougat | 8 GB | 16 GB | CUDA 8+ GB | ~1.5 GB | | Surya | 4 GB | 8 GB | Optional | ~1 GB | +| LiteParse | 512 MB | 1 GB | — | ~100 MB (Node.js) | *SaaS engines (LlamaParse, Mistral OCR, Zerox, Marker API, Textract, Google DocAI, Azure DocInt) have no local hardware requirements.* @@ -311,6 +325,7 @@ Capabilities each engine can populate in `EngineResult`: | Marker API | SaaS | ~$1 | | Mistral OCR | SaaS | ~$1 (token-based) | | LlamaParse | SaaS | ~$3 (free: 1K/day) | +| LiteParse | Free | $0 (Node.js runtime) | | AWS Textract | SaaS | ~$1.50 | | Google Doc AI | SaaS | ~$1.50 | | Azure Doc Intel | SaaS | ~$1.50 | diff --git a/pyproject.toml b/pyproject.toml index 10f5ea5..f2ea736 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,9 @@ unstructured = [ llamaparse = [ "llama-parse>=0.5", ] +liteparse = [ + # No Python deps — requires Node.js 18+ and: npm i -g @llamaindex/liteparse +] mistral-ocr = [ "mistralai>=1.0", ] @@ -106,7 +109,7 @@ evaluation = [ "psutil>=5.9", # Memory measurement ] all = [ - 
"docfold[docling,mineru,marker,pymupdf,paddleocr,tesseract,easyocr,unstructured,llamaparse,mistral-ocr,textract,google-docai,azure-docint,nougat,surya,firecrawl,evaluation]", + "docfold[docling,mineru,marker,pymupdf,paddleocr,tesseract,easyocr,unstructured,llamaparse,liteparse,mistral-ocr,textract,google-docai,azure-docint,nougat,surya,firecrawl,evaluation]", # Note: zerox excluded from [all] — py-zerox requires Python 3.11+ # Install separately: pip install docfold[zerox] ] diff --git a/src/docfold/cli.py b/src/docfold/cli.py index 39a819c..ac32dc3 100644 --- a/src/docfold/cli.py +++ b/src/docfold/cli.py @@ -136,6 +136,12 @@ def _build_router(): except Exception: pass + try: + from docfold.engines.liteparse_engine import LiteParseEngine + router.register(LiteParseEngine()) + except Exception: + pass + try: from docfold.engines.mistral_ocr_engine import MistralOCREngine router.register(MistralOCREngine()) diff --git a/src/docfold/engines/liteparse_engine.py b/src/docfold/engines/liteparse_engine.py new file mode 100644 index 0000000..9791eac --- /dev/null +++ b/src/docfold/engines/liteparse_engine.py @@ -0,0 +1,219 @@ +"""LiteParse engine adapter — fast local document parsing via CLI. + +LiteParse is a standalone OSS tool by LlamaIndex for high-speed PDF parsing +with bounding boxes. It runs locally with no API key required. 
class LiteParseEngine(DocumentEngine):
    """Adapter for LiteParse (run-llama/liteparse).

    Calls the ``lit parse`` CLI as a subprocess and parses the structured
    JSON output. Supports bounding boxes and confidence scores out of the box.
    """

    def __init__(
        self,
        cli_path: str = "lit",
        ocr_enabled: bool = True,
        ocr_language: str = "en",
        dpi: int = 150,
        num_workers: int | None = None,
        max_pages: int | None = None,
    ) -> None:
        """Store the CLI settings; nothing is executed until ``process``.

        Args:
            cli_path: Name or path of the LiteParse executable.
            ocr_enabled: When False, ``--no-ocr`` is passed to the CLI.
            ocr_language: Value for ``--ocr-language`` (only sent when OCR
                is enabled).
            dpi: Value for ``--dpi``; omitted from the command line when it
                equals 150.
            num_workers: Value for ``--num-workers``; omitted when None.
            max_pages: Value for ``--max-pages``; omitted when None.
        """
        self._cli_path = cli_path
        self._ocr_enabled = ocr_enabled
        self._ocr_language = ocr_language
        self._dpi = dpi
        self._num_workers = num_workers
        self._max_pages = max_pages

    @property
    def name(self) -> str:
        """Engine identifier."""
        return "liteparse"

    @property
    def supported_extensions(self) -> set[str]:
        """File extensions (without the dot) this engine accepts."""
        # Hand out a copy so callers cannot mutate the shared module-level set.
        return set(_SUPPORTED_EXTENSIONS)

    @property
    def capabilities(self) -> EngineCapabilities:
        """Declare bounding-box and confidence support."""
        return EngineCapabilities(bounding_boxes=True, confidence=True)

    def is_available(self) -> bool:
        """Return True when the configured CLI can be found on ``PATH``."""
        return shutil.which(self._cli_path) is not None

    async def process(
        self,
        file_path: str,
        output_format: OutputFormat = OutputFormat.MARKDOWN,
        **kwargs: Any,
    ) -> EngineResult:
        """Run the CLI on *file_path* and return the parsed result.

        Args:
            file_path: Document to parse.
            output_format: Requested output format. ``OutputFormat.TEXT``
                uses the CLI's plain-text mode; every other format requests
                JSON so bounding boxes can be extracted.
            **kwargs: Accepted for interface compatibility; not used here.

        Raises:
            RuntimeError: If the CLI exits with a non-zero status.
        """
        start = time.perf_counter()

        # For text output we use --format text; for everything else use json
        # so we can extract bounding boxes.
        use_json = output_format != OutputFormat.TEXT

        cmd = self._build_command(file_path, use_json=use_json)

        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        try:
            stdout, stderr = await proc.communicate()
        except BaseException:
            # Cancelled or interrupted while waiting: do not leave the CLI
            # running in the background.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass  # the process exited in the meantime
                await proc.wait()
            raise

        if proc.returncode != 0:
            err_msg = stderr.decode(errors="replace").strip()
            raise RuntimeError(
                f"liteparse failed (exit {proc.returncode}): {err_msg}"
            )

        raw = stdout.decode(errors="replace")
        elapsed_ms = int((time.perf_counter() - start) * 1000)

        if use_json:
            return self._parse_json_output(raw, output_format, elapsed_ms)

        return EngineResult(
            content=raw,
            format=output_format,
            engine_name=self.name,
            processing_time_ms=elapsed_ms,
            metadata={"cli": self._cli_path},
        )

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _build_command(self, file_path: str, *, use_json: bool) -> list[str]:
        """Assemble the ``parse`` argument vector for the configured CLI.

        The command is returned as a list (never a shell string), so the
        file path is passed to the process as a single argument.
        """
        cmd = [self._cli_path, "parse", file_path]
        cmd += ["--format", "json" if use_json else "text"]

        if self._ocr_enabled:
            cmd += ["--ocr-language", self._ocr_language]
        else:
            cmd.append("--no-ocr")

        # 150 is this adapter's default DPI; the flag is only sent when the
        # caller asked for something else.
        if self._dpi != 150:
            cmd += ["--dpi", str(self._dpi)]

        if self._num_workers is not None:
            cmd += ["--num-workers", str(self._num_workers)]

        if self._max_pages is not None:
            cmd += ["--max-pages", str(self._max_pages)]

        return cmd

    @staticmethod
    def _extract_json(raw: str) -> str:
        """Extract JSON object from raw output that may contain log lines.

        Returns *raw* from its first ``{`` onwards, or *raw* unchanged when
        it contains no ``{``. A log line that itself contains ``{`` before
        the payload is not handled.
        """
        # The CLI may print log/progress lines before the JSON.
        # Find the first '{' that starts the JSON object.
        idx = raw.find("{")
        if idx == -1:
            return raw
        return raw[idx:]

    # NOTE: _parse_json_output() follows in the source file; it extends past
    # this block and is intentionally left untouched.
+ idx = raw.find("{") + if idx == -1: + return raw + return raw[idx:] + + def _parse_json_output( + self, + raw: str, + output_format: OutputFormat, + elapsed_ms: int, + ) -> EngineResult: + data = json.loads(self._extract_json(raw)) + pages = data.get("pages", []) + + texts: list[str] = [] + bboxes: list[dict[str, Any]] = [] + + for page_data in pages: + page_num = page_data.get("page", 1) + # LiteParse uses "text" at page level (not nested in "content") + page_text = page_data.get("text", "") + texts.append(page_text) + + pw = page_data.get("width") + ph = page_data.get("height") + + # LiteParse provides "textItems" with {text, x, y, width, height} + # and "boundingBoxes" with {x1, y1, x2, y2} + for idx, item in enumerate(page_data.get("textItems", [])): + x = item.get("x", 0) + y = item.get("y", 0) + w = item.get("width", 0) + h = item.get("height", 0) + bboxes.append( + BoundingBox( + type="Text", + bbox=[x, y, x + w, y + h], + page=page_num, + text=item.get("text", "").strip(), + id=f"p{page_num}-i{idx}", + confidence=item.get("confidence"), + page_width=pw, + page_height=ph, + ).to_dict() + ) + + full_text = "\n\n".join(texts) + page_count = len(pages) + + if output_format == OutputFormat.JSON: + content = json.dumps(data, ensure_ascii=False) + elif output_format == OutputFormat.HTML: + html_parts = [ + f"
{t}