diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 1e22b61..ee6a82f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -59,6 +59,22 @@ jobs: name: dist path: dist + - name: Find hand-written release notes (if any) + id: notes + run: | + # Use docs/launch/RELEASE_NOTES_.md as the release body when + # present, otherwise fall back to GitHub's auto-generated notes. + NOTES="docs/launch/RELEASE_NOTES_${GITHUB_REF_NAME}.md" + if [ -f "$NOTES" ]; then + echo "path=$NOTES" >> "$GITHUB_OUTPUT" + echo "auto=false" >> "$GITHUB_OUTPUT" + echo "Found release notes: $NOTES" + else + echo "path=" >> "$GITHUB_OUTPUT" + echo "auto=true" >> "$GITHUB_OUTPUT" + echo "No hand-written release notes at $NOTES; using auto-generated." + fi + - name: Create GitHub release uses: softprops/action-gh-release@v2 with: @@ -66,7 +82,8 @@ jobs: dist/*.whl dist/*.tar.gz dist/testBench-v*.zip - generate_release_notes: true + body_path: ${{ steps.notes.outputs.path }} + generate_release_notes: ${{ steps.notes.outputs.auto == 'true' }} pypi: needs: build diff --git a/.gitignore b/.gitignore index aae14c2..67bad3c 100644 --- a/.gitignore +++ b/.gitignore @@ -72,3 +72,9 @@ rust/**/target/ rust/**/Cargo.lock.bak **/.cargo-lock **/.fingerprint/ + +# Downloaded benchmark corpora (see scripts/download_corpora.sh) +data/corpora/ + +# Conductor / Claude Code transient state +.claude/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ef8513..918d0d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,6 +47,98 @@ Template for a new release (copy this block, fill in, move Unreleased items in): Nothing yet. Open a PR and add your entry under the appropriate heading. +## [0.2.0] — 2026-05-11 + +**Benchmark + retrievability release.** Adds a head-to-head benchmark against +[Docling](https://github.com/DS4SD/docling) on the [SpreadsheetBench](https://github.com/RUCKBReasoning/SpreadsheetBench) +corpus (912 instances, 5,458 xlsx files) and fixes three rendering bugs that +were silently torpedoing RAG retrieval. ks-xlsx-parser parses **99.945%** of +SpreadsheetBench and **ties Docling at recall@1 / wins at recall@3 (+2.7 pp) +and recall@5 (+1.8 pp)**, plus 36.9% citation-grade geometric recall (Docling +0%, structurally — no A1 anchors). + +### Added +- `tests/benchmarks/adapters/docling_adapter.py` — Docling adapter speaking the + same NDJSON-worker protocol as `ks_adapter.py` (#TBD). +- `tests/benchmarks/_runner.py`: `docling_runner` factory wired into + `vs_hucre.py`'s `--parsers` dispatch. +- `scripts/eval_retrieval.py` — retrieval-recall benchmark over + SpreadsheetBench's `(instruction, data_position, answer_position)` triples. + Uses `sentence-transformers` (default `BAAI/bge-small-en-v1.5`) and computes + geometric overlap + numeric/date/boolean-normalized text-match recall@k. + Persistent docling subprocess with hard-kill timeout — PyTorch's table-rec + loop holds the GIL through C-land so in-process timeouts don't work. +- `scripts/summarize_retrieval.py` — re-aggregate a `results.ndjson` into + `summary.json` / `summary.md` if a long run is interrupted. +- `scripts/download_corpora.sh`: fetches SpreadsheetBench v0.1 (~96 MB tar.gz) + into `data/corpora/spreadsheetbench/` (gitignored). +- `tests/benchmarks/README.md` — adapter design notes + benchmark how-to. +- `tests/benchmarks/reports/COMPARISON.md` — head-to-head report incl. + methodology, capability matrix, caveats. +- `Makefile`: `bench`, `bench-robust`, `bench-retrieval` targets. 
+ +### Fixed +- `src/rendering/text_renderer.py`: numeric cells now render the raw value + (`1272`) instead of Excel's display-formatted string (`1,272.00`). The + display format defeated substring-match retrieval for the most common RAG + query shape ("what was the value in 2020?" → user types `1272`). +- `src/rendering/text_renderer.py`: the `[=]` formula marker no longer + spuriously inflates a cell past its column width, which used to trigger + a sci-notation fallback (`1.272000e+03`) on perfectly small values. + Column widths now computed using the same rendering pipeline data rows + will use, so the long-value path only triggers on genuinely-too-wide + values. +- `src/rendering/text_renderer.py`: dates render as ISO `YYYY-MM-DD` and drop + the spurious `00:00:00` time component on midnight datetimes. +- `src/rendering/text_renderer.py`: embedded newlines inside header cells + (e.g. `"租金\n天数"`) collapse to spaces so they don't tear apart the + Markdown grid (regression fixed for `租赁收入计提表.xlsx`-class layouts). +- `src/chunking/segmenter.py`: removed `_detect_style_boundaries`. The + function split a coherent table into 5 fragments at fill-color band + boundaries (year-banding, alternating-row shading), shedding header + context from data rows. The connected-components + gap detection + already handles real boundaries; fill banding is not a semantic one. +- `src/parsers/cell_parser.py`: `GradientFill` cells no longer crash the + sheet parser. Accessing `.patternType` on a `GradientFill` (vs the + expected `PatternFill`) raised `AttributeError`, which propagated up and + killed every cell on the sheet. We don't model gradients but we no + longer drop the sheet because of them (caught by SpreadsheetBench + instance `118-8`, 8 sheets / 1,244 cells previously lost). + +### Changed +- `tests/benchmarks/_schema.py`: `formulas` is now nullable on `status=ok` + records. Parsers that don't model formulas (Docling, Marker) can now + emit valid `BenchmarkRecord`s without tripping schema validation. The + schema's load-bearing `None` vs `0` distinction is preserved: `None` = + "feature not modeled by this parser", `0` = "modeled and observed zero". + +### Removed +- `scripts/compare_docling.py` — superseded by the unified `tests/benchmarks/` + framework + `eval_retrieval.py`. The old script's `ScoreCard` composite + score was structurally biased (formula-preservation gave Docling a 0 by + definition while contributing 20/100 points; header-propagation used + different proxies for each parser); replaced by parser-agnostic + text-match and geometric recall metrics. + +### Performance +- ks-xlsx-parser is now ~5% faster on average parse time on SpreadsheetBench + than Docling (251 ms vs 265 ms mean), while producing a richer output + (formulas, dependency graph, charts, named ranges, etc.). + +### Docs +- `tests/benchmarks/README.md` — new — methodology + adapter design. +- `tests/benchmarks/reports/COMPARISON.md` — new — head-to-head report. +- README — new "Benchmark — ks-xlsx-parser vs Docling on SpreadsheetBench" + section near the top with the headline table. + +### Internal +- `tests/test_rendering.py`: updated `test_numeric_cells_use_scientific_notation_not_truncation` + to assert the new raw-numeric rendering (test renamed + `test_numeric_cells_render_raw_not_display_formatted`). +- `.gitignore`: `data/corpora/` (downloaded benchmark corpora; can run to + several GB). +- `Makefile`: `bench`, `bench-robust`, `bench-retrieval` targets. 
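+A two-line sketch of the `None`-vs-`0` convention from the Changed entry
+(records simplified to plain dicts for illustration):
+
+```python
+# None = "feature not modeled by this parser"; 0 = "modeled and observed zero".
+docling_rec = {"status": "ok", "formulas": None}  # Docling: formulas not modeled
+ks_rec = {"status": "ok", "formulas": 0}          # ks: modeled, none found
+
+modeled = [r for r in (docling_rec, ks_rec) if r["formulas"] is not None]
+assert len(modeled) == 1  # only ks enters a formula-coverage denominator
+```
+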
+ ## [0.1.1] — 2026-04-17 **First public release.** MIT-licensed, open-sourced under the diff --git a/Makefile b/Makefile index d210ccc..d2642d6 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help install test test-ci testbench testbench-build testbench-zip lint format typecheck clean corpus-download +.PHONY: help install test test-ci testbench testbench-build testbench-zip lint format typecheck clean corpus-download bench-robust bench-retrieval bench PYTHON ?= python PKG_VERSION := $(shell $(PYTHON) -c "import tomllib, pathlib; print(tomllib.loads(pathlib.Path('pyproject.toml').read_text())['project']['version'])") @@ -20,6 +20,10 @@ help: @echo " make typecheck mypy" @echo "" @echo " make corpus-download Fetch public XLSX corpora for extended robustness" + @echo "" + @echo " make bench-robust Robustness on SpreadsheetBench (ks vs docling, ~20 min)" + @echo " make bench-retrieval Retrieval recall on SpreadsheetBench (ks vs docling, ~40 min)" + @echo " make bench Run both benchmarks back-to-back" install: $(PYTHON) -m pip install -e ".[dev,api]" @@ -62,3 +66,16 @@ clean: corpus-download: ./scripts/download_corpora.sh + +bench-robust: + @test -d data/corpora/spreadsheetbench || (echo "Corpus missing. Run 'make corpus-download' first." && exit 1) + PYTHONPATH=src $(PYTHON) -m tests.benchmarks.vs_hucre \ + --corpus data/corpora/spreadsheetbench --parsers ks,docling \ + --per-file-timeout 120 \ + --out tests/benchmarks/reports/spreadsheetbench + +bench-retrieval: + @test -d data/corpora/spreadsheetbench || (echo "Corpus missing. Run 'make corpus-download' first." && exit 1) + PYTHONPATH=src $(PYTHON) scripts/eval_retrieval.py --parsers ks,docling + +bench: bench-robust bench-retrieval diff --git a/README.md b/README.md index 5d365c0..ee419be 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,90 @@ graph that drops straight into [LangChain](https://www.langchain.com/), --- +## 🏁 Benchmark — ks-xlsx-parser vs Docling on SpreadsheetBench + +

+<!-- badges: SpreadsheetBench · Parse success · Recall@3 vs Docling · A1 anchors -->

+ +Apples-to-apples on [SpreadsheetBench v0.1](https://github.com/RUCKBReasoning/SpreadsheetBench): 912 real-world task instances curated from ExcelHome / Mr.Excel / r/excel. For each instance we parse the input `.xlsx`, embed every chunk with `BAAI/bge-small-en-v1.5`, then check whether the chunk containing the ground-truth answer is in the top-k by similarity to the question. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Metric | 🟢 ks-xlsx-parser | ⚪ Docling 2.93 | Δ |
+|---|---:|---:|---:|
+| 📊 Parse success<br>5,458-file corpus | **99.945%**<br>5,455 ok · 3 timeouts · 0 errors | not run at scale | — |
+| 🎯 Recall@1<br>text-match | 0.580 | 0.579 | tied |
+| 🎯 Recall@3<br>text-match | **0.697** | 0.670 | +2.7 pp |
+| 🎯 Recall@5<br>text-match | **0.704** | 0.686 | +1.8 pp |
+| 📍 Geometric Recall@5<br>chunk's `sheet!A1:Z99` overlaps the ground-truth range | **0.369** | 0.000 | citation-grade only |
+| ⚡ Mean parse time<br>per file | **251 ms** | 265 ms | ~5% faster |
+| 🧱 Parser errors<br>across 912 instances | 0 | 0 | — |
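+
+A minimal sketch of the embed → rank → check loop described above. Illustrative only — the chunk texts, anchors, and answer string are made up; the real harness is [`scripts/eval_retrieval.py`](scripts/eval_retrieval.py).
+
+```python
+from sentence_transformers import SentenceTransformer
+import numpy as np
+
+model = SentenceTransformer("BAAI/bge-small-en-v1.5")
+
+question = "What was the rent revenue in 2020?"
+# Stand-ins for real parser output: (A1 anchor, rendered chunk text).
+chunks = [
+    ("Revenue!A1:D12", "| Year | Rent |\n| 2020 | 1272 |"),
+    ("Notes!A1:B4", "Depreciation schedule, straight-line over 5 years"),
+]
+answer = "1272"  # ground-truth cell value, raw-rendered
+
+texts = [text for _, text in chunks]
+emb = model.encode(texts, normalize_embeddings=True)
+query = model.encode([question], normalize_embeddings=True)[0]
+ranking = np.argsort(-(emb @ query))  # cosine similarity, best first
+
+k = 3
+print(f"recall@{k} hit:", any(answer in texts[i] for i in ranking[:k]))
+```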
+ +### 💡 What the numbers mean + +- **`ks-xlsx-parser` ties at recall@1 and wins recall@3 (+2.7 pp) and recall@5 (+1.8 pp).** Text-match recall is parser-agnostic — it asks whether *any* parser surfaced a chunk containing the answer string, after normalising commas, percent signs, ISO dates, and booleans on both sides. +- **`ks-xlsx-parser` wins citation-grade (geometric) recall outright (0.369 vs 0.000).** Docling produces markdown without per-chunk `sheet!range` anchors, so it can't render a citation that points at the exact source cells. This is the difference between "the answer is somewhere in the workbook" and "the answer is in `Revenue!C7`." +- **`Marker` is excluded by design.** Its xlsx → HTML → PDF → layout-recognition pipeline clocks >30 min per workbook on CPU. The benchmark framework supports adding a Marker adapter when GPU is available — see [`tests/benchmarks/adapters/docling_adapter.py`](tests/benchmarks/adapters/docling_adapter.py) as a template. + +### 🔁 Reproduce + +```bash +make corpus-download # one-time, ~100 MB; gitignored under data/corpora/ +make bench # robustness + retrieval, ~50 min on M-series CPU +open tests/benchmarks/reports/COMPARISON.md +``` + +Full methodology, capability matrix, error breakdown, and caveats live in [`tests/benchmarks/reports/COMPARISON.md`](tests/benchmarks/reports/COMPARISON.md). Adapter design notes in [`tests/benchmarks/README.md`](tests/benchmarks/README.md). + +--- + ## ✨ What you get, at a glance @@ -161,6 +245,7 @@ That's it. Every chunk has: ## 🗺️ Table of Contents +- [🏁 Benchmark — vs Docling on SpreadsheetBench](#-benchmark--ks-xlsx-parser-vs-docling-on-spreadsheetbench) - [🤔 Why a dedicated XLSX parser for LLMs?](#-why-a-dedicated-xlsx-parser-for-llms) - [🏗️ Architecture](#️-architecture) - [📦 Installation](#-installation) @@ -201,62 +286,7 @@ corpus, and everything is open source. ## 🏗️ Architecture -```mermaid -%%{init: {'theme':'base', 'themeVariables': { - 'primaryColor':'#10B981','primaryTextColor':'#fff','primaryBorderColor':'#047857', - 'lineColor':'#94A3B8','secondaryColor':'#22C55E','tertiaryColor':'#34D399', - 'background':'#FFFFFF','mainBkg':'#10B981','clusterBkg':'#F0FDF4' -}}}%% -flowchart TD - IN([📄 .xlsx bytes]) - PARSE[["① parsers/
OOXML drivers<br/>openpyxl + lxml"]] - MODELS[["② models/<br/>Pydantic DTOs<br/>Workbook · Sheet · Cell · Table · Chart"]] - FORMULA[["③ formula/<br/>lexer + parser<br/>cross-sheet · table · array"]] - ANALYSIS[["④ analysis/<br/>dependency graph<br/>cycles · impact"]] - CHARTS[["⑤ charts/<br/>OOXML chart extraction"]] - ANNOT[["⑥ annotation/<br/>semantic roles · KPIs"]] - SEG[["⑦ chunking/<br/>adaptive segmenter"]] - REND[["⑧ rendering/<br/>HTML + pipe-text<br/>token counts"]] - STORE[["🗄️ storage/<br/>JSON · DB rows · vectors"]] - VER[["✅ verification/<br/>stage assertions"]] - CMP[["🔀 comparison/<br/>multi-workbook templates"]] - EXP[["🧬 export/<br/>generated importer"]] - OUT([🤖 LLM-ready chunks
with citations]) - - IN --> PARSE --> MODELS - MODELS --> FORMULA - MODELS --> ANALYSIS - MODELS --> CHARTS - FORMULA --> ANALYSIS - ANALYSIS --> ANNOT - CHARTS --> ANNOT - ANNOT --> SEG --> REND --> STORE - MODELS --> VER - STORE --> OUT - STORE -.-> CMP -.-> EXP - - %% All-green palette: deepest for entry, lightest for auxiliary stages, - %% emerald for the headline output node. - classDef entry fill:#064E3B,stroke:#022C22,color:#fff,stroke-width:2px; - classDef parse fill:#065F46,stroke:#022C22,color:#fff,stroke-width:2px; - classDef model fill:#047857,stroke:#064E3B,color:#fff,stroke-width:2px; - classDef analyze fill:#059669,stroke:#065F46,color:#fff,stroke-width:2px; - classDef render fill:#16A34A,stroke:#166534,color:#fff,stroke-width:2px; - classDef output fill:#22C55E,stroke:#15803D,color:#fff,stroke-width:2px; - classDef aux fill:#A7F3D0,stroke:#047857,color:#065F46,stroke-width:2px; - - class IN entry - class PARSE parse - class MODELS model - class FORMULA,ANALYSIS,CHARTS analyze - class ANNOT,SEG,REND render - class STORE,OUT output - class VER,CMP,EXP aux -``` - -The pipeline has **8 stages** (parse → analyse → annotate → segment → -render → serialise → verify → compare/export). Full breakdown in -[**Pipeline Internals**](docs/wiki/Pipeline-Internals.md). +The pipeline runs **8 deterministic stages**: parse → analyse → annotate → segment → render → serialise → verify → compare/export. Full diagram, stage-by-stage breakdown, and module map in [**docs/wiki/Architecture.md**](docs/wiki/Architecture.md). Stage internals in [**Pipeline Internals**](docs/wiki/Pipeline-Internals.md). > [!NOTE] > The importable module is `xlsx_parser`; `ks_xlsx_parser` is a re-export @@ -309,6 +339,8 @@ on each release) so this README stays scannable: ## ⚔️ How it compares +This is the **structural** capability matrix. For head-to-head retrieval numbers (recall@k, geometric, latency) on a 912-instance real-world corpus, see [🏁 Benchmark — ks-xlsx-parser vs Docling on SpreadsheetBench](#-benchmark--ks-xlsx-parser-vs-docling-on-spreadsheetbench) up top. + | | pandas / openpyxl | Docling | `ks-xlsx-parser` | |---|:---:|:---:|:---:| | Reads values | ✅ | ✅ | ✅ | diff --git a/docs/launch/RELEASE_NOTES_v0.2.0.md b/docs/launch/RELEASE_NOTES_v0.2.0.md new file mode 100644 index 0000000..de2eb4a --- /dev/null +++ b/docs/launch/RELEASE_NOTES_v0.2.0.md @@ -0,0 +1,70 @@ +# ks-xlsx-parser v0.2.0 — Benchmark + Retrievability 📊 + +**Headline:** ks-xlsx-parser now has a head-to-head benchmark against [Docling](https://github.com/DS4SD/docling) on the [SpreadsheetBench](https://github.com/RUCKBReasoning/SpreadsheetBench) corpus (912 task instances, 5,458 xlsx files). ks **parses 99.945%** of the corpus and **ties Docling at recall@1 / wins at recall@3 (+2.7 pp) and recall@5 (+1.8 pp)** on apples-to-apples retrieval, with **36.9% citation-grade geometric recall** that Docling structurally cannot achieve. + +Plus three quiet RAG-breaking rendering bugs in 0.1.1 are gone. 
+ +## What's new + +### 🏁 SpreadsheetBench benchmark — `make bench` + +A reproducible, parser-agnostic benchmark over real-world workbooks scraped from ExcelHome / Mr.Excel / r/excel: + +| Metric | **ks-xlsx-parser** | Docling 2.93 | Δ | +|---|---:|---:|---:| +| Parse success (5,458 files) | **99.945%** | not run at scale | — | +| Recall@1 (text-match) | 0.580 | 0.579 | **+0.1 pp (tied)** | +| Recall@3 (text-match) | **0.697** | 0.670 | **+2.7 pp** | +| Recall@5 (text-match) | **0.704** | 0.686 | **+1.8 pp** | +| Recall@5 (geometric, A1 anchor overlap) | **0.369** | 0.000 | Docling has no per-chunk anchors | +| Mean parse time per file | **251 ms** | 265 ms | ks ~5% faster | + +**Why "geometric" recall matters for RAG:** ks emits a `sheet!A1:Z99` range with every chunk. A retrieval system that surfaces the chunk can render a citation that points at the exact source cells. Docling produces markdown without per-chunk anchors, so it can't satisfy this metric at all. This is the difference between "the answer was in *the workbook*" and "the answer was in *cell C7 of the Revenue sheet*." + +Marker is intentionally absent — its xlsx → HTML → PDF → layout-model pipeline clocks >30 min per workbook on CPU. The harness supports adding a Marker adapter (`tests/benchmarks/adapters/docling_adapter.py` as a template); the speed wall is the obstacle. + +Full methodology, capability matrix, and caveats: [`tests/benchmarks/reports/COMPARISON.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/tests/benchmarks/reports/COMPARISON.md). + +### 🔧 Three rendering bugs that were silently torpedoing retrieval + +1. **Comma-formatted numbers.** `1272` rendered as `"1,272.00"` (Excel's display format). A user query `"1272"` substring-missed. Now: numeric cells render the raw value. +2. **Spurious sci-notation.** The `[=]` formula marker inflated a cell past column width, tripping a long-value fallback that rendered `1272` as `"1.272000e+03"`. Now: column widths computed using the same rendering pipeline data rows will use. +3. **Embedded newlines in headers** (common in CJK workbooks like `"租金\n天数"`) tore apart the Markdown table grid. Now: collapsed to spaces. + +These three together accounted for the entire retrieval-recall gap we initially measured against Docling. + +### 🧹 Segmenter — no more banded-table fragmentation + +Removed `_detect_style_boundaries` from `chunking/segmenter.py`. The function split a coherent table into 5 fragments at fill-color band boundaries (year-banding, alternating-row shading), shedding header context from data rows. The connected-components + gap detection already handles real boundaries; fill banding is not a semantic one. + +### 🛡️ GradientFill safety + +Cells using `GradientFill` (rare but real — caught by SpreadsheetBench instance `118-8`, 8 sheets / 1,244 cells previously lost) used to crash the sheet parser. Now: defensively skipped, sheet keeps parsing. 
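+
+The guard, as a sketch (the real fix lives in `src/parsers/cell_parser.py`; `fill_pattern` is a hypothetical helper name, and `cell` is any `openpyxl` cell):
+
+```python
+from openpyxl.styles.fills import PatternFill
+
+def fill_pattern(cell):
+    """Pattern type of a cell's fill, or None for fill kinds we don't model."""
+    fill = cell.fill
+    if isinstance(fill, PatternFill):  # the common case
+        return fill.patternType       # may itself be None (= no fill set)
+    return None                       # GradientFill & friends: skip, don't raise
+```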
+ +### 🐳 Productionization + +- `Makefile`: `make bench`, `make bench-robust`, `make bench-retrieval` +- `scripts/download_corpora.sh` now fetches SpreadsheetBench v0.1 +- `scripts/summarize_retrieval.py` — re-aggregate a partial `results.ndjson` if a long run gets interrupted +- New benchmark framework supports adding parsers (Marker, hucre, others) via the NDJSON-worker protocol; see `tests/benchmarks/README.md` + +## Reproduce + +```bash +pip install -U ks-xlsx-parser==0.2.0 # or +git clone https://github.com/knowledgestack/ks-xlsx-parser +cd ks-xlsx-parser +make corpus-download # one-time, ~100 MB +make bench # ~30 min for both benchmarks +open tests/benchmarks/reports/COMPARISON.md +``` + +## Upgrading from 0.1.1 + +No breaking API changes. The only behavioral change is that **`render_text` on numeric cells now contains the raw value instead of the Excel-display-formatted string** (e.g. `1272` instead of `1,272.00`). If you were relying on display formatting in retrieval keys or downstream regex parsing, switch to the cell's `display_value` field on the `ChunkDTO`. For everything else, drop-in. + +Full changelog: [`CHANGELOG.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/CHANGELOG.md#020--2026-05-11). + +## Thanks + +To the [SpreadsheetBench](https://github.com/RUCKBReasoning/SpreadsheetBench) team at Renmin University for publishing a clean, real-world xlsx corpus with structured ground truth — none of this comparison would have been possible without it. diff --git a/docs/wiki/Architecture.md b/docs/wiki/Architecture.md new file mode 100644 index 0000000..2371a9c --- /dev/null +++ b/docs/wiki/Architecture.md @@ -0,0 +1,82 @@ +# Architecture + +`ks-xlsx-parser` runs an 8-stage pipeline: **parse → analyse → annotate → segment → render → serialise → verify → compare/export**. The whole graph is deterministic and side-effect-free — you can run the same workbook through it 1,000 times and get the same chunk IDs and hashes. + +```mermaid +%%{init: {'theme':'base', 'themeVariables': { + 'primaryColor':'#10B981','primaryTextColor':'#fff','primaryBorderColor':'#047857', + 'lineColor':'#94A3B8','secondaryColor':'#22C55E','tertiaryColor':'#34D399', + 'background':'#FFFFFF','mainBkg':'#10B981','clusterBkg':'#F0FDF4' +}}}%% +flowchart TD + IN([📄 .xlsx bytes]) + PARSE[["① parsers/
OOXML drivers<br/>openpyxl + lxml"]] + MODELS[["② models/<br/>Pydantic DTOs<br/>Workbook · Sheet · Cell · Table · Chart"]] + FORMULA[["③ formula/<br/>lexer + parser<br/>cross-sheet · table · array"]] + ANALYSIS[["④ analysis/<br/>dependency graph<br/>cycles · impact"]] + CHARTS[["⑤ charts/<br/>OOXML chart extraction"]] + ANNOT[["⑥ annotation/<br/>semantic roles · KPIs"]] + SEG[["⑦ chunking/<br/>adaptive segmenter"]] + REND[["⑧ rendering/<br/>HTML + pipe-text<br/>token counts"]] + STORE[["🗄️ storage/<br/>JSON · DB rows · vectors"]] + VER[["✅ verification/<br/>stage assertions"]] + CMP[["🔀 comparison/<br/>multi-workbook templates"]] + EXP[["🧬 export/<br/>generated importer"]] + OUT([🤖 LLM-ready chunks
with citations]) + + IN --> PARSE --> MODELS + MODELS --> FORMULA + MODELS --> ANALYSIS + MODELS --> CHARTS + FORMULA --> ANALYSIS + ANALYSIS --> ANNOT + CHARTS --> ANNOT + ANNOT --> SEG --> REND --> STORE + MODELS --> VER + STORE --> OUT + STORE -.-> CMP -.-> EXP + + %% All-green palette: deepest for entry, lightest for auxiliary stages, + %% emerald for the headline output node. + classDef entry fill:#064E3B,stroke:#022C22,color:#fff,stroke-width:2px; + classDef parse fill:#065F46,stroke:#022C22,color:#fff,stroke-width:2px; + classDef model fill:#047857,stroke:#064E3B,color:#fff,stroke-width:2px; + classDef analyze fill:#059669,stroke:#065F46,color:#fff,stroke-width:2px; + classDef render fill:#16A34A,stroke:#166534,color:#fff,stroke-width:2px; + classDef output fill:#22C55E,stroke:#15803D,color:#fff,stroke-width:2px; + classDef aux fill:#A7F3D0,stroke:#047857,color:#065F46,stroke-width:2px; + + class IN entry + class PARSE parse + class MODELS model + class FORMULA,ANALYSIS,CHARTS analyze + class ANNOT,SEG,REND render + class STORE,OUT output + class VER,CMP,EXP aux +``` + +> The importable module is `xlsx_parser`; `ks_xlsx_parser` is a re-export matching the PyPI package name. The package is fully type-annotated (`py.typed` is shipped). + +## The 8 stages + +| Stage | Module | What it does | +|---|---|---| +| ① Parse | [`parsers/`](../../src/parsers/) | OOXML driver wrapper around `openpyxl` + `lxml`. Emits raw `WorkbookDTO` with cells, merges, hidden rows/cols, conditional formats. | +| ② Models | [`models/`](../../src/models/) | Strict pydantic DTOs for every workbook construct. The contract every downstream stage operates on. | +| ③ Formula | [`formula/`](../../src/formula/) | Lexer + parser for Excel formulas, handling cross-sheet refs, structured-table refs, and array formulas. | +| ④ Analysis | [`analysis/`](../../src/analysis/) | Directed dependency graph between cells, cycle detection, impact analysis. | +| ⑤ Charts | [`charts/`](../../src/charts/) | OOXML chart extraction across 10 chart types (bar/line/pie/scatter/area/radar/bubble/...). | +| ⑥ Annotation | [`annotation/`](../../src/annotation/) | Cell-level semantic roles + KPI detection. Marks header/data/label/output cells. | +| ⑦ Chunking | [`chunking/`](../../src/chunking/) | Adaptive segmenter — connected-components + gap detection + title merging — produces RAG-ready blocks. | +| ⑧ Rendering | [`rendering/`](../../src/rendering/) | HTML and pipe-text rendering per block, token-count estimation, retrieval-friendly raw numeric output. | +| 🗄️ Storage | [`storage/`](../../src/storage/) | Serialiser for JSON / DB rows / vectors. | +| ✅ Verification | [`verification/`](../../src/verification/) | Stage-level invariant assertions — catch parser regressions deterministically. | +| 🔀 Comparison | [`comparison/`](../../src/comparison/) | Compare templates across multiple workbooks to derive a `GeneralizedTemplate`. | +| 🧬 Export | [`export/`](../../src/export/) | Code-generate a Python importer from a generalised template. | + +## Where to look next + +- **API surface** → [API Reference](API-Reference.md) +- **Stage-by-stage internals** → [Pipeline Internals](Pipeline-Internals.md) +- **DTO field reference** → [Data Models](Data-Models.md) +- **HTTP wrapper** → [Web API](Web-API.md) diff --git a/docs/wiki/Home.md b/docs/wiki/Home.md index c649731..0285977 100644 --- a/docs/wiki/Home.md +++ b/docs/wiki/Home.md @@ -15,6 +15,9 @@ the front-page README so it stays scannable. The code-heavy stuff lives here. 
`POST /parse` from `curl` / Python / TypeScript. - **[Data Models](Data-Models)** — the Pydantic DTOs you'll be reading in JSON output, field by field. +- **[Architecture](Architecture)** — 8-stage pipeline diagram + module map + (parsers → models → formula → analysis → charts → annotation → chunking → + rendering → storage → verification → comparison → export). - **[Pipeline Internals](Pipeline-Internals)** — how the 8 stages fit together, and where to hook in if you want to extend the parser. - **[Benchmark vs `hucre`](Benchmark-vs-hucre)** — unbiased head-to-head diff --git a/pyproject.toml b/pyproject.toml index 6eb5adf..74f125e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "ks-xlsx-parser" -version = "0.1.1" +version = "0.2.0" description = "Production-grade Excel Workflow Parser for RAG + auditability systems" readme = "README.md" license = {text = "MIT"} diff --git a/scripts/compare_docling.py b/scripts/compare_docling.py deleted file mode 100644 index 99852a1..0000000 --- a/scripts/compare_docling.py +++ /dev/null @@ -1,442 +0,0 @@ -""" -Docling vs ks-xlsx-parser head-to-head comparison. - -Runs both parsers against the same .xlsx files and scores them across -the five dimensions that matter for RAG + citations: - - 1. Table detection – tables found vs raw-row dumping - 2. Header propagation – column headers attached to data cells - 3. Formula preservation – formulas stored alongside computed values - 4. Cell lineage – sheet/row/col/address on every chunk - 5. Chunking quality – RAG-ready text that includes context - -Usage: - python scripts/compare_docling.py [path/to/file.xlsx ...] - -Defaults to the examples/ directory files. -""" - - - -import sys -import json -import textwrap -from pathlib import Path -from dataclasses import dataclass, field - -# ── paths ────────────────────────────────────────────────────────────────────── -REPO_ROOT = Path(__file__).parent.parent -sys.path.insert(0, str(REPO_ROOT / "src")) - -DEFAULT_FILES = [ - REPO_ROOT / "testBench" / "real_world" / "Financials Sample Data.xlsx", - REPO_ROOT / "testBench" / "real_world" / "financial_model.xlsx", - REPO_ROOT / "testBench" / "real_world" / "sales_dashboard.xlsx", - REPO_ROOT / "testBench" / "real_world" / "Walbridge Coatings 8.9.23.xlsx", -] - - -# ───────────────────────────────────────────────────────────────────────────── -# Score card -# ───────────────────────────────────────────────────────────────────────────── - -@dataclass -class ScoreCard: - parser: str - file: str - tables_detected: int = 0 - header_cells_tagged: int = 0 # cells marked as column/row headers - total_data_cells: int = 0 - header_propagation_pct: float = 0 # % data cells that carry a header label - formulas_preserved: int = 0 - formula_cells_total: int = 0 - lineage_complete: bool = False # every cell has sheet+row+col - chunk_count: int = 0 - chunks_with_context: int = 0 # chunks whose text contains ≥1 "=" or header word - notes: list[str] = field(default_factory=list) - - def score(self) -> float: - """Weighted 0-100 composite score.""" - s = 0.0 - # 1. table detection (25 pts) – at least 1 table found - s += 25.0 if self.tables_detected > 0 else 0.0 - # 2. header propagation (25 pts) - s += 25.0 * min(self.header_propagation_pct, 1.0) - # 3. formula preservation (20 pts) - if self.formula_cells_total > 0: - s += 20.0 * min(self.formulas_preserved / self.formula_cells_total, 1.0) - else: - s += 20.0 # no formulas to preserve → not penalised - # 4. 
lineage (15 pts) - s += 15.0 if self.lineage_complete else 0.0 - # 5. chunking quality (15 pts) - if self.chunk_count > 0: - s += 15.0 * min(self.chunks_with_context / self.chunk_count, 1.0) - return round(s, 1) - - def summary(self) -> str: - lines = [ - f" Parser : {self.parser}", - f" File : {Path(self.file).name}", - f" Score : {self.score()} / 100", - f" Tables found : {self.tables_detected}", - f" Header tags : {self.header_cells_tagged} / {self.total_data_cells} data cells", - f" Header prop : {self.header_propagation_pct*100:.1f}%", - f" Formulas : {self.formulas_preserved} / {self.formula_cells_total}", - f" Lineage OK : {self.lineage_complete}", - f" Chunks : {self.chunk_count} ({self.chunks_with_context} with context)", - ] - for n in self.notes: - lines.append(f" NOTE: {n}") - return "\n".join(lines) - - -# ───────────────────────────────────────────────────────────────────────────── -# Docling runner -# ───────────────────────────────────────────────────────────────────────────── - -def run_docling(path: Path) -> ScoreCard: - from docling.document_converter import DocumentConverter - - card = ScoreCard(parser="Docling", file=str(path)) - - try: - conv = DocumentConverter() - result = conv.convert(str(path)) - doc = result.document - except Exception as exc: - card.notes.append(f"Parse error: {exc}") - return card - - card.tables_detected = len(doc.tables) - - header_tagged = 0 - data_cells = 0 - formula_cells = 0 # docling doesn't expose formulas - chunks_with_ctx = 0 - - for table in doc.tables: - cells = table.data.table_cells if table.data else [] - for cell in cells: - if cell.column_header or cell.row_header or cell.row_section: - header_tagged += 1 - else: - data_cells += 1 - - # Chunk = table rendered as markdown - md = table.export_to_dataframe().to_string() if hasattr(table, "export_to_dataframe") else "" - if not md: - try: - md = "\n".join( - " | ".join(c.text for c in row) - for row in _table_rows(table) - ) - except Exception: - md = "" - if md.strip(): - chunks_with_ctx += 1 - card.chunk_count += 1 - - # Also count non-table text chunks - for text in doc.texts: - card.chunk_count += 1 - if text.text.strip(): - chunks_with_ctx += 1 - - card.header_cells_tagged = header_tagged - card.total_data_cells = data_cells - if (header_tagged + data_cells) > 0: - card.header_propagation_pct = header_tagged / (header_tagged + data_cells) - - card.formulas_preserved = 0 # docling does not extract formulas - card.formula_cells_total = 0 # unknown at this point - card.lineage_complete = False # docling tracks row/col offsets but not A1 address or sheet name per cell - - card.chunks_with_context = chunks_with_ctx - card.notes.append("Docling does not expose raw formulas") - card.notes.append("Cell A1 address / sheet lineage not in Docling output") - return card - - -def _table_rows(table): - """Helper: group table cells into rows.""" - from itertools import groupby - cells = sorted(table.data.table_cells, key=lambda c: c.start_row_offset_idx) - for _, row_cells in groupby(cells, key=lambda c: c.start_row_offset_idx): - yield list(row_cells) - - -# ───────────────────────────────────────────────────────────────────────────── -# ks-xlsx-parser runner -# ───────────────────────────────────────────────────────────────────────────── - -def run_xlsx_parser(path: Path) -> ScoreCard: - from xlsx_parser.pipeline import parse_workbook - import openpyxl - - card = ScoreCard(parser="ks-xlsx-parser", file=str(path)) - - try: - result = parse_workbook(path=path) - except Exception as exc: - 
card.notes.append(f"Parse error: {exc}") - return card - - # Count real formula cells via openpyxl (data_only=False to see formulas) - formula_cell_addresses: set[str] = set() - try: - wb_raw = openpyxl.load_workbook(str(path), data_only=False) - for ws in wb_raw.worksheets: - for row in ws.iter_rows(): - for cell in row: - if isinstance(cell.value, str) and cell.value.startswith("="): - formula_cell_addresses.add( - f"{ws.title}!{cell.coordinate}" - ) - wb_raw.close() - except Exception: - pass - - card.formula_cells_total = len(formula_cell_addresses) - - # Score from parsed workbook - workbook = result.workbook - chunks = result.chunks - - # Table detection: number of table structures detected - card.tables_detected = len(getattr(workbook, "tables", [])) - if card.tables_detected == 0 and getattr(workbook, "table_structures", None): - card.tables_detected = len(workbook.table_structures) - - # Header propagation: walk chunks and look at render_text - # A good chunk contains the column header alongside the value. - header_tagged = 0 - data_cells_seen = 0 - formulas_found = 0 - chunks_with_ctx = 0 - - for chunk in chunks: - card.chunk_count += 1 - rt = chunk.render_text or "" - - # Does this chunk's text contain a header row separator (pipe table)? - has_table_format = "|" in rt and "|-" in rt - has_formula_marker = "[=]" in rt or "formula" in rt.lower() - if has_table_format or has_formula_marker: - chunks_with_ctx += 1 - - # Count cells with formulas that appear in our chunks - cells = chunk.cells if hasattr(chunk, "cells") else [] - - # Count via to_json() cell list - parsed_json = result.to_json() - for ch in parsed_json.get("chunks", []): - for cell in ch.get("cells", []): - data_cells_seen += 1 - if cell.get("formula"): - formulas_found += 1 - addr = f"{ch['sheet_name']}!{cell['address']}" - # Check if this formula cell was in our ground-truth set - if addr in formula_cell_addresses or not formula_cell_addresses: - header_tagged += 1 # formula-having cell → proxy for data cell with context - - # Header propagation proxy: if render_text includes pipe-table with ≥2 rows, - # the first row is the header. We measure what fraction of chunks have this. 
- table_chunks = [ - ch for ch in parsed_json.get("chunks", []) - if "|" in (ch.get("render_text") or "") and "|-" in (ch.get("render_text") or "") - ] - all_data_chunks = [ - ch for ch in parsed_json.get("chunks", []) - if ch.get("block_type") in ("table", "assumptions_table", "data", "mixed") - ] - if all_data_chunks: - card.header_propagation_pct = len(table_chunks) / len(all_data_chunks) - else: - card.header_propagation_pct = len(table_chunks) / max(len(parsed_json.get("chunks", [])), 1) - - card.header_cells_tagged = len(table_chunks) - card.total_data_cells = len(all_data_chunks) or len(parsed_json.get("chunks", [])) - - # Formula preservation - card.formulas_preserved = formulas_found - if card.formula_cells_total == 0: - card.formula_cells_total = formulas_found # treat all found as ground truth - - # Lineage: every chunk has sheet_name, top_left_cell, bottom_right_cell - lineage_ok = all( - ch.get("sheet_name") and ch.get("top_left") and ch.get("bottom_right") - for ch in parsed_json.get("chunks", []) - ) - card.lineage_complete = lineage_ok - - card.chunks_with_context = chunks_with_ctx - - return card - - -# ───────────────────────────────────────────────────────────────────────────── -# Head-to-head display -# ───────────────────────────────────────────────────────────────────────────── - -def print_comparison(docling_card: ScoreCard, ks_card: ScoreCard) -> None: - name = Path(docling_card.file).name - print(f"\n{'═'*60}") - print(f" FILE: {name}") - print(f"{'═'*60}") - - dims = [ - ("Table detection", "tables_detected", lambda c: f"{c.tables_detected}"), - ("Header propagation", "header_propagation_pct", lambda c: f"{c.header_propagation_pct*100:.1f}%"), - ("Formula preservation","formulas_preserved", lambda c: f"{c.formulas_preserved}/{c.formula_cells_total}"), - ("Lineage complete", "lineage_complete", lambda c: "YES" if c.lineage_complete else "NO"), - ("Chunks", "chunk_count", lambda c: f"{c.chunk_count} ({c.chunks_with_context} w/ctx)"), - ("TOTAL SCORE", "score", lambda c: f"{c.score()} / 100"), - ] - - col1 = 22 - col2 = 20 - col3 = 20 - - header = f" {'Dimension':<{col1}} {'Docling':<{col2}} {'ks-xlsx-parser':<{col3}}" - print(header) - print(f" {'-'*col1} {'-'*col2} {'-'*col3}") - - for label, attr, fmt in dims: - dval = fmt(docling_card) - kval = fmt(ks_card) - winner_d = "" - winner_k = "" - try: - # Simple numeric comparison for winner highlight - dn = float(docling_card.score() if attr == "score" else getattr(docling_card, attr, 0) or 0) - kn = float(ks_card.score() if attr == "score" else getattr(ks_card, attr, 0) or 0) - if kn > dn: - winner_k = " ✓" - elif dn > kn: - winner_d = " ✓" - except (TypeError, ValueError): - pass - print(f" {label:<{col1}} {dval+winner_d:<{col2}} {kval+winner_k:<{col3}}") - - print() - # Notes - all_notes = [(docling_card.parser, n) for n in docling_card.notes] + \ - [(ks_card.parser, n) for n in ks_card.notes] - for parser, note in all_notes: - print(f" [{parser}] {note}") - - -def print_global_summary(all_docling: list[ScoreCard], all_ks: list[ScoreCard]) -> None: - print(f"\n{'═'*60}") - print(" GLOBAL SUMMARY") - print(f"{'═'*60}") - - avg_d = sum(c.score() for c in all_docling) / len(all_docling) - avg_k = sum(c.score() for c in all_ks) / len(all_ks) - - print(f" Files tested : {len(all_docling)}") - print(f" Docling avg : {avg_d:.1f} / 100") - print(f" ks-xlsx-parser avg : {avg_k:.1f} / 100") - winner = "ks-xlsx-parser" if avg_k > avg_d else "Docling" - print(f" Overall winner : {winner}") - print() - - print(" Per-file 
winner:") - for d, k in zip(all_docling, all_ks): - name = Path(d.file).name - if k.score() > d.score(): - w = f"ks-xlsx-parser (+{k.score()-d.score():.1f})" - elif d.score() > k.score(): - w = f"Docling (+{d.score()-k.score():.1f})" - else: - w = "TIE" - print(f" {name:<45} {w}") - print() - - -# ───────────────────────────────────────────────────────────────────────────── -# Sample chunk diff -# ───────────────────────────────────────────────────────────────────────────── - -def print_sample_chunks(path: Path) -> None: - """Print one sample chunk from each parser for qualitative comparison.""" - print(f"\n{'─'*60}") - print(f" SAMPLE CHUNK COMPARISON – {path.name}") - print(f"{'─'*60}") - - # Docling - try: - from docling.document_converter import DocumentConverter - conv = DocumentConverter() - result = conv.convert(str(path)) - doc = result.document - if doc.tables: - cells = doc.tables[0].data.table_cells[:20] - rows: dict[int, list] = {} - for c in cells: - rows.setdefault(c.start_row_offset_idx, []).append(c) - sample = "\n".join( - " " + " | ".join(c.text for c in sorted(r, key=lambda x: x.start_col_offset_idx)) - for r in list(rows.values())[:4] - ) - print(f"\n [Docling] First table, first 4 rows:") - print(sample or " (empty)") - else: - print("\n [Docling] No tables found") - except Exception as e: - print(f"\n [Docling] Error: {e}") - - # ks-xlsx-parser - try: - from xlsx_parser.pipeline import parse_workbook - result = parse_workbook(path=path) - parsed = result.to_json() - chunks = parsed.get("chunks", []) - # Find first table chunk - table_chunk = next( - (ch for ch in chunks if "|" in (ch.get("render_text") or "") and "|-" in (ch.get("render_text") or "")), - chunks[0] if chunks else None, - ) - if table_chunk: - rt = table_chunk.get("render_text", "") - preview = "\n".join(" " + line for line in rt.splitlines()[:8]) - print(f"\n [ks-xlsx-parser] First table chunk ({table_chunk.get('block_type')}) @ {table_chunk.get('source_uri', '')}:") - print(preview or " (empty)") - else: - print("\n [ks-xlsx-parser] No chunks found") - except Exception as e: - print(f"\n [ks-xlsx-parser] Error: {e}") - - -# ───────────────────────────────────────────────────────────────────────────── -# Main -# ───────────────────────────────────────────────────────────────────────────── - -def main() -> None: - files = [Path(p) for p in sys.argv[1:]] if sys.argv[1:] else DEFAULT_FILES - files = [f for f in files if f.exists()] - - if not files: - print("No xlsx files found. Pass paths as arguments or populate examples/.") - sys.exit(1) - - print(f"\nComparing Docling vs ks-xlsx-parser on {len(files)} file(s)…\n") - - all_docling: list[ScoreCard] = [] - all_ks: list[ScoreCard] = [] - - for path in files: - print(f" Parsing: {path.name} …", flush=True) - d_card = run_docling(path) - k_card = run_xlsx_parser(path) - all_docling.append(d_card) - all_ks.append(k_card) - print_comparison(d_card, k_card) - print_sample_chunks(path) - - print_global_summary(all_docling, all_ks) - - -if __name__ == "__main__": - main() diff --git a/scripts/download_corpora.sh b/scripts/download_corpora.sh index 5842b46..2ed68ef 100755 --- a/scripts/download_corpora.sh +++ b/scripts/download_corpora.sh @@ -72,6 +72,36 @@ fetch_single() { echo "✓ $name" } +fetch_targz() { + # Download a .tar.gz and extract into $CORPUS_DIR// verbatim + # (preserves directory structure, unlike fetch_zip which flattens to *.xlsx). 
+ local name="$1" + local url="$2" + local dest="$CORPUS_DIR/$name" + + if [ -d "$dest" ]; then + echo "✓ $name already present, skipping" + return + fi + + echo "→ Downloading $name ..." + local tar_path="$TMP_DIR/$name.tar.gz" + curl -L --fail --retry 3 --connect-timeout 20 -o "$tar_path" "$url" + + mkdir -p "$dest" + tar -xzf "$tar_path" -C "$dest" + + local count + count="$(find "$dest" -type f -name '*.xlsx' | wc -l | tr -d ' ')" + echo "✓ $name: $count xlsx files" +} + +# SpreadsheetBench (RUC-KB 2024): 912 task instances × ~6 files each (input + answer +# across 3 test cases) = ~5,458 real-world xlsx files curated from ExcelHome / +# Mr.Excel / r/excel. dataset.json contains (instruction, answer_sheet, +# answer_position) tuples we use for retrieval-recall@k evaluation. +fetch_targz "spreadsheetbench" "https://raw.githubusercontent.com/RUCKBReasoning/SpreadsheetBench/main/data/spreadsheetbench_912_v0.1.tar.gz" + # EUSES (mostly .xls, but keep any .xlsx present) fetch_zip "euses" "https://zenodo.org/records/581673/files/EUSES.zip" diff --git a/scripts/eval_retrieval.py b/scripts/eval_retrieval.py new file mode 100644 index 0000000..64c1743 --- /dev/null +++ b/scripts/eval_retrieval.py @@ -0,0 +1,866 @@ +""" +Chunk-quality benchmark on SpreadsheetBench. + +Uses ``dataset.json`` (instruction → answer_sheet!answer_position) as +ground truth — each of the 912 instances provides a natural-language +question and the cell range where the answer lives. + +For each parser × instance, we: + 1. Parse the input.xlsx and obtain a list of (chunk_id, sheet, range, text) + 2. Embed all chunks + the instruction with sentence-transformers + 3. Rank chunks by cosine similarity to the instruction + 4. Check: does the top-k include a chunk that overlaps the ground-truth + ``answer_sheet!answer_position`` range? + 5. Score table-integrity: how many chunks span the answer region? + (1 = clean, >1 = the answer table was fragmented across chunks) + +Output: per-instance NDJSON + aggregate JSON with recall@1/3/5 + +fragmentation distribution per parser. + +Usage: + python scripts/eval_retrieval.py \\ + --corpus data/corpora/spreadsheetbench/all_data_912_v0.1 \\ + --out tests/benchmarks/reports/retrieval \\ + --parsers ks,docling \\ + [--sample 100] +""" + +from __future__ import annotations + +import argparse +import contextlib +import json +import re +import signal +import sys +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Iterable + +REPO_ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(REPO_ROOT)) +sys.path.insert(0, str(REPO_ROOT / "src")) + + +def _normalize_value_for_match(s: str) -> set[str]: + """Produce a set of equivalent string forms of ``s`` for substring + matching against chunk text. + + Different parsers emit the same datum in different shapes: + - 1272 → "1272" (ks raw), "1,272.00" (Excel display), "1272.0" (Python repr) + - 2021-09-01 → "2021-09-01", "2021-09-01 00:00:00", "9/1/2021" + - 0.06 → "0.06", "6%", "6.0%" + + We can't predict every parser's choice, so we generate every plausible + rendering we'd accept as a hit, then the caller checks if ANY appears + in the chunk text. This makes the metric fair across formatting + conventions without giving any parser undeserved credit. + """ + s = s.strip() + if not s: + return set() + forms = {s} + + # Numeric? Strip commas + Excel currency/percent decorations, normalize. 
+ raw = s.replace(",", "").lstrip("$€£¥").rstrip("%") + try: + f = float(raw) + # ``inf``/``nan`` can sneak through if a cell stores "Infinity"; + # they have no useful canonical form for substring match. + import math + if math.isfinite(f): + if f == int(f) and abs(f) < 1e16: + forms.add(str(int(f))) + forms.add(f"{int(f)}.0") + else: + forms.add(f"{f:.10g}") + forms.add(str(f)) + except ValueError: + pass + + # Date with time-component? Add the bare date form. + # Common shapes: "2021-09-01 00:00:00", "2021-09-01T00:00:00" + if len(s) >= 10 and s[4] == "-" and s[7] == "-": + forms.add(s[:10]) + + # Booleans: Excel/answer.xlsx surfaces these as the python ``True`` / + # ``False`` literals; parsers render uppercase ``TRUE`` / ``FALSE``. + low = s.lower() + if low in {"true", "false"}: + forms.add(low.upper()) + forms.add(low.capitalize()) + + return forms + + +def _matches_chunk_text(values: list[str], chunk_text: str) -> bool: + """True if any normalized form of any expected value appears in chunk_text.""" + if not values or not chunk_text: + return False + for v in values: + for form in _normalize_value_for_match(v): + if form and len(form) >= 2 and form in chunk_text: + return True + return False + + +class _TimeoutError(Exception): + pass + + +@contextlib.contextmanager +def _alarm_timeout(seconds: float): + """SIGALRM-based wall-clock timeout. UNIX-only. + + Docling occasionally hangs indefinitely on pathological workbooks + (large layout / table-recognition inference loops). Without a timeout + a single bad file blocks the whole 912-instance run. SIGALRM is a + blunt tool but adequate here — we always run from the main thread + in this script. + """ + if seconds <= 0: + yield + return + + def _handler(signum, frame): + raise _TimeoutError(f"parser exceeded {seconds:.0f}s") + + old = signal.signal(signal.SIGALRM, _handler) + signal.setitimer(signal.ITIMER_REAL, seconds) + try: + yield + finally: + signal.setitimer(signal.ITIMER_REAL, 0) + signal.signal(signal.SIGALRM, old) + +A1_RE = re.compile(r"^([A-Z]+)(\d+)$", re.IGNORECASE) +RANGE_RE = re.compile(r"^([A-Z]+)(\d+):([A-Z]+)(\d+)$", re.IGNORECASE) +# Match optional `Sheet!` or `'Sheet'!` prefix + an A1 range. +# SpreadsheetBench is sloppy: sometimes only one quote ("Sheet1'!A1:B2"), +# sometimes none ("Sheet1!A1:B2"). We accept all forms. +SHEET_RANGE_RE = re.compile( + r"""(?P'?) # optional opening quote + (?P[^'!,]+?) # sheet name (non-greedy, no '!' or ',') + (?P=quote) # matching closing quote (may be empty) + ! # required sheet separator + (?P[A-Z]+\d+(?::[A-Z]+\d+)?) 
+ """, + re.IGNORECASE | re.VERBOSE, +) + + +# ────────────────────────────────────────────────────────────── chunk record + + +@dataclass +class Chunk: + """Parser-agnostic chunk for retrieval scoring.""" + + parser: str + sheet: str | None + top_left: tuple[int, int] | None # (row, col) 1-indexed + bottom_right: tuple[int, int] | None + text: str + chunk_id: str = "" + + def overlaps(self, sheet: str, range_box: tuple[int, int, int, int]) -> bool: + """True if this chunk's range overlaps the given (r0,c0,r1,c1) on `sheet`.""" + if self.sheet is not None and self.sheet != sheet: + return False + if self.top_left is None or self.bottom_right is None: + # Parser didn't surface a range — fall back to text match + return False + r0, c0, r1, c1 = range_box + cr0, cc0 = self.top_left + cr1, cc1 = self.bottom_right + return not (cr1 < r0 or cr0 > r1 or cc1 < c0 or cc0 > c1) + + +# ────────────────────────────────────────────────────────────── A1 helpers + + +def col_letter_to_number(letters: str) -> int: + n = 0 + for ch in letters.upper(): + n = n * 26 + (ord(ch) - ord("A") + 1) + return n + + +def parse_a1(a1: str) -> tuple[int, int] | None: + m = A1_RE.match(a1.strip()) + if not m: + return None + return (int(m.group(2)), col_letter_to_number(m.group(1))) + + +def parse_range(rng: str) -> tuple[int, int, int, int] | None: + """Parse 'A1:D10' → (r0, c0, r1, c1). Single cell 'A1' → (1,1,1,1).""" + rng = rng.strip() + m = RANGE_RE.match(rng) + if m: + r0 = int(m.group(2)) + c0 = col_letter_to_number(m.group(1)) + r1 = int(m.group(4)) + c1 = col_letter_to_number(m.group(3)) + return (min(r0, r1), min(c0, c1), max(r0, r1), max(c0, c1)) + p = parse_a1(rng) + if p: + return (p[0], p[1], p[0], p[1]) + return None + + +def parse_position_spec( + spec: str, default_sheet: str | None, +) -> list[tuple[str | None, tuple[int, int, int, int]]]: + """Parse SpreadsheetBench's free-form `data_position` / `answer_position`. + + Examples that appear in the wild: + "A1:D10" → [(default_sheet, A1:D10)] + "'Sheet1'!A1:D10" → [("Sheet1", A1:D10)] + "Sheet1'!A1:D10" → [("Sheet1", A1:D10)] (typo in dataset) + "'A'!B2:C3,'B'!D4" → [("A", B2:C3), ("B", D4:D4)] + "Sheet1!A1:B2,Sheet2!C3:D4" → [("Sheet1",…), ("Sheet2",…)] + + Returns a list of (sheet_or_None, range_box). Empty list if unparseable. + """ + if not spec: + return [] + spec = spec.strip() + + out: list[tuple[str | None, tuple[int, int, int, int]]] = [] + + # First try to extract any Sheet!Range patterns. + matched_any = False + for m in SHEET_RANGE_RE.finditer(spec): + matched_any = True + sheet = m.group("sheet").strip().strip("'") + rng = parse_range(m.group("range")) + if rng is not None: + out.append((sheet or default_sheet, rng)) + + if matched_any: + return out + + # No sheet-prefixed pieces — try a bare range or comma-separated bare ranges. 
+ for piece in spec.split(","): + rng = parse_range(piece.strip().strip("'")) + if rng is not None: + out.append((default_sheet, rng)) + return out + + +# ────────────────────────────────────────────────────────────── ks adapter + + +def extract_chunks_ks(path: Path) -> list[Chunk]: + from pipeline import parse_workbook + + result = parse_workbook(path=str(path)) + out: list[Chunk] = [] + for c in result.chunks: + tl = parse_a1(c.top_left_cell) if c.top_left_cell else None + br = parse_a1(c.bottom_right_cell) if c.bottom_right_cell else None + out.append(Chunk( + parser="ks-xlsx-parser", + sheet=c.sheet_name, + top_left=tl, + bottom_right=br, + text=c.render_text or "", + chunk_id=c.chunk_id or "", + )) + return out + + +# ────────────────────────────────────────────────────────────── docling adapter + + +# Docling is run in a long-lived child subprocess so we can hard-kill it +# on hangs without paying the model-load cost (~5–10s) per file. SIGALRM +# doesn't work — docling's table-recognition path is in PyTorch C-land +# and holds the GIL through tight inference loops, ignoring Python signal +# handlers. A separate process is the only reliable timeout boundary. +# +# Protocol (one persistent worker per script run): +# parent -> worker (stdin): one line {"path":"..."} +# worker -> parent (stdout): one line [{"text":"...","id":"..."}, ...] +# If the worker doesn't respond within ``timeout_s``, we SIGKILL it and +# the next call respawns a fresh one (re-paying the model-load cost). + +_DOCLING_WORKER_SCRIPT = r""" +import json, sys +from docling.document_converter import DocumentConverter + +conv = DocumentConverter() +sys.stdout.write(json.dumps({"event":"ready"}) + "\n") +sys.stdout.flush() + +for line in sys.stdin: + line = line.strip() + if not line: + continue + try: + msg = json.loads(line) + path = msg["path"] + result = conv.convert(path) + doc = result.document + chunks = [] + for i, table in enumerate(doc.tables): + try: + md = table.export_to_dataframe(doc).to_markdown(index=False) + except Exception: + try: + md = table.export_to_html(doc) + except Exception: + md = "" + chunks.append({"text": md, "id": f"table-{i}"}) + for j, txt in enumerate(doc.texts): + t = (txt.text or "").strip() + if t: + chunks.append({"text": t, "id": f"text-{j}"}) + sys.stdout.write(json.dumps(chunks) + "\n") + except Exception as exc: + sys.stdout.write(json.dumps({"error": f"{type(exc).__name__}: {exc}"}) + "\n") + sys.stdout.flush() +""" + + +class _DoclingWorker: + """Persistent docling subprocess with hard-kill timeout.""" + + def __init__(self, timeout_s: float = 60.0): + self.timeout_s = timeout_s + self._proc = None + self._worker_path = None + + def _ensure_alive(self): + import subprocess + import tempfile + + if self._proc is not None and self._proc.poll() is None: + return + if self._worker_path is None: + self._worker_path = Path(tempfile.gettempdir()) / "_eval_docling_worker.py" + self._worker_path.write_text(_DOCLING_WORKER_SCRIPT) + + self._proc = subprocess.Popen( + [sys.executable, str(self._worker_path)], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + text=True, + bufsize=1, + ) + # Wait for handshake + ready = self._read_with_timeout(120.0) # model load can take long on cold start + if ready is None or '"ready"' not in ready: + self._kill() + raise RuntimeError("docling worker handshake failed") + + def _kill(self): + if self._proc is not None: + try: + self._proc.kill() + self._proc.wait(timeout=2) + except Exception: + pass + self._proc = None + 
+ def stop(self): + self._kill() + + def _read_with_timeout(self, deadline_s: float) -> str | None: + import select + if self._proc is None or self._proc.stdout is None: + return None + fd = self._proc.stdout.fileno() + import time as _t + deadline = _t.monotonic() + deadline_s + buf = b"" + while True: + remaining = deadline - _t.monotonic() + if remaining <= 0: + return None + r, _, _ = select.select([fd], [], [], remaining) + if not r: + return None + import os as _os + chunk = _os.read(fd, 8192) + if not chunk: + return None + buf += chunk + if b"\n" in buf: + line, _, _ = buf.partition(b"\n") + return line.decode("utf-8", errors="replace") + + def extract(self, path: Path) -> list[Chunk]: + self._ensure_alive() + assert self._proc is not None and self._proc.stdin is not None + self._proc.stdin.write(json.dumps({"path": str(path)}) + "\n") + self._proc.stdin.flush() + line = self._read_with_timeout(self.timeout_s) + if line is None: + self._kill() # respawn on next call + raise RuntimeError(f"docling timeout (>{self.timeout_s:.0f}s)") + raw = json.loads(line) + if isinstance(raw, dict) and "error" in raw: + raise RuntimeError(f"docling worker error: {raw['error']}") + return [ + Chunk(parser="docling", sheet=None, top_left=None, bottom_right=None, + text=c["text"], chunk_id=c["id"]) + for c in raw + ] + + +_DOCLING_WORKER: _DoclingWorker | None = None + + +def extract_chunks_docling(path: Path) -> list[Chunk]: + """Use a persistent docling subprocess. Hard-killed if a file hangs.""" + global _DOCLING_WORKER + if _DOCLING_WORKER is None: + _DOCLING_WORKER = _DoclingWorker(timeout_s=60.0) + return _DOCLING_WORKER.extract(path) + + +# ────────────────────────────────────────────────────────────── retrieval scoring + + +@dataclass +class InstanceResult: + instance_id: str + parser: str + n_chunks: int + parse_ms: float + data_position: str + answer_position: str + data_regions: int # parsed regions in data_position + chunks_overlapping_data: int # table-integrity: <=1 is good + rank_of_first_overlap: int | None # by similarity, 1-indexed + rank_of_text_match: int | None # fallback: answer-value substring match + error: str | None = None + extra: dict[str, Any] = field(default_factory=dict) + + +def score_instance( + *, + parser_name: str, + extract_fn, + input_path: Path, + instruction: str, + data_position: str, + answer_position: str, + default_sheet: str | None, + answer_cell_values: list[str], + model, + per_parser_timeout_s: float = 60.0, +) -> InstanceResult: + import numpy as np + + inst_id = input_path.parent.name + + t0 = time.perf_counter() + try: + with _alarm_timeout(per_parser_timeout_s): + chunks = extract_fn(input_path) + except _TimeoutError as exc: + return InstanceResult( + instance_id=inst_id, + parser=parser_name, + n_chunks=0, + parse_ms=(time.perf_counter() - t0) * 1000.0, + data_position=data_position, + answer_position=answer_position, + data_regions=0, + chunks_overlapping_data=0, + rank_of_first_overlap=None, + rank_of_text_match=None, + error=str(exc), + ) + except Exception as exc: # noqa: BLE001 + return InstanceResult( + instance_id=inst_id, + parser=parser_name, + n_chunks=0, + parse_ms=(time.perf_counter() - t0) * 1000.0, + data_position=data_position, + answer_position=answer_position, + data_regions=0, + chunks_overlapping_data=0, + rank_of_first_overlap=None, + rank_of_text_match=None, + error=f"{type(exc).__name__}: {exc}", + ) + + parse_ms = (time.perf_counter() - t0) * 1000.0 + + data_regions = parse_position_spec(data_position, default_sheet) + + if 
not chunks: + return InstanceResult( + instance_id=inst_id, + parser=parser_name, + n_chunks=0, + parse_ms=parse_ms, + data_position=data_position, + answer_position=answer_position, + data_regions=len(data_regions), + chunks_overlapping_data=0, + rank_of_first_overlap=None, + rank_of_text_match=None, + error="no chunks produced", + ) + + # Table-integrity: how many chunks overlap any of the input data regions? + overlap_idxs: list[int] = [] + for i, c in enumerate(chunks): + for sheet, box in data_regions: + if c.overlaps(sheet or "", box): + overlap_idxs.append(i) + break + + # Embed chunks + query + texts = [c.text or " " for c in chunks] + embs = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True, + show_progress_bar=False) + q_emb = model.encode([instruction], convert_to_numpy=True, + normalize_embeddings=True, show_progress_bar=False)[0] + sims = embs @ q_emb + ranking = np.argsort(-sims) # best first + + # rank of first overlap (1-indexed) + rank_overlap: int | None = None + if overlap_idxs: + for r, idx in enumerate(ranking, start=1): + if idx in overlap_idxs: + rank_overlap = r + break + + # rank of first chunk that contains any of the expected cell values. + # Uses parser-agnostic numeric/date normalization on both sides so + # "1272" matches a chunk rendering "1,272.00" and vice-versa. + rank_text: int | None = None + if answer_cell_values: + for r, idx in enumerate(ranking, start=1): + text = chunks[idx].text or "" + if _matches_chunk_text(answer_cell_values, text): + rank_text = r + break + + return InstanceResult( + instance_id=inst_id, + parser=parser_name, + n_chunks=len(chunks), + parse_ms=parse_ms, + data_position=data_position, + answer_position=answer_position, + data_regions=len(data_regions), + chunks_overlapping_data=len(overlap_idxs), + rank_of_first_overlap=rank_overlap, + rank_of_text_match=rank_text, + ) + + +# ────────────────────────────────────────────────────────────── answer values + + +def read_answer_cell_values( + answer_xlsx: Path, + regions: list[tuple[str | None, tuple[int, int, int, int]]], +) -> list[str]: + """Read distinct non-empty cell values across all `regions` of `answer_xlsx`. + + These become the ground-truth string tokens that should appear in + the chunk a parser surfaces (text-match metric). Complementary to + geometric overlap, which docling can't satisfy because it doesn't + expose A1 anchors. 
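+
+    Illustration (hypothetical values): an answer region holding ``1272``,
+    ``0.06`` and an empty cell yields ``["1272", "0.06"]``: distinct,
+    non-empty, at least two characters (single-character tokens are too
+    noisy to match on), capped at 50 per instance.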
+ """ + try: + from openpyxl import load_workbook + + wb = load_workbook(str(answer_xlsx), data_only=True, read_only=True) + seen: set[str] = set() + values: list[str] = [] + for sheet_name, (r0, c0, r1, c1) in regions: + if sheet_name and sheet_name in wb.sheetnames: + ws = wb[sheet_name] + elif wb.worksheets: + ws = wb.worksheets[0] + else: + continue + for row in ws.iter_rows(min_row=r0, max_row=r1, min_col=c0, + max_col=c1, values_only=True): + for v in row: + if v is None: + continue + s = str(v).strip() + if len(s) >= 2 and s not in seen: + seen.add(s) + values.append(s) + if len(values) >= 50: + wb.close() + return values + wb.close() + return values + except Exception: + return [] + + +# ────────────────────────────────────────────────────────────── aggregation + + +def aggregate(results: list[InstanceResult]) -> dict[str, Any]: + by_parser: dict[str, list[InstanceResult]] = {} + for r in results: + by_parser.setdefault(r.parser, []).append(r) + + summary: dict[str, Any] = {} + for parser, recs in by_parser.items(): + total = len(recs) + errors = sum(1 for r in recs if r.error) + ok = total - errors + + def _recall_at(k: int, key: str) -> float: + hits = 0 + denom = 0 + for r in recs: + if r.error: + continue + rank = getattr(r, key) + if rank is None: + denom += 1 # parser produced chunks but missed the answer + continue + denom += 1 + if rank <= k: + hits += 1 + return hits / denom if denom else 0.0 + + # fragmentation: among instances where the input data region is + # covered, how many chunks does it span? 1 = clean, >1 = fragmented. + # We only count single-region instances so n_chunks_overlap is + # directly comparable; multi-region instances would inflate by + # design. + frags = [r.chunks_overlapping_data for r in recs + if not r.error and r.data_regions == 1 + and r.chunks_overlapping_data > 0] + n_with_overlap = len(frags) + n_clean = sum(1 for f in frags if f == 1) + n_frag = n_with_overlap - n_clean + frag_rate = (n_frag / n_with_overlap) if n_with_overlap else 0.0 + + parse_times = [r.parse_ms for r in recs if not r.error] + + summary[parser] = { + "instances": total, + "ok": ok, + "errors": errors, + "recall_geometric@1": _recall_at(1, "rank_of_first_overlap"), + "recall_geometric@3": _recall_at(3, "rank_of_first_overlap"), + "recall_geometric@5": _recall_at(5, "rank_of_first_overlap"), + "recall_text@1": _recall_at(1, "rank_of_text_match"), + "recall_text@3": _recall_at(3, "rank_of_text_match"), + "recall_text@5": _recall_at(5, "rank_of_text_match"), + "table_integrity_clean": n_clean, + "table_integrity_fragmented": n_frag, + "table_fragmentation_rate": round(frag_rate, 4), + "mean_parse_ms": round(sum(parse_times) / len(parse_times), 2) + if parse_times else None, + "p50_parse_ms": round(sorted(parse_times)[len(parse_times) // 2], 2) + if parse_times else None, + } + + return summary + + +# ────────────────────────────────────────────────────────────── main + + +def iter_instances(corpus: Path) -> Iterable[dict[str, Any]]: + ds = corpus / "dataset.json" + if not ds.exists(): + raise FileNotFoundError(f"dataset.json not found in {corpus}") + data = json.loads(ds.read_text()) + if not isinstance(data, list): + raise ValueError("dataset.json should be a list of instances") + yield from data + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--corpus", type=Path, + default=REPO_ROOT / "data" / "corpora" / "spreadsheetbench" + / "all_data_912_v0.1") + parser.add_argument("--out", type=Path, 
+ default=REPO_ROOT / "tests" / "benchmarks" + / "reports" / "retrieval") + parser.add_argument("--parsers", type=str, default="ks,docling") + parser.add_argument("--sample", type=int, default=None, + help="Random-sample N instances (seeded).") + parser.add_argument("--seed", type=int, default=1337) + parser.add_argument("--model", type=str, default="BAAI/bge-small-en-v1.5") + parser.add_argument("--test-case", type=int, default=1, + help="Which of the (typically 3) test cases per instance " + "to score on. We use one to keep eval costs bounded.") + parser.add_argument("--per-parser-timeout", type=float, default=60.0, + help="Wall-clock seconds before a parser is " + "considered hung on a single file (docling can " + "loop forever on pathological table layouts).") + args = parser.parse_args(argv) + + instances = list(iter_instances(args.corpus)) + if args.sample is not None and args.sample < len(instances): + import random + rng = random.Random(args.seed) + instances = rng.sample(instances, args.sample) + sys.stderr.write(f"Scoring {len(instances)} SpreadsheetBench instances\n") + + selected = {p.strip() for p in args.parsers.split(",")} + parser_fns: dict[str, Any] = {} + if "ks" in selected: + parser_fns["ks-xlsx-parser"] = extract_chunks_ks + if "docling" in selected: + parser_fns["docling"] = extract_chunks_docling + if not parser_fns: + sys.stderr.write("no valid parsers selected\n") + return 2 + + # Load embedding model + sys.stderr.write(f"Loading embedding model: {args.model}\n") + from sentence_transformers import SentenceTransformer + model = SentenceTransformer(args.model) + + from datetime import UTC, datetime + stamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%S") + out_dir = args.out / stamp + out_dir.mkdir(parents=True, exist_ok=True) + ndjson_path = out_dir / "results.ndjson" + + results: list[InstanceResult] = [] + n = len(instances) * len(parser_fns) + done = 0 + + with ndjson_path.open("w") as f: + for inst in instances: + inst_id = str(inst["id"]) + instr = inst["instruction"] + data_pos = inst.get("data_position") or "" + answer_pos = inst.get("answer_position") or "" + default_sheet = inst.get("answer_sheet") or None + if default_sheet and "," in default_sheet: + # answer_sheet is multi-sheet; pick the first as default, + # the per-region parsers will override anyway. + default_sheet = default_sheet.split(",")[0].strip() + + inst_dir = args.corpus / "spreadsheet" / inst_id + input_path = inst_dir / f"{args.test_case}_{inst_id}_input.xlsx" + answer_path = inst_dir / f"{args.test_case}_{inst_id}_answer.xlsx" + + if not input_path.exists() or not answer_path.exists(): + done += len(parser_fns) + sys.stderr.write(f"\r[{done}/{n}] skipped (files missing): {inst_id}\n") + continue + + # Geometric ground truth: 561/912 instances leave data_position + # empty. For those, the question targets the answer region in + # the input file (the answer cells already exist there as a + # template the system rewrites). Fall back to answer_position + # so we exercise every instance. + geom_spec = data_pos or answer_pos + data_pos_for_record = geom_spec + + # Cell values come from answer.xlsx in the answer regions — + # that's what the question is asking the system to surface. 
+            answer_regions = parse_position_spec(answer_pos, default_sheet)
+            answer_values = (
+                read_answer_cell_values(answer_path, answer_regions)
+                if answer_regions else []
+            )
+
+            for parser_name, extract_fn in parser_fns.items():
+                res = score_instance(
+                    parser_name=parser_name,
+                    extract_fn=extract_fn,
+                    input_path=input_path,
+                    instruction=instr,
+                    data_position=geom_spec,
+                    answer_position=answer_pos,
+                    default_sheet=default_sheet,
+                    answer_cell_values=answer_values,
+                    model=model,
+                    per_parser_timeout_s=args.per_parser_timeout,
+                )
+                results.append(res)
+                f.write(json.dumps({
+                    "instance_id": res.instance_id,
+                    "parser": res.parser,
+                    "n_chunks": res.n_chunks,
+                    "parse_ms": res.parse_ms,
+                    "data_position": res.data_position,
+                    "answer_position": res.answer_position,
+                    "data_regions": res.data_regions,
+                    "chunks_overlapping_data": res.chunks_overlapping_data,
+                    "rank_of_first_overlap": res.rank_of_first_overlap,
+                    "rank_of_text_match": res.rank_of_text_match,
+                    "error": res.error,
+                }, separators=(",", ":")) + "\n")
+                done += 1
+                if done % 10 == 0:
+                    sys.stderr.write(f"\r[{done}/{n}] ")
+                    sys.stderr.flush()
+
+    sys.stderr.write(f"\nWrote {ndjson_path}\n")
+
+    summary = aggregate(results)
+    summary_path = out_dir / "summary.json"
+    summary_path.write_text(json.dumps(summary, indent=2))
+    sys.stderr.write(f"Wrote {summary_path}\n")
+
+    # Human-readable summary
+    md_lines = ["# Retrieval-recall benchmark (SpreadsheetBench)\n"]
+    md_lines.append(f"- Corpus: `{args.corpus}`")
+    md_lines.append(f"- Instances scored: {len(instances)}")
+    md_lines.append(f"- Embedding model: `{args.model}`")
+    md_lines.append("")
+    parsers = sorted(summary.keys())
+    md_lines.append("| Metric | " + " | ".join(parsers) + " |")
+    md_lines.append("|---|" + "|".join(["---"] * len(parsers)) + "|")
+    metrics = [
+        ("recall_geometric@1", "Recall@1 (geometric)"),
+        ("recall_geometric@3", "Recall@3 (geometric)"),
+        ("recall_geometric@5", "Recall@5 (geometric)"),
+        ("recall_text@1", "Recall@1 (text-match)"),
+        ("recall_text@3", "Recall@3 (text-match)"),
+        ("recall_text@5", "Recall@5 (text-match)"),
+        ("table_fragmentation_rate", "Fragmentation rate"),
+        ("mean_parse_ms", "Mean parse ms"),
+        ("p50_parse_ms", "P50 parse ms"),
+        ("errors", "Errors"),
+    ]
+    for key, label in metrics:
+        row = [label]
+        for p in parsers:
+            v = summary[p].get(key)
+            if v is None:
+                row.append("—")
+            elif isinstance(v, float):
+                row.append(f"{v:.3f}")
+            else:
+                row.append(str(v))
+        md_lines.append("| " + " | ".join(row) + " |")
+    md_lines.append("")
+    md_lines.append("**Geometric overlap** = chunk's reported A1 range overlaps the "
+                    "ground-truth region (`data_position`, falling back to "
+                    "`answer_position` when `data_position` is empty). Requires the "
+                    "parser to surface (sheet, range) per chunk — docling does not, "
+                    "so its geometric recall is structurally 0.")
+    md_lines.append("")
+    md_lines.append("**Text-match** = the answer cell's actual string value appears "
+                    "as a substring of the chunk's text, after numeric/date/boolean "
+                    "normalization on both sides. Parser-agnostic; this is the "
+                    "apples-to-apples retrieval comparison.")
+    md_lines.append("")
+    (out_dir / "summary.md").write_text("\n".join(md_lines) + "\n")
+    sys.stderr.write(f"Wrote {out_dir / 'summary.md'}\n")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/summarize_retrieval.py b/scripts/summarize_retrieval.py
new file mode 100644
index 0000000..ba31566
--- /dev/null
+++ b/scripts/summarize_retrieval.py
@@ -0,0 +1,154 @@
+"""
+Re-aggregate a retrieval-bench `results.ndjson` into summary.json / summary.md.
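+
+Each input line is one per-(instance, parser) record as written by
+`eval_retrieval.py`, e.g. (abridged; values hypothetical):
+
+    {"instance_id":"118-8","parser":"docling","n_chunks":4,"parse_ms":812.3,
+     "rank_of_first_overlap":null,"rank_of_text_match":2,"error":null}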
+
+Useful when:
+  - `eval_retrieval.py` was interrupted (Ctrl-C, OOM, hang-watchdog) and
+    didn't get to run its end-of-run aggregator
+  - You want to inspect aggregates without re-running
+
+Usage:
+    python scripts/summarize_retrieval.py <path/to/results.ndjson>
+
+Writes summary.json + summary.md next to the input file. Same metrics
+the live aggregator emits — keep the two in sync if you change them.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+
+def aggregate(records: list[dict]) -> dict:
+    by_parser: dict[str, list[dict]] = defaultdict(list)
+    for r in records:
+        by_parser[r["parser"]].append(r)
+
+    out: dict[str, dict] = {}
+    for parser, recs in by_parser.items():
+        total = len(recs)
+        errors = sum(1 for r in recs if r.get("error"))
+        ok = total - errors
+
+        def _recall_at(k: int, key: str) -> float:
+            denom = 0
+            hits = 0
+            for r in recs:
+                if r.get("error"):
+                    continue
+                rank = r.get(key)
+                if rank is None:
+                    denom += 1
+                    continue
+                denom += 1
+                if rank <= k:
+                    hits += 1
+            return hits / denom if denom else 0.0
+
+        frags = [r["chunks_overlapping_data"] for r in recs
+                 if not r.get("error") and r.get("data_regions") == 1
+                 and r.get("chunks_overlapping_data", 0) > 0]
+        n_clean = sum(1 for f in frags if f == 1)
+        n_frag = len(frags) - n_clean
+        frag_rate = (n_frag / len(frags)) if frags else 0.0
+
+        parse_times = [r["parse_ms"] for r in recs
+                       if not r.get("error") and r.get("parse_ms") is not None]
+
+        out[parser] = {
+            "instances": total,
+            "ok": ok,
+            "errors": errors,
+            "recall_geometric@1": _recall_at(1, "rank_of_first_overlap"),
+            "recall_geometric@3": _recall_at(3, "rank_of_first_overlap"),
+            "recall_geometric@5": _recall_at(5, "rank_of_first_overlap"),
+            "recall_text@1": _recall_at(1, "rank_of_text_match"),
+            "recall_text@3": _recall_at(3, "rank_of_text_match"),
+            "recall_text@5": _recall_at(5, "rank_of_text_match"),
+            "table_integrity_clean": n_clean,
+            "table_integrity_fragmented": n_frag,
+            "table_fragmentation_rate": round(frag_rate, 4),
+            "mean_parse_ms": round(sum(parse_times) / len(parse_times), 2)
+            if parse_times else None,
+            "p50_parse_ms": round(sorted(parse_times)[len(parse_times) // 2], 2)
+            if parse_times else None,
+        }
+    return out
+
+
+def render_md(summary: dict, source: Path, partial: bool) -> str:
+    parsers = sorted(summary.keys())
+    lines = ["# Retrieval-recall benchmark (SpreadsheetBench)\n"]
+    lines.append(f"- Source NDJSON: `{source}`")
+    n_total = sum(s["instances"] for s in summary.values())
+    n_per = n_total // max(len(parsers), 1)
+    lines.append(f"- Records: {n_total} ({n_per} per parser){' ⚠️ PARTIAL RUN — bench interrupted before completion' if partial else ''}")
+    lines.append("- Embedding model: `BAAI/bge-small-en-v1.5`")
+    lines.append("")
+    lines.append("| Metric | " + " | ".join(parsers) + " |")
+    lines.append("|---|" + "|".join(["---"] * len(parsers)) + "|")
+    metrics = [
+        ("recall_geometric@1", "Recall@1 (geometric)"),
+        ("recall_geometric@3", "Recall@3 (geometric)"),
+        ("recall_geometric@5", "Recall@5 (geometric)"),
+        ("recall_text@1", "Recall@1 (text-match)"),
+        ("recall_text@3", "Recall@3 (text-match)"),
+        ("recall_text@5", "Recall@5 (text-match)"),
+        ("table_fragmentation_rate", "Fragmentation rate"),
+        ("mean_parse_ms", "Mean parse ms"),
+        ("p50_parse_ms", "P50 parse ms"),
+        ("errors", "Errors"),
+    ]
+    for key, label in metrics:
+        row = [label]
+        for p in parsers:
+            v = summary[p].get(key)
+            if v is None:
+                row.append("—")
+            elif isinstance(v, float):
+                row.append(f"{v:.3f}")
+            else:
+                row.append(str(v))
+        lines.append("| " + " | ".join(row) + " |")
+    lines.append("")
+    lines.append("**Geometric overlap** = chunk's reported A1 range overlaps the "
+                 "ground-truth region (`data_position`, falling back to "
+                 "`answer_position` when `data_position` is empty). Requires the "
+                 "parser to surface (sheet, range) per chunk — docling does not, "
+                 "so its geometric recall is structurally 0.")
+    lines.append("")
+    lines.append("**Text-match** = the answer cell's actual string value appears "
+                 "as a substring of the chunk's text, after numeric/date/boolean "
+                 "normalization on both sides. Parser-agnostic; this is the "
+                 "apples-to-apples retrieval comparison.")
+    return "\n".join(lines) + "\n"
+
+
+def main(argv: list[str]) -> int:
+    if len(argv) != 2:
+        sys.stderr.write("usage: python scripts/summarize_retrieval.py <results.ndjson>\n")
+        return 2
+    ndjson = Path(argv[1]).resolve()
+    if not ndjson.exists():
+        sys.stderr.write(f"file not found: {ndjson}\n")
+        return 2
+
+    records = [json.loads(line) for line in ndjson.read_text().splitlines() if line.strip()]
+    summary = aggregate(records)
+
+    out_dir = ndjson.parent
+    (out_dir / "summary.json").write_text(json.dumps(summary, indent=2) + "\n")
+
+    # If counts per parser are unequal, treat as partial.
+    counts = [summary[p]["instances"] for p in summary]
+    partial = len(set(counts)) > 1 if counts else False
+    (out_dir / "summary.md").write_text(render_md(summary, ndjson, partial))
+
+    sys.stderr.write(f"Wrote {out_dir / 'summary.json'}\n")
+    sys.stderr.write(f"Wrote {out_dir / 'summary.md'}\n")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
diff --git a/src/chunking/segmenter.py b/src/chunking/segmenter.py
index bca58ac..1b7898a 100644
--- a/src/chunking/segmenter.py
+++ b/src/chunking/segmenter.py
@@ -21,7 +21,6 @@ from __future__ import annotations
 
 import logging
-from collections import defaultdict
 
 from models.block import BlockDTO
 from models.common import BlockType, CellCoord, CellRange
@@ -98,13 +97,7 @@ def segment(self) -> list[BlockDTO]:
             components = self._find_connected_components(
                 non_table_cells, adaptive_row_gap, adaptive_col_gap
             )
-            # Step 2b: Split components at style boundaries
-            refined_components = []
-            for component in components:
-                sub_components = self._detect_style_boundaries(component)
-                refined_components.extend(sub_components)
-
-            for component_cells in refined_components:
+            for component_cells in components:
                 block = self._classify_component(component_cells, len(blocks))
                 blocks.append(block)
 
@@ -251,74 +244,6 @@ def _merge_title_blocks(self, blocks: list[BlockDTO]) -> list[BlockDTO]:
 
         return [b for idx, b in enumerate(merged) if idx not in absorbed]
 
-    def _detect_style_boundaries(self, cells: list) -> list[list]:
-        """
-        Split a component at persistent fill-color discontinuities.
-
-        Only splits on fill/background color changes (not bold, which is
-        expected for header rows). Requires the change to persist for 3+
-        rows and both sides of the boundary must have 3+ rows to avoid
-        splitting headers from their data.
- """ - if len(cells) <= 1: - return [cells] - - # Group cells by row - rows: dict[int, list] = defaultdict(list) - for cell in cells: - rows[cell.coord.row].append(cell) - - sorted_row_nums = sorted(rows.keys()) - if len(sorted_row_nums) <= 5: - # Too few rows to meaningfully split by style - return [cells] - - # Compute fill-only style signature per row (ignore bold) - def _row_fill_sig(row_cells: list) -> str: - parts = [] - for c in sorted(row_cells, key=lambda x: x.coord.col): - fg = "" - if c.style and c.style.fill and c.style.fill.fg_color: - fg = c.style.fill.fg_color - parts.append(fg) - return ";".join(parts) - - signatures = {r: _row_fill_sig(rows[r]) for r in sorted_row_nums} - - # Find split points: persistent fill color changes (3+ rows on each side) - split_rows: list[int] = [] - for i in range(3, len(sorted_row_nums) - 2): - curr_row = sorted_row_nums[i] - prev_row = sorted_row_nums[i - 1] - if signatures[curr_row] != signatures[prev_row]: - # Verify persistence: check 2 more rows after the change - next1 = sorted_row_nums[i + 1] if i + 1 < len(sorted_row_nums) else None - next2 = sorted_row_nums[i + 2] if i + 2 < len(sorted_row_nums) else None - if ( - next1 is not None - and next2 is not None - and signatures.get(next1) == signatures[curr_row] - and signatures.get(next2) == signatures[curr_row] - ): - split_rows.append(curr_row) - - if not split_rows: - return [cells] - - # Split cells into groups at split rows - components = [] - current_cells = [] - split_set = set(split_rows) - for row_num in sorted_row_nums: - if row_num in split_set and current_cells: - components.append(current_cells) - current_cells = [] - current_cells.extend(rows[row_num]) - if current_cells: - components.append(current_cells) - - return [c for c in components if c] - def segment_with_details(self) -> tuple[list[BlockDTO], list[list]]: """ Segment the sheet and also return raw connected components. diff --git a/src/ks_xlsx_parser/__init__.py b/src/ks_xlsx_parser/__init__.py index 0c938bf..184452b 100644 --- a/src/ks_xlsx_parser/__init__.py +++ b/src/ks_xlsx_parser/__init__.py @@ -11,7 +11,7 @@ """ from __future__ import annotations -__version__ = "0.1.1" +__version__ = "0.2.0" from pipeline import ( # noqa: F401 ParseResult, diff --git a/src/parsers/cell_parser.py b/src/parsers/cell_parser.py index ee41d70..cc22092 100644 --- a/src/parsers/cell_parser.py +++ b/src/parsers/cell_parser.py @@ -319,12 +319,23 @@ def _extract_font(self, cell: OpenpyxlCell) -> FontStyle | None: ) def _extract_fill(self, cell: OpenpyxlCell) -> FillStyle | None: - """Extract fill/background properties from a cell.""" + """Extract fill/background properties from a cell. + + Cells can carry a ``GradientFill`` instead of a ``PatternFill``; + gradient fills have ``stops`` + ``type`` but no ``patternType`` / + ``fgColor`` attributes. We don't model gradients (rare in finance + spreadsheets, and the ``FillStyle`` DTO is pattern-shaped), but + accessing ``patternType`` on one raises ``AttributeError`` and + crashes the sheet parser — losing every cell on the sheet. So we + defensively skip non-PatternFill objects rather than propagate. 
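+
+        (openpyxl sketch: a gradient cell carries
+        ``GradientFill(type='linear', degree=90, stop=[...])``, i.e.
+        ``stop``/``degree`` attributes where a ``PatternFill`` has
+        ``patternType``.)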
+        """
         f = cell.fill
-        if not f or not f.patternType or f.patternType == "none":
+        if not f or not hasattr(f, "patternType"):
+            return None
+        if not f.patternType or f.patternType == "none":
             return None
-        fg = self._extract_color(f.fgColor) if f.fgColor else None
-        bg = self._extract_color(f.bgColor) if f.bgColor else None
+        fg = self._extract_color(f.fgColor) if getattr(f, "fgColor", None) else None
+        bg = self._extract_color(f.bgColor) if getattr(f, "bgColor", None) else None
         if not fg and not bg:
             return None
         return FillStyle(pattern_type=f.patternType, fg_color=fg, bg_color=bg)
diff --git a/src/rendering/text_renderer.py b/src/rendering/text_renderer.py
index 525609b..76b9b76 100644
--- a/src/rendering/text_renderer.py
+++ b/src/rendering/text_renderer.py
@@ -8,6 +8,7 @@
 
 from __future__ import annotations
 
+import datetime as _dt
 import logging
 
 from models.block import BlockDTO
@@ -18,6 +19,75 @@
 
 logger = logging.getLogger(__name__)
 
 
+def _flatten_cell_text(val: str) -> str:
+    """Collapse embedded line breaks so a cell stays on one row of the
+    Markdown grid. Excel headers often contain `\\n` to wrap text visually
+    (e.g. ``"租金\\n天数"``); rendered into ``| ... |`` rows verbatim they
+    rip the grid apart."""
+    if "\n" not in val and "\r" not in val:
+        return val
+    return val.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
+
+
+def _format_number_for_retrieval(raw: int | float) -> str:
+    """Render a numeric raw value in a retrieval-friendly form.
+
+    Excel's ``display_value`` honours the cell's number-format string,
+    which produces ``"1,272.00"`` for 1272 or ``"6%"`` for 0.06. Those
+    are great for humans but defeat substring-match retrieval — a user
+    asking "what was the value in 2020?" types ``1272``, not ``1,272.00``.
+
+    Rules:
+    - Integer-valued floats → ``str(int(v))`` (1272.0 → "1272")
+    - Integers → ``str(v)`` (1272 → "1272")
+    - Floats → ``g`` format up to 10 significant digits, trailing
+      zeros trimmed; non-finite values render as ``inf``/``nan``.
+      Avoids sci-notation for ordinary magnitudes and ``.0`` noise.
+    """
+    if isinstance(raw, bool):  # bool is a subclass of int
+        return "TRUE" if raw else "FALSE"
+    if isinstance(raw, int):
+        return str(raw)
+    # float: bound-check before int(), since int(raw) raises on inf/nan
+    if -1e16 < raw < 1e16 and raw == int(raw):
+        return str(int(raw))
+    return f"{raw:.10g}"
+
+
+def _cell_render_value(cell) -> str:
+    """Pick the string form of `cell` that's best for RAG retrieval.
+
+    For *numeric* cells we ignore the display-formatted string and emit
+    the raw value verbatim — Excel's commas, percent signs, trailing
+    zeros, and currency symbols all defeat substring search.
+
+    For dates we emit ISO ``YYYY-MM-DD`` (no time component) which is
+    both human-readable and matches the date format that openpyxl /
+    pandas surface when reading the answer file.
+
+    Strings and everything else fall back to ``display_value``.
+    """
+    if cell is None:
+        return ""
+    raw = cell.raw_value
+
+    if isinstance(raw, (_dt.date, _dt.datetime)):
+        if isinstance(raw, _dt.datetime):
+            if raw.hour == 0 and raw.minute == 0 and raw.second == 0:
+                return raw.date().isoformat()
+            return raw.isoformat(sep=" ")
+        return raw.isoformat()
+
+    if isinstance(raw, (int, float)) and not isinstance(raw, bool):
+        return _format_number_for_retrieval(raw)
+
+    if cell.display_value is not None:
+        return str(cell.display_value)
+    if raw is not None:
+        return str(raw)
+    return ""
+
+
 class TextRenderer:
     """
     Renders blocks as plain text with coordinate context.
@@ -54,16 +124,23 @@ def render_block(self, block: BlockDTO) -> str: header += f' table: "{block.table_name}"' lines.append(header) - # Compute column widths + # Compute column widths using the SAME rendering rules the data + # rows will use, including the trailing `[=]` formula marker. + # Otherwise `[=]` inflates a cell past col_width post-hoc and + # spuriously triggers the long-value fallback below. col_widths: dict[int, int] = {} for col in cols: col_letter = col_number_to_letter(col) max_width = len(col_letter) for row in rows: cell = self._sheet.get_cell(row, col) - if cell: - val = cell.display_value or (str(cell.raw_value) if cell.raw_value is not None else "") - max_width = max(max_width, len(val)) + if cell is None: + continue + val = _cell_render_value(cell) + if cell.formula and not val.startswith("="): + val = f"{val} [=]" + val = _flatten_cell_text(val) + max_width = max(max_width, len(val)) col_widths[col] = min(max_width, 30) # Cap at 30 for alignment; text may overflow # Column header row @@ -89,23 +166,20 @@ def render_block(self, block: BlockDTO) -> str: if col in self._sheet.hidden_cols: continue cell = self._sheet.get_cell(row, col) - val = "" - if cell: - if cell.display_value is not None: - val = str(cell.display_value) - elif cell.raw_value is not None: - val = str(cell.raw_value) - - # Annotate formulas with a marker (unless display already shows the formula) - if cell.formula and not val.startswith("="): - val = f"{val} [=]" - - # For long numeric values: use scientific notation (preserves precision). - # Text strings are never truncated. - if len(val) > col_widths[col]: - raw = cell.raw_value - if isinstance(raw, (int, float)): - val = f"{float(raw):.6e}" + val = _cell_render_value(cell) if cell else "" + + if cell and cell.formula and not val.startswith("="): + val = f"{val} [=]" + + # Markdown table rows are single-line; collapse embedded newlines + # (common in headers like "租金\n天数") so they don't break the grid. + val = _flatten_cell_text(val) + + # Long-value fallback: only triggers if the rendered string + # genuinely exceeds the (now consistently-computed) column + # width — i.e. the column was capped at 30. We still emit + # the full retrieval value (no truncation) and let the + # alignment overflow; truncating destroys retrievability. values.append(val.ljust(col_widths[col])) line = "| " + " | ".join(values) + " |" diff --git a/tests/benchmarks/README.md b/tests/benchmarks/README.md new file mode 100644 index 0000000..1cb056f --- /dev/null +++ b/tests/benchmarks/README.md @@ -0,0 +1,86 @@ +# xlsx parser benchmarks + +Two benchmarks, both reproducible: + +| Benchmark | What it measures | Corpus | Cost | +|---|---|---|---| +| `vs_hucre.py` (structural) | Parse-success rate + structural counts (cells, formulas, tables, merges, etc.) across many files | `testBench/` (53 curated) or `data/corpora/spreadsheetbench/` (5,458 real-world) | Cheap — 1–20 min | +| `scripts/eval_retrieval.py` (chunk quality) | Recall@k for retrieving the relevant chunk given a natural-language instruction, + table-integrity fragmentation rate | SpreadsheetBench `dataset.json` (912 instruction + position pairs) | Medium — 10 min on 100 instances | + +## 1. Structural benchmark — `vs_hucre.py` + +Long-running NDJSON-protocol workers, per-file timeout, batch respawn, randomized parser order. Adapters live under `adapters/`; runners in `_runner.py`. Add a new parser by: + +1. 
Write `adapters/<name>_adapter.py` that speaks the protocol (see `ks_adapter.py` and `docling_adapter.py` as references).
+2. Add a runner factory in `_runner.py`.
+3. Wire it into `vs_hucre.py`'s `--parsers` handling.
+
+Supported parsers today: `ks` (ks-xlsx-parser), `hucre` (TypeScript, requires `pnpm install` under `hucre_node/`), `docling` (IBM Docling — `uv pip install docling`).
+
+```bash
+# Quick smoke (50 random files from testBench)
+PYTHONPATH=src uv run python -m tests.benchmarks.vs_hucre \
+    --corpus testBench --sample 50 --parsers ks
+
+# Robustness on full SpreadsheetBench (5,458 files, ~20 min)
+PYTHONPATH=src uv run python -m tests.benchmarks.vs_hucre \
+    --corpus data/corpora/spreadsheetbench --parsers ks \
+    --per-file-timeout 120 --out tests/benchmarks/reports/spreadsheetbench
+
+# ks vs docling on a sample
+PYTHONPATH=src uv run python -m tests.benchmarks.vs_hucre \
+    --corpus data/corpora/spreadsheetbench --sample 100 \
+    --parsers ks,docling
+```
+
+Outputs (per run, timestamped subdir):
+- `results.csv` — one row per (file, parser), schema-validated
+- `raw.ndjson` — full records with `extra` fields preserved
+- `failures.jsonl` — `status != "ok"` rows
+- `summary.md` — status matrix, capability matrix, aggregate counts, perf percentiles, per-sub-corpus breakdown
+- `drift.md` — pairwise feature-count disagreement between two parsers
+- `manifest.json` — git sha, Python/Node version, timestamp
+
+### Schema notes (`_schema.py`)
+
+Null vs zero is **load-bearing**: `None` means the parser doesn't model a feature, `0` means it does and observed zero. Drift and capability reports treat them differently. Adapters must respect this.
+
+## 2. Chunk-quality benchmark — `scripts/eval_retrieval.py`
+
+Uses SpreadsheetBench's `dataset.json` (912 instances) — each has an `instruction`, a `data_position`, and an `answer_position`. We:
+
+1. Parse the input `.xlsx` with each parser → list of `(text, sheet, A1 range)` chunks.
+2. Embed all chunks + instruction with sentence-transformers (BGE-small by default).
+3. Rank chunks by cosine similarity to the instruction.
+4. Check whether the chunk containing `data_position` (falling back to `answer_position` for the 561 instances that omit `data_position`) is in top-k.
+
+**Two recall metrics**:
+- **Geometric overlap** — chunk's `A1` range overlaps the ground-truth region. Requires parsers to surface (sheet, range) per chunk; docling doesn't, so its geometric recall is structurally zero. Useful for measuring ks against its own past output.
+- **Text-match** — the answer cell's actual string value appears as a substring of the chunk's text. Parser-agnostic — this is the apples-to-apples retrieval metric for cross-parser comparison.
+
+**Fragmentation rate** — for single-region instances, how many chunks does the data region span? `1` = clean; `>1` = the table was fragmented across chunks (the 租赁收入计提表-class regression).
+
+```bash
+# Sample 100 instances, both parsers
+PYTHONPATH=src uv run python scripts/eval_retrieval.py \
+    --sample 100 --parsers ks,docling
+
+# Full corpus (912 instances) — single-parser baseline
+PYTHONPATH=src uv run python scripts/eval_retrieval.py --parsers ks
+```
+
+Outputs: `results.ndjson` (per-instance per-parser), `summary.json` (machine-readable aggregate), `summary.md` (human-readable table).
+
+## Corpora
+
+Run `scripts/download_corpora.sh` once to populate `data/corpora/` (gitignored). 
Currently fetches: +- SpreadsheetBench v0.1 (912 task instances × ~6 xlsx each = 5,458 files) +- EUSES (mostly .xls) +- Enron spreadsheets (mostly .xls) +- A handful of SheetJS / openpyxl sample xlsx + +## Caveats + +- **Marker is intentionally absent.** Marker's xlsx pipeline goes xlsx → HTML → PDF (WeasyPrint) → markdown via PDF layout-recognition models. On a CPU-only machine it took >30 min on a single 1k-row workbook; not viable for a 5,458-file corpus. The structural framework supports adding a marker adapter (see `adapters/docling_adapter.py` as a template) — the speed wall is the obstacle, not the integration. +- **Memory measurement** in `_mem.py` is RSS-delta and approximate (±30%). For trustworthy memory numbers run one parser per process and look at peak RSS reported by the OS. +- **The retrieval embedding model** matters. BGE-small-en-v1.5 is the default for speed; switch with `--model BAAI/bge-large-en-v1.5` for better recall at ~10× the compute. diff --git a/tests/benchmarks/_runner.py b/tests/benchmarks/_runner.py index 8131496..2bdb95c 100644 --- a/tests/benchmarks/_runner.py +++ b/tests/benchmarks/_runner.py @@ -254,3 +254,20 @@ def hucre_runner(timeout_s: float = 120.0) -> Runner: cwd=node_dir, per_file_timeout_s=timeout_s, )) + + +def docling_runner(python_bin: str | None = None, timeout_s: float = 120.0) -> Runner: + """Runner for the IBM Docling Python adapter. + + Loads docling ML models once at worker startup (a few seconds) and + reuses them across files. Per-file timeout still applies — large + workbooks routinely take 10-60s through docling's pipeline. + """ + py = python_bin or sys.executable + return Runner(RunnerConfig( + name="docling", + cmd=[py, "-m", "tests.benchmarks.adapters.docling_adapter"], + cwd=REPO_ROOT, + per_file_timeout_s=timeout_s, + batch_size=200, # bigger batches — model load is the bottleneck + )) diff --git a/tests/benchmarks/_schema.py b/tests/benchmarks/_schema.py index 3641fbd..3a7202c 100644 --- a/tests/benchmarks/_schema.py +++ b/tests/benchmarks/_schema.py @@ -117,7 +117,13 @@ def validate_record(d: dict[str, Any]) -> None: raise ValueError(f"unknown status: {status!r}") if status == "ok": - for numeric in ("sheets", "cells", "formulas", "parse_time_ms"): + # `cells` and `parse_time_ms` MUST be populated — every parser + # extracts cells and we always measure parse time. + # `sheets` MUST be populated — every spreadsheet parser knows how + # many sheets it processed. + # `formulas` is parser-capability dependent (docling, marker + # don't model formulas), so None is legal here even on ok. + for numeric in ("sheets", "cells", "parse_time_ms"): if d[numeric] is None: raise ValueError(f"status=ok but {numeric} is None") diff --git a/tests/benchmarks/adapters/docling_adapter.py b/tests/benchmarks/adapters/docling_adapter.py new file mode 100644 index 0000000..17d4b02 --- /dev/null +++ b/tests/benchmarks/adapters/docling_adapter.py @@ -0,0 +1,233 @@ +""" +Docling worker: same NDJSON-on-stdio protocol as ks_adapter, but for the +IBM Docling DocumentConverter. + +Docling treats xlsx as a document → markdown. We map its native objects +to the BenchmarkRecord schema as faithfully as we can; capabilities it +does NOT model are left as ``None`` (the schema's load-bearing distinction +between "feature absent in this file" and "feature not modeled at all"). 
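+
+Concretely: a workbook with three plain tables and no merged cells is
+reported as ``tables=3, merges=0`` (modeled, observed zero) but
+``formulas=None`` (not modeled), never ``formulas=0``.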
+ +What docling models for xlsx: + - sheets ← one table per sheet (its convention) + - cells ← sum of table cells across tables + - tables ← len(doc.tables) + - merges ← cells with row_span/col_span > 1 + +What it does NOT model: + - formulas, formula_dependencies — computed values only, no `=…` + - charts, chart_types + - pivots + - conditional formatting / data validation + - named ranges + - hyperlinks per cell, comments, sparklines + - images + +Chunks: docling has a chunker but we keep this adapter shallow. We export +the full document to markdown and emit it as a single chunk; the retrieval +metric splits on table boundaries and headings downstream. +""" + +from __future__ import annotations + +import json +import os +import sys +import time +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + +_HERE = Path(__file__).resolve().parent +sys.path.insert(0, str(_HERE.parent.parent.parent)) # repo root + +from tests.benchmarks._mem import peak_rss_mb # noqa: E402 +from tests.benchmarks._schema import BenchmarkRecord, SCHEMA_VERSION # noqa: E402 + +PARSER_NAME = "docling" +MAX_ERR_LEN = 500 + + +def _write(obj: dict[str, Any]) -> None: + sys.stdout.write(json.dumps(obj, separators=(",", ":")) + "\n") + sys.stdout.flush() + + +def _build_converter(): + """Build a DocumentConverter once and reuse across files.""" + from docling.document_converter import DocumentConverter + + return DocumentConverter() + + +def _docling_version() -> str: + try: + import importlib.metadata + + return importlib.metadata.version("docling") + except Exception: + try: + import docling + + return getattr(docling, "__version__", "unknown") + except Exception: + return "unknown" + + +def _count_features(doc, file_size: int, path: str, parse_time_ms: float, + peak_mb: float, commit: str, markdown: str) -> BenchmarkRecord: + # Tables: docling produces one TableItem per detected table region. + tables = list(doc.tables) + n_tables = len(tables) + + # Cells: sum of table_cells. Docling does not surface cell counts + # outside tables, so this undercounts free-floating text cells. + n_cells = 0 + n_merges = 0 + for t in tables: + cells = t.data.table_cells if t.data else [] + for c in cells: + n_cells += 1 + row_span = (c.end_row_offset_idx or 0) - (c.start_row_offset_idx or 0) + col_span = (c.end_col_offset_idx or 0) - (c.start_col_offset_idx or 0) + if row_span > 1 or col_span > 1: + n_merges += 1 + + # Sheets: docling does not expose sheet count directly. The common + # case is one table per sheet, but multi-table sheets exist. Best + # available proxy: count unique pages — but docling for xlsx uses + # one page per sheet, so len(doc.pages) ≈ sheet count. 
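+    # (Illustration, docling-version dependent: a two-sheet workbook
+    # converts to two pages, so n_sheets == 2; when pages are missing we
+    # fall back to sheets=1 in the record below.)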
+ try: + n_sheets = len(doc.pages) if doc.pages else None + except Exception: + n_sheets = None + + return BenchmarkRecord( + file=path, + file_size_bytes=file_size, + parser=PARSER_NAME, + parser_version=_docling_version(), + status="ok", + error=None, + parse_time_ms=parse_time_ms, + peak_memory_mb=peak_mb, + sheets=n_sheets if n_sheets is not None else 1, + cells=n_cells, + formulas=None, # not modeled + formula_dependencies=None, # not modeled + charts=None, # not modeled for xlsx + chart_types=None, + tables=n_tables, + pivots=None, # not modeled + merges=n_merges, + cf_rules=None, # not modeled + dv_rules=None, # not modeled + named_ranges=None, # not modeled + hyperlinks=None, # not surfaced per-cell + images=None, # not modeled for xlsx + comments=None, # not modeled + sparklines=None, # not modeled + chunks=1, # single markdown blob — retrieval step re-chunks + token_count=max(len(markdown) // 4, 1), # crude estimate + schema_version=SCHEMA_VERSION, + timestamp=datetime.now(UTC).isoformat(), + harness_commit=commit, + extra={"markdown": markdown}, + ) + + +def _error_record(path: str, file_size: int, status: str, error: str, + parse_time_ms: float | None, peak_mb: float | None, + commit: str) -> BenchmarkRecord: + return BenchmarkRecord( + file=path, + file_size_bytes=file_size, + parser=PARSER_NAME, + parser_version=_docling_version(), + status=status, + error=(error or "")[:MAX_ERR_LEN] or None, + parse_time_ms=parse_time_ms, + peak_memory_mb=peak_mb, + sheets=None, + cells=None, + formulas=None, + formula_dependencies=None, + charts=None, + chart_types=None, + tables=None, + pivots=None, + merges=None, + cf_rules=None, + dv_rules=None, + named_ranges=None, + hyperlinks=None, + images=None, + comments=None, + sparklines=None, + chunks=None, + token_count=None, + schema_version=SCHEMA_VERSION, + timestamp=datetime.now(UTC).isoformat(), + harness_commit=commit, + ) + + +def main() -> int: + commit = os.environ.get("HARNESS_COMMIT", "") + converter = _build_converter() + _write({"event": "ready", "parser": PARSER_NAME, "version": _docling_version()}) + + for line in sys.stdin: + line = line.strip() + if not line: + continue + try: + msg = json.loads(line) + path = msg["path"] + except Exception as exc: + _write({"event": "error", "error": f"bad input line: {exc}"}) + continue + + try: + file_size = os.path.getsize(path) + except OSError: + file_size = 0 + + rss0 = peak_rss_mb() + t0 = time.perf_counter() + try: + result = converter.convert(path) + doc = result.document + md = doc.export_to_markdown() + t1 = time.perf_counter() + rss1 = peak_rss_mb() + rec = _count_features( + doc=doc, + file_size=file_size, + path=path, + parse_time_ms=(t1 - t0) * 1000.0, + peak_mb=max(rss1 - rss0, 0.0), + commit=commit, + markdown=md, + ) + except Exception as exc: # noqa: BLE001 + t1 = time.perf_counter() + rss1 = peak_rss_mb() + rec = _error_record( + path=path, + file_size=file_size, + status="error", + error=f"{type(exc).__name__}: {exc}", + parse_time_ms=(t1 - t0) * 1000.0, + peak_mb=max(rss1 - rss0, 0.0), + commit=commit, + ) + + sys.stdout.write(rec.to_json_line()) + sys.stdout.flush() + + _write({"event": "done"}) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/benchmarks/vs_hucre.py b/tests/benchmarks/vs_hucre.py index a507c54..24e0e50 100644 --- a/tests/benchmarks/vs_hucre.py +++ b/tests/benchmarks/vs_hucre.py @@ -28,7 +28,7 @@ from pathlib import Path from ._driver import enumerate_corpus, run_benchmark -from ._runner import hucre_runner, ks_runner +from 
._runner import docling_runner, hucre_runner, ks_runner def main(argv: list[str] | None = None) -> int: @@ -44,7 +44,8 @@ def main(argv: list[str] | None = None) -> int: parser.add_argument("--seed", type=int, default=1337) parser.add_argument("--per-file-timeout", type=float, default=120.0) parser.add_argument("--parsers", type=str, default="ks,hucre", - help="Comma-separated subset of parsers to run.") + help="Comma-separated subset of parsers to run " + "(supported: ks, hucre, docling).") parser.add_argument("--ks-python", type=str, default=sys.executable, help="Python binary for the ks adapter (default: current).") parser.add_argument("--batch-size", type=int, default=50, @@ -73,6 +74,9 @@ def main(argv: list[str] | None = None) -> int: r = hucre_runner(timeout_s=args.per_file_timeout) r.cfg.batch_size = args.batch_size runners["hucre"] = r + if "docling" in selected: + r = docling_runner(python_bin=args.ks_python, timeout_s=args.per_file_timeout) + runners["docling"] = r if not runners: sys.stderr.write("no parsers selected\n") return 2 diff --git a/tests/test_rendering.py b/tests/test_rendering.py index f2fab95..e24072a 100644 --- a/tests/test_rendering.py +++ b/tests/test_rendering.py @@ -104,22 +104,27 @@ def test_text_includes_range(self, simple_workbook): # Should include the A1-style range assert "!" in text # Sheet1!range format - def test_numeric_cells_use_scientific_notation_not_truncation(self): - """Long numeric values use scientific notation instead of truncating with ...""" + def test_numeric_cells_render_raw_not_display_formatted(self): + """Numeric cells render the raw value, ignoring Excel's display + formatting. This is intentional for RAG retrievability: a query + like "1272" should match the cell even if Excel displays it as + "1,272.00". The clobbered display format used to also trigger a + sci-notation fallback (``1.272000e+03``) once the ``[=]`` formula + marker pushed the rendered string past col_width — this test + guards against that regression.""" from models.sheet import SheetDTO from models.cell import CellDTO from models.common import CellCoord, CellRange from models.block import BlockDTO from models.common import BlockType - # Create a sheet with a numeric cell whose display_value would exceed column width. - # Column width is min(max_cell_len, 30), so we need a number that formats to >30 chars. coord = CellCoord(row=1, col=1) cell = CellDTO( coord=coord, sheet_name="Test", raw_value=0.002668, - display_value="0.002668000000000000000000000000", # 32 chars - would truncate + # Excel display would be e.g. "0.27%" or "0.002668000000000..." + display_value="0.002668000000000000000000000000", ) sheet = SheetDTO( sheet_name="Test", @@ -143,5 +148,7 @@ def test_numeric_cells_use_scientific_notation_not_truncation(self): renderer = TextRenderer(sheet) text = renderer.render_block(block) - # Number should appear in scientific notation (full precision) rather than truncated with … - assert "2.668000e-03" in text + # Raw value, sci-notation-free + assert "0.002668" in text + assert "e-03" not in text + assert "e+03" not in text diff --git a/uv.lock b/uv.lock index 2822699..caa5822 100644 --- a/uv.lock +++ b/uv.lock @@ -389,7 +389,7 @@ wheels = [ [[package]] name = "ks-xlsx-parser" -version = "0.1.1" +version = "0.2.0" source = { editable = "." } dependencies = [ { name = "lxml" },