diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
index 1c5778c..186f82d 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -56,10 +56,10 @@ body:
label: Traceback (if any)
render: shell
- type: checkboxes
- id: testbench
+ id: benchmark
attributes:
- label: testBench check
- description: Does your file already fail `make testbench`? If so, please note which group.
+ label: Benchmark check
+ description: Did you hit this failure while running `make bench-robust` (the SpreadsheetBench corpus)?
options:
- - label: "I ran `make testbench` and my file failed (attach `metrics/testbench/failures.json`)."
- - label: "The file is not in the bench; I can contribute it as a new fixture."
+ - label: "I ran `make bench-robust` and my file failed (attach the row from results.csv if you can)."
+ - label: "The file is from outside SpreadsheetBench; I can attach a minimal reproducer."
diff --git a/.github/ISSUE_TEMPLATE/parser_edge_case.yml b/.github/ISSUE_TEMPLATE/parser_edge_case.yml
index 3cb51a5..a4fb527 100644
--- a/.github/ISSUE_TEMPLATE/parser_edge_case.yml
+++ b/.github/ISSUE_TEMPLATE/parser_edge_case.yml
@@ -6,8 +6,8 @@ body:
- type: markdown
attributes:
value: |
- Every edge-case report ideally becomes a new fixture in `testBench/`. Bonus points
- for a minimal generator in `scripts/build_testbench.py`.
+ Every edge-case report ideally becomes a regression test. Bonus points
+ for a minimal `openpyxl` generator that reproduces it.
- type: textarea
id: pattern
attributes:
@@ -33,6 +33,6 @@ body:
attributes:
label: What would you like next?
options:
- - label: "Add it to `testBench/` as a new stress fixture."
+ - label: "Land it as a new regression test in `tests/`."
- label: "Open a PR fixing the parser."
- label: "Triage help — I'm stuck."
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 4aec668..6a0e550 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -6,7 +6,7 @@
- [ ] 🐞 Bug fix
- [ ] ✨ New feature
-- [ ] 🧪 Parser edge case / new `testBench/` fixture
+- [ ] 🧪 Parser edge case / new regression test
- [ ] 📚 Docs
- [ ] 🧹 Refactor / chore
- [ ] 🚀 Performance
@@ -14,7 +14,7 @@
## Checklist
- [ ] `make test` passes locally
-- [ ] `make testbench` still shows 1054/1054 (or the delta is explained below)
+- [ ] If parser/chunker internals changed: ran `make bench-robust` against SpreadsheetBench (call out any regressions below)
- [ ] Added/updated tests covering the change
- [ ] `ruff check` is clean
- [ ] Updated docs if user-facing behaviour changed
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a84e5e0..1168a01 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -50,34 +50,3 @@ jobs:
name: junit-${{ matrix.os }}-py${{ matrix.python-version }}
path: reports/junit.xml
if-no-files-found: ignore
-
- testbench:
- name: testBench round-trip (ubuntu / py3.12)
- runs-on: ubuntu-latest
- needs: test
- steps:
- - uses: actions/checkout@v4
- - uses: actions/setup-python@v5
- with:
- python-version: "3.12"
- cache: pip
- cache-dependency-path: pyproject.toml
-
- - name: Install
- run: |
- python -m pip install --upgrade pip
- pip install -e ".[dev,api]"
-
- - name: Build generated testBench
- run: make testbench-build
-
- - name: Run round-trip tests
- run: make testbench
-
- - name: Upload failure log
- if: always()
- uses: actions/upload-artifact@v4
- with:
- name: testbench-failures
- path: metrics/testbench/failures.json
- if-no-files-found: ignore
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index ee6a82f..5cf43b6 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -37,9 +37,6 @@ jobs:
- name: Build wheel + sdist
run: python -m build
- - name: Build testBench zip
- run: make testbench-zip
-
- name: Upload distribution artifacts
uses: actions/upload-artifact@v4
with:
@@ -47,7 +44,6 @@ jobs:
path: |
dist/*.whl
dist/*.tar.gz
- dist/testBench-v*.zip
github-release:
needs: build
@@ -81,7 +77,6 @@ jobs:
files: |
dist/*.whl
dist/*.tar.gz
- dist/testBench-v*.zip
body_path: ${{ steps.notes.outputs.path }}
generate_release_notes: ${{ steps.notes.outputs.auto == 'true' }}
@@ -97,9 +92,6 @@ jobs:
name: dist
path: dist
- - name: Strip non-PyPI artifacts
- run: rm -f dist/testBench-v*.zip
-
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
diff --git a/.gitignore b/.gitignore
index 67bad3c..6a13083 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,17 +51,12 @@ tests/fixtures/corpus/
# Corpus & metrics outputs
metrics/corpus/
metrics/corpus_summary.json
-metrics/testbench/
-# Generated stress test artifacts — the 1000-file bench is re-built on demand
-testBench/generated/
+# Generated stress test artifacts
examples/stress_test/stress_results.json
examples/stress_test/built_reference.json
examples/stress_test/STRESS_TEST_RESULTS.md
-# Packaged dataset (produced by `make testbench-zip`)
-dist/testBench*.zip
-
# Local benchmark harness (private, not pushed)
tests/benchmarks/reports/
tests/benchmarks/hucre_node/node_modules/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 934380f..682f088 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -4,12 +4,10 @@ repos:
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- exclude: "^testBench/"
- id: check-yaml
- id: check-toml
- id: check-added-large-files
- args: ["--maxkb=5120"] # 5 MB ceiling per file — testBench fixtures are larger, excluded below
- exclude: "^testBench/"
+ args: ["--maxkb=5120"]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.9
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 918d0d6..d7527b2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -45,7 +45,28 @@ Template for a new release (copy this block, fill in, move Unreleased items in):
## [Unreleased]
-Nothing yet. Open a PR and add your entry under the appropriate heading.
+### ⚠️ BREAKING
+- Retired the in-tree `testBench/` corpus. The 1054-workbook stress dataset
+ and `make testbench*` targets are gone — benchmarks now run against the
+ public SpreadsheetBench v0.1 corpus, downloaded on demand to `data/corpora/`
+ (gitignored). See `docs/corpora.md`.
+
+### Removed
+- `testBench/` directory and all bundled real-world / generated workbooks.
+- `make testbench-build`, `make testbench`, `make testbench-zip` targets.
+- `testbench` job in `.github/workflows/ci.yml`.
+- `testBench-vX.Y.Z.zip` release asset from the release workflow.
+- `tests/test_testbench_roundtrip.py`, `tests/test_enterprise_scoring.py`,
+ `tests/test_real_world_datasets.py`, `tests/test_cross_validation.py`.
+- `scripts/build_testbench.py`, `scripts/generate_enterprise_fixtures.py`.
+- `static_xlsx` pytest fixture (the test bench it iterated is gone).
+
+### Changed
+- README, wiki, examples, and contributor docs now point at SpreadsheetBench
+ (`make bench-robust` / `make bench-retrieval`) as the canonical benchmark.
+- `examples/demo.py` + `examples/generate_examples.py` now write/read fixtures
+ under `examples/fixtures/` instead of the (removed) `testBench/real_world/`.
+
## [0.2.0] — 2026-05-11
@@ -173,7 +194,7 @@ announcement: [`docs/launch/RELEASE_NOTES_v0.1.1.md`](docs/launch/RELEASE_NOTES_
### Performance
- Chunk builder caches `detect_circular_refs()` per workbook instead of
- re-running it per block. Real 21k-cell financial model (Walbridge):
+ re-running it per block. Real 21k-cell financial model:
**307 s → 4.6 s (66×)**.
- Sheet parser iterates openpyxl's `_cells` dict instead of `iter_rows()`
over the full bounding box. Workbooks with extreme sparse addresses
@@ -185,9 +206,8 @@ announcement: [`docs/launch/RELEASE_NOTES_v0.1.1.md`](docs/launch/RELEASE_NOTES_
non-existent `dxfId=0` in generated fixtures, so openpyxl can load them
back without an `IndexError`.
- `test_formula_cached_values_match` now applies a 15 % threshold for
- workbooks with known openpyxl `data_only` caching gaps (Walbridge),
- 5 % everywhere else. See
- [`docs/PARSER_KNOWN_ISSUES.md`](docs/PARSER_KNOWN_ISSUES.md).
+ workbooks with known openpyxl `data_only` caching gaps, 5 % everywhere
+ else. See [`docs/PARSER_KNOWN_ISSUES.md`](docs/PARSER_KNOWN_ISSUES.md).
### Docs
- New README positioned as *"Make XLSX LLM Ready"* with architecture
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a270bbf..c9bdabf 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -13,12 +13,12 @@ bug or send a small PR. If that's you, thank you.
## Ways to help (in order of preference for first-time contributors)
-1. **Run `make testbench` and report a file that breaks.** We actively want
- edge-case `.xlsx` fixtures — use the
+1. **Run `make bench-robust` on SpreadsheetBench and report a file that
+ breaks.** We actively want edge-case `.xlsx` fixtures — use the
[Parser edge case issue template](https://github.com/knowledgestack/ks-xlsx-parser/issues/new?template=parser_edge_case.yml).
-2. **Add a new workbook to `testBench/`.** Either drop a file under
- `testBench/stress/` or add a builder to `scripts/build_testbench.py`. If
- the parser crashes on it, even better.
+2. **Submit an adversarial workbook.** Attach a `.xlsx` (or a generator
+ that builds one) to a Parser edge case issue. If the parser crashes
+ on it, even better.
3. **Fix one of the flagged issues** in [`docs/PARSER_KNOWN_ISSUES.md`](docs/PARSER_KNOWN_ISSUES.md).
4. **Improve docs.** The README, the architecture diagram, the examples —
if something confused you, it confuses everyone.
@@ -32,8 +32,9 @@ git clone https://github.com/knowledgestack/ks-xlsx-parser.git
cd ks-xlsx-parser
make install # pip install -e ".[dev,api]"
make test # fast, default suite
-make testbench-build # regenerate 1000-file stress corpus (~1 min)
-make testbench # round-trip every workbook; parallel
+make corpus-download # fetch SpreadsheetBench (5,458 real-world xlsx)
+make bench-robust # parse-success + structural counts vs Docling
+make bench-retrieval # retrieval recall@k vs Docling
```
Prerequisites: Python 3.10+, `pip`, optionally `make`. We use `ruff` for
@@ -44,7 +45,8 @@ linting/formatting — install it with the `[dev]` extra.
Your PR should:
1. Have tests. `pytest` must stay green: `make test`.
-2. Keep `make testbench` at 1054/1054 (or explain the delta in the PR description).
+2. If touching parser or chunker internals, run `make bench-robust` against
+ SpreadsheetBench and call out any regressions in the PR description.
3. Pass `ruff check` (`make lint`) and be formatted with `make format`.
4. Include one sentence in the PR description that starts with *"This change…"*.
5. Use [conventional-commit style](https://www.conventionalcommits.org/)
@@ -74,7 +76,7 @@ Helpful things to include:
- Type hints everywhere that's practical.
- Tests live in `tests/`; programmatic workbook fixtures live in `tests/conftest.py`.
- Cross-validation against calamine uses the `crossval` marker.
-- Long-running bench tests use `@pytest.mark.testbench` and are skipped by default.
+- The benchmark harness (`tests/benchmarks/`) lives outside `pytest` — invoke via `make bench-robust` / `make bench-retrieval`.
- Keep public-API changes additive; if you can't, note it in the PR and the
maintainers will line up the deprecation.
diff --git a/Makefile b/Makefile
index d2642d6..9bedb5b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,25 +1,20 @@
-.PHONY: help install test test-ci testbench testbench-build testbench-zip lint format typecheck clean corpus-download bench-robust bench-retrieval bench
+.PHONY: help install test test-ci lint format typecheck clean corpus-download bench-robust bench-retrieval bench
PYTHON ?= python
PKG_VERSION := $(shell $(PYTHON) -c "import tomllib, pathlib; print(tomllib.loads(pathlib.Path('pyproject.toml').read_text())['project']['version'])")
-TESTBENCH_ZIP := dist/testBench-v$(PKG_VERSION).zip
help:
@echo "ks-xlsx-parser — common targets"
@echo ""
@echo " make install Install package and dev deps (editable)"
- @echo " make test Run the default test suite (skips corpus + testbench)"
+ @echo " make test Run the default test suite"
@echo " make test-ci Run the suite with verbose output for CI"
@echo ""
- @echo " make testbench-build Generate the 1000-file testBench dataset"
- @echo " make testbench Run parser round-trip across the full testBench"
- @echo " make testbench-zip Package testBench into $(TESTBENCH_ZIP) for GitHub release"
- @echo ""
@echo " make lint Ruff lint"
@echo " make format Ruff format"
@echo " make typecheck mypy"
@echo ""
- @echo " make corpus-download Fetch public XLSX corpora for extended robustness"
+ @echo " make corpus-download Fetch SpreadsheetBench for benchmark runs"
@echo ""
@echo " make bench-robust Robustness on SpreadsheetBench (ks vs docling, ~20 min)"
@echo " make bench-retrieval Retrieval recall on SpreadsheetBench (ks vs docling, ~40 min)"
@@ -34,23 +29,6 @@ test:
test-ci:
$(PYTHON) -m pytest tests/ -v --tb=short -W ignore::UserWarning --junitxml=reports/junit.xml
-testbench-build:
- $(PYTHON) scripts/build_testbench.py --clean
-
-testbench:
- @test -d testBench/generated || (echo "testBench/generated missing. Run 'make testbench-build' first." && exit 1)
- $(PYTHON) -m pytest tests/test_testbench_roundtrip.py -m testbench --tb=short -W ignore::UserWarning
-
-testbench-zip: testbench-build
- @mkdir -p dist
- @echo "→ packaging testBench into $(TESTBENCH_ZIP)"
- @rm -f $(TESTBENCH_ZIP)
- @cd . && zip -qr $(TESTBENCH_ZIP) testBench \
- -x "testBench/**/__pycache__/*" \
- -x "testBench/**/.DS_Store"
- @ls -lh $(TESTBENCH_ZIP)
- @echo "→ attach with: gh release create v$(PKG_VERSION) $(TESTBENCH_ZIP) --generate-notes"
-
lint:
$(PYTHON) -m ruff check src/ tests/ scripts/
diff --git a/README.md b/README.md
index 613270a..f39f718 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@
-
+
@@ -72,7 +72,7 @@ graph that drops straight into [LangChain](https://www.langchain.com/),
-
+
---
@@ -203,7 +203,8 @@ are all first-class ways to keep the lights on.
- 🙌 [Contribute](CONTRIBUTING.md) — every PR is reviewed; `good-first-issue` labels live on Issues.
- 🧰 [Knowledge Stack org](https://github.com/knowledgestack) — see the rest of the ecosystem (ks-cookbook, ks-xlsx-parser, more on the way).
-Not sure where to start? Run `make testbench`, find a file that breaks, open a
+Not sure where to start? Run `make bench-robust` on SpreadsheetBench, find a
+file that breaks, open a
[Parser edge case](https://github.com/knowledgestack/ks-xlsx-parser/issues/new?template=parser_edge_case.yml).
That's the fastest path to a merged PR.
@@ -250,7 +251,7 @@ That's it. Every chunk has:
- [📚 Documentation](#-documentation)
- [⚔️ How it compares](#️-how-it-compares)
- [🎯 Who this is for](#-who-this-is-for)
-- [🧪 The testBench dataset](#-the-testbench-dataset)
+- [📊 Benchmarks](#-benchmarks)
- [🚧 Limitations](#-limitations)
- [🧰 Knowledge Stack ecosystem](#-knowledge-stack-ecosystem)
- [📡 Stay in touch](#-stay-in-touch)
@@ -310,8 +311,9 @@ git clone https://github.com/knowledgestack/ks-xlsx-parser.git
cd ks-xlsx-parser
make install # pip install -e ".[dev,api]"
make test # default suite
-make testbench-build # generate the 1000-file stress corpus
-make testbench # round-trip every workbook through the parser
+make corpus-download # fetch SpreadsheetBench (5,458 real-world xlsx)
+make bench-robust # parse-success + structural counts vs Docling
+make bench-retrieval # retrieval recall@k vs Docling
```
Runtime deps: `openpyxl`, `pydantic`, `lxml`, `xxhash`, `tiktoken`.
@@ -361,7 +363,7 @@ Most tools give you a dataframe. `ks-xlsx-parser` gives you a **graph an LLM can
> Looking for a tiny, edge-runtime I/O library with write support? See
> [**`hucre`**](https://github.com/productdevbook/hucre) by
> [**@productdevbook**](https://github.com/productdevbook). For an unbiased
-> head-to-head on the 1053-workbook testBench corpus — perf numbers,
+> head-to-head on the historical 1053-workbook curated corpus — perf numbers,
> extraction-count parity, where each side wins — see the wiki:
> [**`ks-xlsx-parser` vs `hucre`**](docs/wiki/Benchmark-vs-hucre.md).
@@ -387,31 +389,21 @@ Teams shipping agents, RAG pipelines, or auditing tools that ingest Excel.
---
-## 🧪 The testBench dataset
+## 📊 Benchmarks
-A **1054-workbook stress corpus** ships under [`testBench/`](testBench/) and
-is round-tripped in CI on every commit. It's the easiest way to see whether
-the parser does the right thing on *your* kind of workbook.
+We benchmark against **SpreadsheetBench v0.1** — 912 instruction × xlsx tasks
+(5,458 unique workbooks) covering financial models, project trackers,
+HR records, scientific data, and a long tail of small business spreadsheets.
-| Group | Files | What it covers |
-|-------|------:|----------------|
-| `real_world/` | 8 | Real anonymised workbooks (financial, engineering, project tracking) |
-| `enterprise/` | 4 | Deterministic enterprise templates |
-| `github_datasets/` | 10 | Public datasets (iris, titanic, superstore, …) |
-| `stress/curated/` | 26 | 26 progressive stress levels authored by hand |
-| `stress/merges/` | 5 | Pathological merge patterns |
-| `generated/matrix/` | 297 | One feature per file across 18 categories |
-| `generated/combo/` | 400 | Deterministic feature cocktails (5 densities × 80 seeds) |
-| `generated/adversarial/` | 300 | Unicode bombs, circular refs, 32k-char cells, deep formula chains, sparse 1M-row sheets, 250-sheet workbooks |
+| Benchmark | What it measures | Cost |
+|---|---|---|
+| `make bench-robust` | Parse-success rate + structural counts vs Docling | ~20 min |
+| `make bench-retrieval` | Top-k retrieval recall + table fragmentation rate vs Docling | ~40 min |
-```bash
-make testbench-build # regenerate testBench/generated/ (~1 minute)
-make testbench # 1054/1054 in ~70 seconds
-make testbench-zip # package as dist/testBench-vX.Y.Z.zip for a GitHub release
-```
-
-The zipped dataset is attached to every [release](https://github.com/knowledgestack/ks-xlsx-parser/releases)
-— pull it if you don't want to clone the full repo.
+Headline numbers and methodology live in
+[`tests/benchmarks/reports/COMPARISON.md`](tests/benchmarks/reports/COMPARISON.md).
+The corpus is downloaded on demand (`make corpus-download`) and gitignored —
+nothing is committed to the repo.
---
@@ -461,10 +453,9 @@ or the [#showcase](https://discord.gg/4uaGhJcx) channel on Discord.
- 🐙 **[Follow @knowledgestack](https://github.com/knowledgestack)** on GitHub for new releases across the ecosystem.
- 📣 Watch this repo (→ *Releases only*) to get pinged when `ks-xlsx-parser` ships an update.
-If you'd rather just peek first — thousands of parsed workbooks live in the
-[testBench release](https://github.com/knowledgestack/ks-xlsx-parser/releases)
-as a single zip. Pull it, diff it, file an issue if your Excel does something
-weirder than ours.
+If you'd rather just peek first — run the benchmark suite against the
+public SpreadsheetBench corpus (`make corpus-download && make bench-robust`)
+and file an issue if your Excel does something weirder than ours.
---
@@ -472,12 +463,11 @@ weirder than ours.
We love contributions. Three paths, in order of speed-to-merge:
-1. **Report a testBench failure** — run `make testbench`, find a file that
- breaks, attach it to a
+1. **Report a benchmark failure** — run `make bench-robust` on SpreadsheetBench,
+ find a file that breaks, attach it to a
[Parser edge case issue](https://github.com/knowledgestack/ks-xlsx-parser/issues/new?template=parser_edge_case.yml).
-2. **Add a new adversarial workbook** — contribute a builder to
- `scripts/build_testbench.py`. Any file that makes the parser crash or
- lose information is welcome.
+2. **Submit an adversarial workbook** — open a Parser edge case issue with the
+ file attached; we'll fold it into the suite.
3. **Fix a flagged issue** — see [`docs/PARSER_KNOWN_ISSUES.md`](docs/PARSER_KNOWN_ISSUES.md).
Full dev loop, PR checklist, and code style in [`CONTRIBUTING.md`](CONTRIBUTING.md).
@@ -544,7 +534,7 @@ No. The library reads `.xlsx` files; it never executes them. VBA macros are flag
How fast is it?
-The full 1054-workbook testBench round-trips in ~70 s on a single machine. A real 21k-cell, 13-sheet financial model parses in ~4.6 s (down from 307 s pre-0.1.1 after a circular-ref caching fix). Sparse workbooks with extreme addresses parse in under 200 ms.
+SpreadsheetBench's full 5,458-workbook corpus parses end-to-end in roughly 20 minutes on a single machine (the P50 parse time is in the low double-digit milliseconds). A real 21k-cell, 13-sheet financial model parses in ~4.6 s (down from 307 s pre-0.1.1 after a circular-ref caching fix). Sparse workbooks with extreme addresses parse in under 200 ms.
diff --git a/docs/MAINTAINERS.md b/docs/MAINTAINERS.md
index a6cfbe6..6378cb2 100644
--- a/docs/MAINTAINERS.md
+++ b/docs/MAINTAINERS.md
@@ -30,7 +30,6 @@ Enable:
- `tests (ubuntu-latest / py3.11)`
- `tests (ubuntu-latest / py3.12)`
- `tests (macos-latest / py3.12)`
- - `testBench round-trip (ubuntu / py3.12)`
- ✅ Require branches to be up to date before merging
- ✅ Require conversation resolution before merging
- ✅ Require signed commits (soft lock — can relax if it slows contributors)
@@ -66,16 +65,15 @@ Create categories (click *New Category* for each):
- **🎯 Show and tell** (open) — projects built with ks-xlsx-parser
- Attach the template in `.github/DISCUSSION_TEMPLATE/show-and-tell.yml`
- **🙏 Q&A** (open, answerable) — usage and "does it handle X" questions
-- **🧪 testBench findings** (open) — edge cases that shouldn't be issues yet
+- **🧪 Benchmark findings** (open) — edge cases that shouldn't be issues yet
### Releases
Pushing a `vX.Y.Z` tag triggers `.github/workflows/release.yml` which will:
1. Build the wheel + sdist
-2. Build `dist/testBench-v.zip`
-3. Attach all three to the GitHub Release
-4. Publish to PyPI via Trusted Publishing
+2. Attach both to the GitHub Release
+3. Publish to PyPI via Trusted Publishing
One-time PyPI setup: go to PyPI → *your project* → *Publishing* → *Add a new
pending publisher* with:
@@ -97,8 +95,9 @@ without a human click.
line; update the compare-link footer at the bottom.
3. Regenerate the full release notes in `../docs/launch/RELEASE_NOTES_vX.Y.Z.md`
(copy from the previous release, edit for the new highlights).
-4. `make testbench` → expect 1054/1054.
-5. `make test` → clean.
+4. `make test` → clean.
+5. If touching parser internals, run `make bench-robust` against
+ SpreadsheetBench and confirm no regressions.
6. Commit with `chore(release): vX.Y.Z`.
7. `git tag -s vX.Y.Z -m "vX.Y.Z"` (signed tag; required by branch protection).
8. `git push && git push --tags` — the tag triggers the release workflow.
diff --git a/docs/PARSER_KNOWN_ISSUES.md b/docs/PARSER_KNOWN_ISSUES.md
index 7667475..e1f5029 100644
--- a/docs/PARSER_KNOWN_ISSUES.md
+++ b/docs/PARSER_KNOWN_ISSUES.md
@@ -38,11 +38,11 @@ promoted to the master cell.
## Documented Limitations (No Hard Fail)
-### `Walbridge Coatings 8.9.23.xlsx` — formula cached-value drift
+### Formula cached-value drift on dynamic-array / volatile formulas
-**Symptom**: ~11% of formula cells in this real-world workbook produce a
-different cached value than calamine reads. Hard failures are zero; parsing
-and serialization succeed end-to-end.
+**Symptom**: A small fraction of formula cells in some real-world workbooks
+produce a different cached value than calamine reads. Hard failures are zero;
+parsing and serialization succeed end-to-end.
**Root cause**: openpyxl's `data_only=True` reader does not always surface the
most recently written cached value for complex dynamic-array or volatile
@@ -50,10 +50,6 @@ formulas when the calc chain references across multiple sheets. This is an
openpyxl limitation, not an ks-xlsx-parser bug; calamine reads from the raw XML
and catches the newer values.
-**Current mitigation**: `tests/test_cross_validation.py::test_formula_cached_values_match`
-uses a 15% threshold for files in a `known_loose_files` set and the default
-5% threshold for everything else.
-
**Potential fixes** (tracked):
1. Read cached values directly from the OOXML XML instead of via openpyxl (like
we already do for empty merge masters).
diff --git a/docs/RELEASE_PROCESS.md b/docs/RELEASE_PROCESS.md
index e2b9717..14c89a8 100644
--- a/docs/RELEASE_PROCESS.md
+++ b/docs/RELEASE_PROCESS.md
@@ -1,6 +1,6 @@
# Release process
-This document is the **operational** companion to [`.github/workflows/release.yml`](../.github/workflows/release.yml). The workflow is tag-triggered (`v*.*.*`); pushing such a tag builds wheel + sdist, attaches a `testBench-vX.Y.Z.zip`, creates a GitHub Release, and publishes to PyPI. **All three actions are partially or fully irreversible** — PyPI in particular does not allow re-publishing a version. Run through this checklist before tagging.
+This document is the **operational** companion to [`.github/workflows/release.yml`](../.github/workflows/release.yml). The workflow is tag-triggered (`v*.*.*`); pushing such a tag builds wheel + sdist, creates a GitHub Release, and publishes to PyPI. **All three actions are partially or fully irreversible** — PyPI in particular does not allow re-publishing a version. Run through this checklist before tagging.
## One-time setup
@@ -53,7 +53,6 @@ gh api -X PUT repos/knowledgestack/ks-xlsx-parser/branches/main/protection \
-F 'required_status_checks[contexts][]=tests (macos-latest / py3.10)' \
-F 'required_status_checks[contexts][]=tests (macos-latest / py3.11)' \
-F 'required_status_checks[contexts][]=tests (macos-latest / py3.12)' \
- -F 'required_status_checks[contexts][]=testBench round-trip (ubuntu / py3.12)' \
-F enforce_admins=false \
-F required_pull_request_reviews[required_approving_review_count]=1 \
-F restrictions= 2>/dev/null
@@ -82,7 +81,7 @@ For every new version `X.Y.Z`:
8. **Watch the workflow.** https://github.com/knowledgestack/ks-xlsx-parser/actions — the `Release` workflow should run `build` → `github-release` → `pypi`. If the `pypi` job is gated on a reviewer, approve it in the Actions UI.
9. **Verify post-release:**
- PyPI: https://pypi.org/project/ks-xlsx-parser/X.Y.Z/ resolves and `pip install ks-xlsx-parser==X.Y.Z` works in a fresh venv.
- - GitHub Release: https://github.com/knowledgestack/ks-xlsx-parser/releases/tag/vX.Y.Z shows the release notes + wheel + sdist + `testBench-vX.Y.Z.zip`.
+ - GitHub Release: https://github.com/knowledgestack/ks-xlsx-parser/releases/tag/vX.Y.Z shows the release notes + wheel + sdist.
- The `[Unreleased]` heading at the top of `CHANGELOG.md` is reset to "Nothing yet" for the next cycle (manual; do this in a follow-up PR).
## Common failure modes
diff --git a/docs/corpora.md b/docs/corpora.md
index f04f3aa..0896e3e 100644
--- a/docs/corpora.md
+++ b/docs/corpora.md
@@ -1,36 +1,31 @@
# Corpus & Benchmarks
-The ks-xlsx-parser test bench is split into two tiers.
+ks-xlsx-parser benchmarks against public corpora that are downloaded on demand —
+nothing large is committed to the repo.
-## 1. `testBench/` — checked into the repo
+## Primary corpus — SpreadsheetBench v0.1
-A 1053-workbook corpus shipped with every clone, exercising the full extraction
-spec. Round-tripped on every CI run. See [`testBench/README.md`](../testBench/README.md)
-for the layout.
+912 instruction × xlsx tasks (5,458 unique workbooks) covering financial models,
+project trackers, HR records, scientific data, and a long tail of small-business
+spreadsheets. Each task ships with an `instruction`, a `data_position`, and
+(usually) an `answer_position`, which gives us ground truth for retrieval recall.
```bash
-make testbench-build # regenerate the 1000-file `generated/` subtree
-make testbench # parse every workbook, record failures to metrics/testbench/
-make testbench-zip # package as a GitHub release asset
+make corpus-download # fetch SpreadsheetBench + a few smaller corpora under data/corpora/
+make bench-robust # parse-success rate + structural counts vs Docling (~20 min)
+make bench-retrieval # top-k retrieval recall + table fragmentation rate vs Docling (~40 min)
```
-## 2. External public corpora — downloaded on demand
+Reports land in a per-run subdirectory of `tests/benchmarks/reports/`. The headline
+numbers and methodology live in
+[`tests/benchmarks/reports/COMPARISON.md`](../tests/benchmarks/reports/COMPARISON.md).
-Heavier public datasets (EUSES, Enron `.xlsx` subset, SheetJS/openpyxl samples)
-stay out of git and download under `tests/fixtures/corpus/`.
+## Other public corpora — opt-in robustness
-```bash
-make corpus-download # fetch external corpora
-python -m pytest -m corpus -v # opt-in robustness run
-```
-
-## Enterprise scorecard (runs by default)
+`scripts/download_corpora.sh` also fetches a handful of smaller xlsx corpora
+(EUSES, Enron `.xlsx` subset, SheetJS / openpyxl samples) under
+`data/corpora/`. These are useful for spot-checking specific failure modes.
```bash
-python -m pytest tests/test_enterprise_scoring.py -v
+python -m pytest -m corpus -v # opt-in robustness run against external corpora
```
-
-Four small deterministic fixtures under `testBench/enterprise/` are regenerated
-if missing by `scripts/generate_enterprise_fixtures.py`. Per-file scorecards
-are written to `metrics/corpus/`; git ignores the `metrics/` tree so CI can
-upload the artifacts without polluting history.
diff --git a/docs/launch/MEDIUM_ARTICLE.md b/docs/launch/MEDIUM_ARTICLE.md
index f882c6c..1b76ccb 100644
--- a/docs/launch/MEDIUM_ARTICLE.md
+++ b/docs/launch/MEDIUM_ARTICLE.md
@@ -106,7 +106,7 @@ Prepping the library for the public release, we hit two bottlenecks that are int
`detect_circular_refs()` on the dependency graph is O(V+E) with DFS + memoisation. Fine. But our chunk builder was calling it **once per chunk** inside `_build_dependency_summary()`, because every chunk's `has_circular` flag needed the global cycle set.
-On a small workbook: invisible. On a 13-sheet, 21k-cell real-world financial model (Walbridge Coatings, now our favourite regression fixture): **115 chunks × ~2.6 s each = 307 s of CPU.** The chunker was dominating the parse.
+On a small workbook: invisible. On a 13-sheet, 21k-cell real-world financial model: **115 chunks × ~2.6 s each = 307 s of CPU.** The chunker was dominating the parse.
The fix is almost embarrassing:
diff --git a/docs/launch/RELEASE_NOTES_v0.1.1.md b/docs/launch/RELEASE_NOTES_v0.1.1.md
index e5ef79c..dc9ccd4 100644
--- a/docs/launch/RELEASE_NOTES_v0.1.1.md
+++ b/docs/launch/RELEASE_NOTES_v0.1.1.md
@@ -24,8 +24,8 @@ ecosystem. Now open for the rest of the world.
asset attached to this release.
- ⚡ **Parser perf fixes** — real-world workbooks that used to hang now
finish in under a second.
- - Cached `detect_circular_refs()` per workbook: Walbridge Coatings
- **307 s → 4.6 s (66×)**.
+ - Cached `detect_circular_refs()` per workbook: real 21k-cell financial
+ model **307 s → 4.6 s (66×)**.
- Sparse-cell iteration: files with two non-empty cells at `A1` and
`XFD1048576` drop from 60 s timeout → **135 ms**.
- 🧰 **Framework-agnostic** — drops straight into
diff --git a/docs/wiki/Benchmark-vs-hucre.md b/docs/wiki/Benchmark-vs-hucre.md
index 3fe4f1f..246b1b9 100644
--- a/docs/wiki/Benchmark-vs-hucre.md
+++ b/docs/wiki/Benchmark-vs-hucre.md
@@ -24,7 +24,12 @@ Pick `ks-xlsx-parser` for Python LLM / RAG / auditing pipelines.
---
-## Performance — 1053-workbook testBench corpus
+## Performance — historical 1053-workbook curated corpus
+
+> *This page reflects the v0.1.x benchmark run on a curated stress corpus that
+> shipped with earlier releases. The current `main` branch benchmarks against
+> SpreadsheetBench (5,458 real-world workbooks); see
+> [COMPARISON.md](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/tests/benchmarks/reports/COMPARISON.md).*
Same machine, same run, same OS page cache. `parse_workbook(mode="fast")`
is the apples-to-apples configuration for hucre's read-only path (it skips
@@ -38,7 +43,7 @@ metadata feature hucre extracts).
| P99 parse time | **30.2 ms** | 469 ms | 246 ms |
| mean parse time | **2.7 ms** | 73.9 ms | 39.5 ms |
| total wall-clock | **2.8 s** | 77.8 s | 41.6 s |
-| Walbridge Coatings
(17.6k formulas, worst real-world file) | **139 ms** | 1413 ms | 686 ms |
+| Worst real-world file<br>(17.6k formulas) | **139 ms** | 1413 ms | 686 ms |
### Ratio to hucre
@@ -101,9 +106,9 @@ On every feature **both** parsers extract, the drift is zero or near-zero:
| comments | 486 | 486 | **0** |
| named ranges | 822 | 809 | 1.6% (tracked) |
-The 22-formula disagreement is dominated by one workbook
-(`real_world/Walbridge Coatings 8.9.23.xlsx`) where we parse 16 formulas
-that hucre misses — we surface this in the drift report, not hide it.
+The 22-formula disagreement is dominated by one real-world workbook where
+we parse 16 formulas that hucre misses — we surface this in the drift
+report, not hide it.
The cell-count difference on adversarial merge-heavy files (we emit ~50%
more rows) is a **methodology difference**: `ks-xlsx-parser` counts every
@@ -119,7 +124,7 @@ Every perf change in `ks-xlsx-parser` has to pass, in order:
1. The **1631-test pytest suite** (unit + integration + corpus-slice)
2. **Cross-validation** against [`calamine`](https://github.com/tafia/calamine) — the Rust reference parser — on a golden fixture set
-3. **Zero regressions** on the 1053-file testBench across eight sub-corpora (`real_world/`, `enterprise/`, `github_datasets/`, `stress/curated/`, `stress/merges/`, `generated/matrix/`, `generated/combo/`, `generated/adversarial/`)
+3. **Zero regressions** on the SpreadsheetBench robustness baseline (5,458 real-world workbooks)
4. **Feature-count stability** vs. the hucre benchmark above
That's the order. If a perf change breaks any gate, we don't ship it.
@@ -144,12 +149,16 @@ but the short version:
cd tests/benchmarks/hucre_node && pnpm install --frozen-lockfile
cd ../../..
+# Download SpreadsheetBench once
+make corpus-download
+
# Full mode (default)
-python -m tests.benchmarks.vs_hucre --corpus testBench --out tests/benchmarks/reports
+python -m tests.benchmarks.vs_hucre \
+ --corpus data/corpora/spreadsheetbench --out tests/benchmarks/reports
# Fast mode
KS_PARSE_MODE=fast python -m tests.benchmarks.vs_hucre \
- --corpus testBench --out tests/benchmarks/reports
+ --corpus data/corpora/spreadsheetbench --out tests/benchmarks/reports
```
Outputs (under `tests/benchmarks/reports/_/`):
diff --git a/docs/wiki/Home.md b/docs/wiki/Home.md
index 0285977..997a9a9 100644
--- a/docs/wiki/Home.md
+++ b/docs/wiki/Home.md
@@ -22,8 +22,8 @@ the front-page README so it stays scannable. The code-heavy stuff lives here.
together, and where to hook in if you want to extend the parser.
- **[Benchmark vs `hucre`](Benchmark-vs-hucre)** — unbiased head-to-head
against the [hucre](https://github.com/productdevbook/hucre) TypeScript
- engine on the 1053-workbook testBench corpus: perf, extraction-count
- parity, and where each tool wins.
+ engine: perf, extraction-count parity, and where each tool wins. Numbers are
+ from the historical 1053-workbook corpus; reruns target SpreadsheetBench.
## Related docs in the main repo
@@ -34,7 +34,7 @@ the front-page README so it stays scannable. The code-heavy stuff lives here.
- [`docs/PARSER_KNOWN_ISSUES.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/docs/PARSER_KNOWN_ISSUES.md) —
known edge cases and how we handle them.
- [`docs/corpora.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/docs/corpora.md) —
- the testBench stress corpus and public-corpus benchmarks.
+ public benchmark corpora (SpreadsheetBench, EUSES, Enron).
- [`CONTRIBUTING.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/CONTRIBUTING.md) —
dev loop, PR checklist, community channels.
- [`CHANGELOG.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/CHANGELOG.md) —
diff --git a/docs/wiki/Pipeline-Internals.md b/docs/wiki/Pipeline-Internals.md
index 8913c05..eb5dcf5 100644
--- a/docs/wiki/Pipeline-Internals.md
+++ b/docs/wiki/Pipeline-Internals.md
@@ -52,7 +52,7 @@ resolve references (cell / range / cross-sheet / table / external).
Circular-reference detection is O(V+E) DFS with memoisation at the
edge level. It's cached per workbook inside `ChunkBuilder` — running it
-per chunk is how Walbridge Coatings used to take 307 s.
+per chunk is how a real 21k-cell workbook used to take 307 s.
## 3. Annotate
@@ -129,7 +129,6 @@ parser writes the importer for you.
| Add a verification stage | `verification/stage_verifier.py` |
| Add a new DTO field | `models/*.py` (+ serializer + renderer) |
-When in doubt, write the test first — the
-[`testBench/`](https://github.com/knowledgestack/ks-xlsx-parser/tree/main/testBench)
-corpus is the fastest signal that a pipeline change didn't regress
-anything else.
+When in doubt, write the test first — the SpreadsheetBench benchmark
+(`make bench-robust`) is the fastest signal that a pipeline change didn't
+regress anything else.
diff --git a/examples/demo.py b/examples/demo.py
index ceeb75f..dbf6118 100644
--- a/examples/demo.py
+++ b/examples/demo.py
@@ -15,7 +15,7 @@
from xlsx_parser.pipeline import parse_workbook
from xlsx_parser.utils.logging_config import configure_logging
-EXAMPLES_DIR = Path(__file__).parent.parent / "testBench" / "real_world"
+EXAMPLES_DIR = Path(__file__).parent / "fixtures"
def demo_financial_model():
diff --git a/examples/generate_examples.py b/examples/generate_examples.py
index 9fecb1f..8d25e01 100644
--- a/examples/generate_examples.py
+++ b/examples/generate_examples.py
@@ -20,7 +20,7 @@
from openpyxl.worksheet.datavalidation import DataValidation
from openpyxl.worksheet.table import Table, TableStyleInfo
-EXAMPLES_DIR = Path(__file__).parent.parent / "testBench" / "real_world"
+EXAMPLES_DIR = Path(__file__).parent / "fixtures"
EXAMPLES_DIR.mkdir(parents=True, exist_ok=True)
diff --git a/pyproject.toml b/pyproject.toml
index 74f125e..a425853 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -75,10 +75,8 @@ markers = [
"invariant: structural invariant tests",
"corpus: external corpus tests (skipped by default)",
"slow: tests taking >10 seconds",
- "enterprise: enterprise scorecard benchmarks",
- "testbench: 1000-file testBench round-trip (skipped by default; run with -m testbench)",
]
-addopts = "-m 'not corpus and not testbench'"
+addopts = "-m 'not corpus'"
[tool.setuptools.packages.find]
where = ["src"]
@@ -86,7 +84,7 @@ where = ["src"]
[tool.ruff]
line-length = 110
target-version = "py310"
-extend-exclude = ["testBench", "examples/stress_test", "dist", "build"]
+extend-exclude = ["examples/stress_test", "dist", "build"]
[tool.ruff.lint]
select = [
diff --git a/scripts/build_testbench.py b/scripts/build_testbench.py
deleted file mode 100644
index 29f3535..0000000
--- a/scripts/build_testbench.py
+++ /dev/null
@@ -1,1667 +0,0 @@
-#!/usr/bin/env python3
-"""
-build_testbench.py — deterministic generator for the ks-xlsx-parser testBench.
-
-Produces ~1000 `.xlsx` workbooks under ``testBench/generated/`` organised into
-three groups:
-
-* ``matrix/`` — one feature-per-file across every knob the parser exercises
- (formulas, merges, named ranges, CF, DV, tables, charts,
- styles, dates, errors, hidden rows/cols, hyperlinks,
- comments, rich text, number formats, edge addresses,
- array formulas, 3D refs, pivot placeholders, huge sheet
- names).
-* ``combo/`` — randomised combinations of the above at five density
- levels (5/10/25/50/100 operations per file) × 80 seeds.
-* ``adversarial/`` — files engineered to break parsers: circular formulas,
- deep formula chains, 1M-row sparse sheets, 255-sheet
- workbooks, unicode/RTL/emoji stress, oversized merges,
- broken references, long formula strings.
-
-Usage
------
-
- python scripts/build_testbench.py # builds everything
- python scripts/build_testbench.py --force # regenerates even if present
- python scripts/build_testbench.py --group matrix
- python scripts/build_testbench.py --limit 50 # first 50 files only (smoke)
-
-The generator is fully deterministic: identical invocations produce
-byte-identical files (modulo openpyxl's own timestamping, which we neutralise).
-Every file is accompanied by one row in ``testBench/generated/MANIFEST.json``
-describing its group, feature tags, expected cell count, and SHA256.
-"""
-
-
-import argparse
-import hashlib
-import json
-import random
-import string
-import sys
-from collections.abc import Callable
-from dataclasses import dataclass, field
-from datetime import date, datetime, time
-from pathlib import Path
-
-from openpyxl import Workbook
-from openpyxl.chart import (
- AreaChart,
- BarChart,
- BubbleChart,
- LineChart,
- PieChart,
- RadarChart,
- Reference,
- ScatterChart,
-)
-from openpyxl.comments import Comment
-from openpyxl.formatting.rule import (
- CellIsRule,
- ColorScaleRule,
- DataBarRule,
- FormulaRule,
- IconSetRule,
- Rule,
-)
-from openpyxl.styles import (
- Alignment,
- Border,
- Font,
- PatternFill,
- Side,
-)
-from openpyxl.utils import get_column_letter
-from openpyxl.workbook.defined_name import DefinedName
-from openpyxl.worksheet.datavalidation import DataValidation
-from openpyxl.worksheet.table import Table, TableStyleInfo
-
-ROOT = Path(__file__).resolve().parent.parent
-OUT_ROOT = ROOT / "testBench" / "generated"
-MANIFEST_PATH = OUT_ROOT / "MANIFEST.json"
-
-# ----------------------------------------------------------------------------
-# Data classes
-# ----------------------------------------------------------------------------
-
-
-@dataclass
-class GeneratedFile:
- path: Path
- group: str
- features: list[str] = field(default_factory=list)
- expected_sheets: int = 1
- expected_cells: int = 0
- expected_formulas: int = 0
- notes: str = ""
-
- def to_manifest_row(self) -> dict:
- return {
- "path": str(self.path.relative_to(OUT_ROOT)),
- "group": self.group,
- "features": self.features,
- "expected_sheets": self.expected_sheets,
- "expected_cells": self.expected_cells,
- "expected_formulas": self.expected_formulas,
- "sha256": sha256_of(self.path),
- "size_bytes": self.path.stat().st_size,
- "notes": self.notes,
- }
-
-
-def sha256_of(path: Path) -> str:
- h = hashlib.sha256()
- with path.open("rb") as f:
- for chunk in iter(lambda: f.read(65536), b""):
- h.update(chunk)
- return h.hexdigest()
-
-
-def _finalize(wb: Workbook, out: Path) -> None:
- """Save workbook with deterministic metadata."""
- wb.properties.created = datetime(2025, 1, 1, 0, 0, 0)
- wb.properties.modified = datetime(2025, 1, 1, 0, 0, 0)
- wb.properties.creator = "ks-xlsx-parser testBench generator"
- wb.properties.title = out.stem
- out.parent.mkdir(parents=True, exist_ok=True)
- wb.save(out)
-
-
-# ----------------------------------------------------------------------------
-# Matrix group — one feature per file
-# ----------------------------------------------------------------------------
-
-
-MATRIX_DIR = OUT_ROOT / "matrix"
-
-
-def _matrix_path(slug: str) -> Path:
- return MATRIX_DIR / f"{slug}.xlsx"
-
-
-# --- formulas -------------------------------------------------------------
-
-FORMULA_RECIPES: list[tuple[str, str, str]] = [
- # (slug, label, formula expression — evaluated in B1 with constants in A1:A5)
- ("formula_sum", "SUM", "=SUM(A1:A5)"),
- ("formula_average", "AVERAGE", "=AVERAGE(A1:A5)"),
- ("formula_min_max", "MIN/MAX", "=MAX(A1:A5)-MIN(A1:A5)"),
- ("formula_count", "COUNT", "=COUNT(A1:A5)"),
- ("formula_counta", "COUNTA", "=COUNTA(A1:A5)"),
- ("formula_sumif", "SUMIF", "=SUMIF(A1:A5,\">2\")"),
- ("formula_sumifs", "SUMIFS", "=SUMIFS(A1:A5,A1:A5,\">1\",A1:A5,\"<5\")"),
- ("formula_countif", "COUNTIF", "=COUNTIF(A1:A5,\">2\")"),
- ("formula_countifs", "COUNTIFS", "=COUNTIFS(A1:A5,\">0\",A1:A5,\"<5\")"),
- ("formula_averageif", "AVERAGEIF", "=AVERAGEIF(A1:A5,\">1\")"),
- ("formula_if_basic", "IF", "=IF(A1>2,\"big\",\"small\")"),
- ("formula_if_nested", "nested IF", "=IF(A1>4,\"high\",IF(A1>2,\"mid\",\"low\"))"),
- ("formula_ifs", "IFS", "=IFS(A1>4,\"high\",A1>2,\"mid\",TRUE,\"low\")"),
- ("formula_ifna", "IFNA", "=IFNA(VLOOKUP(99,A1:B5,2,FALSE),\"missing\")"),
- ("formula_iferror", "IFERROR", "=IFERROR(1/0,\"err\")"),
- ("formula_and_or_not", "AND/OR/NOT", "=AND(A1>0,OR(A2>0,NOT(A3<0)))"),
- ("formula_concat", "CONCAT", "=CONCAT(A1,\"-\",A2)"),
- ("formula_textjoin", "TEXTJOIN", "=TEXTJOIN(\",\",TRUE,A1:A5)"),
- ("formula_left_right_mid", "LEFT/RIGHT/MID", "=LEFT(\"abcdef\",3)&RIGHT(\"abcdef\",2)&MID(\"abcdef\",3,2)"),
- ("formula_substitute", "SUBSTITUTE", "=SUBSTITUTE(\"foo-bar\",\"-\",\"_\")"),
- ("formula_find_search", "FIND/SEARCH", "=FIND(\"b\",\"foobar\")+SEARCH(\"B\",\"foobar\")"),
- ("formula_len_trim", "LEN/TRIM", "=LEN(TRIM(\" hi \"))"),
- ("formula_upper_lower_proper", "case fns", "=UPPER(\"a\")&LOWER(\"B\")&PROPER(\"hello world\")"),
- ("formula_round_roundup_rounddown", "ROUND*", "=ROUND(A1,1)+ROUNDUP(A1,0)+ROUNDDOWN(A1,0)"),
- ("formula_int_mod", "INT/MOD", "=INT(A1)+MOD(A1,2)"),
- ("formula_abs_sign", "ABS/SIGN", "=ABS(-5)+SIGN(A1)"),
- ("formula_sqrt_power", "SQRT/POWER", "=SQRT(16)+POWER(A1,2)"),
- ("formula_log_ln_exp", "LOG/LN/EXP", "=LOG(10)+LN(EXP(1))"),
- ("formula_date_functions", "DATE fns", "=YEAR(TODAY())+MONTH(TODAY())+DAY(TODAY())"),
- ("formula_datedif", "DATEDIF", "=DATEDIF(DATE(2020,1,1),DATE(2025,1,1),\"Y\")"),
- ("formula_edate_eomonth", "EDATE/EOMONTH", "=EDATE(DATE(2020,1,1),12)+EOMONTH(DATE(2020,1,1),3)"),
- ("formula_weekday_workday", "WEEKDAY/WORKDAY", "=WEEKDAY(TODAY())+WORKDAY(TODAY(),5)"),
- ("formula_vlookup", "VLOOKUP", "=VLOOKUP(A1,A1:B5,2,FALSE)"),
- ("formula_hlookup", "HLOOKUP", "=HLOOKUP(A1,A1:E2,2,FALSE)"),
- ("formula_xlookup", "XLOOKUP", "=XLOOKUP(A1,A1:A5,B1:B5,\"not found\")"),
- ("formula_index_match", "INDEX/MATCH", "=INDEX(A1:A5,MATCH(A2,A1:A5,0))"),
- ("formula_offset", "OFFSET", "=OFFSET(A1,2,0)"),
- ("formula_indirect", "INDIRECT", "=INDIRECT(\"A\"&2)"),
- ("formula_rank", "RANK", "=RANK(A1,A1:A5,0)"),
- ("formula_large_small", "LARGE/SMALL", "=LARGE(A1:A5,2)+SMALL(A1:A5,2)"),
- ("formula_choose", "CHOOSE", "=CHOOSE(2,\"a\",\"b\",\"c\")"),
- ("formula_switch", "SWITCH", "=SWITCH(A1,1,\"one\",2,\"two\",\"other\")"),
- ("formula_array_cse", "array CSE", "{=SUM(A1:A5*A1:A5)}"),
- ("formula_long", "8000-char expression", "=" + "+".join(f"A{((i % 5) + 1)}" for i in range(400))),
-]
-
-
-def build_formula_files() -> list[GeneratedFile]:
- files: list[GeneratedFile] = []
- for slug, label, formula in FORMULA_RECIPES:
- wb = Workbook()
- ws = wb.active
- ws.title = "Formula"
- for i in range(1, 6):
- ws.cell(row=i, column=1, value=i * 1.5)
- ws["B1"] = formula
- ws["D1"] = f"Test: {label}"
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(
- GeneratedFile(
- path=out,
- group="matrix/formula",
- features=["formula", slug.replace("formula_", "")],
- expected_cells=7,
- expected_formulas=1,
- )
- )
- return files
-
-
-# --- merged cells ---------------------------------------------------------
-
-
-def build_merge_files() -> list[GeneratedFile]:
- files: list[GeneratedFile] = []
- recipes = [
- ("merge_horizontal_small", [("A1:C1",)]),
- ("merge_horizontal_wide", [(f"A1:{get_column_letter(20)}1",)]),
- ("merge_vertical_small", [("A1:A5",)]),
- ("merge_vertical_tall", [("A1:A100",)]),
- ("merge_rectangular", [("A1:E5",)]),
- ("merge_many_horizontal", [(f"A{r}:C{r}",) for r in range(1, 51)]),
- ("merge_many_vertical", [(f"{get_column_letter(c)}1:{get_column_letter(c)}30",) for c in range(1, 11)]),
- ("merge_grid_5x5", [(f"{get_column_letter(2*c-1)}{2*r-1}:{get_column_letter(2*c)}{2*r}",) for r in range(1, 6) for c in range(1, 6)]),
- ("merge_diagonal_steps", [(f"{get_column_letter(2*i-1)}{2*i-1}:{get_column_letter(2*i)}{2*i}",) for i in range(1, 8)]),
- ("merge_header_3_levels", [("A1:F1",), ("A2:C2",), ("D2:F2",), ("A3:B3",), ("C3:C3",), ("D3:E3",), ("F3:F3",)]),
- ("merge_with_value_only_in_master", [("A1:C3",)]),
- ("merge_around_data", [("A1:C1",), ("A5:C5",)]),
- ("merge_single_cell_noop", [("A1:A1",)]), # degenerate
- ("merge_adjacent_row_pair", [("A1:B1",), ("A2:B2",)]),
- ("merge_wide_header_narrow_data", [("A1:J1",)]),
- ("merge_mixed_sizes", [("A1:B2",), ("C1:E1",), ("A4:A10",), ("D4:F6",)]),
- ("merge_100_singletons", [(f"{get_column_letter(((i-1) % 20)+1)}{((i-1)//20)+1}:{get_column_letter(((i-1) % 20)+1)}{((i-1)//20)+1}",) for i in range(1, 101)]),
- ("merge_full_row", [("A1:Z1",)]),
- ("merge_full_column_short", [("A1:A50",)]),
- ("merge_nonadjacent_blocks", [("A1:C3",), ("F1:H3",), ("A5:C7",), ("F5:H7",)]),
- ("merge_within_table_header", [("A1:D1",)]), # we'll add a table below
- ("merge_empty_range", [("B2:D4",)]), # no data in master
- ("merge_unicode_content", [("A1:C1",)]),
- ("merge_with_rich_formatting", [("A1:C1",)]),
- ("merge_column_header_stack", [("A1:A2",), ("B1:B2",), ("C1:C2",)]),
- ("merge_report_grid", [("A1:D1",), ("A2:A10",), ("B2:D2",), ("B3:B10",), ("C3:D3",)]),
- ("merge_large_single", [("A1:Z100",)]),
- ("merge_thousand_cells", [("A1:J100",)]),
- ("merge_within_table_footer", [("A11:D11",)]),
- ("merge_spanning_formula_range", [("A1:C1",)]),
- ]
- for slug, ranges in recipes:
- wb = Workbook()
- ws = wb.active
- ws.title = "Merges"
- for i, (rng,) in enumerate(ranges):
- anchor = rng.split(":")[0]
- try:
- ws[anchor] = f"m{i+1}" # must write before merging; skip if cell is already merged
- except AttributeError:
- pass
- try:
- ws.merge_cells(rng)
- except Exception:
- pass
- if slug == "merge_with_value_only_in_master":
- ws["A1"] = "only-master"
- if slug == "merge_within_table_header":
- for c, h in enumerate(["a", "b", "c", "d"], 1):
- ws.cell(row=2, column=c, value=h)
- for r in range(3, 8):
- for c in range(1, 5):
- ws.cell(row=r, column=c, value=r * c)
- ws.add_table(Table(displayName="T1", ref="A2:D7"))
- if slug == "merge_unicode_content":
- ws["A1"] = "éñÜ日本語 🚀 حرف"
- if slug == "merge_with_rich_formatting":
- ws["A1"].font = Font(bold=True, size=14, color="FF0000")
- ws["A1"].fill = PatternFill("solid", start_color="FFFF00")
- ws["A1"].alignment = Alignment(horizontal="center", vertical="center")
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(
- GeneratedFile(
- path=out,
- group="matrix/merge",
- features=["merged_cells", slug],
- expected_cells=len(ranges),
- )
- )
- return files
-
-
-# --- named ranges ---------------------------------------------------------
-
-
-def build_named_range_files() -> list[GeneratedFile]:
- files: list[GeneratedFile] = []
- recipes = [
- ("named_workbook_scope", "Total", "Sheet1!$A$1", None),
- ("named_sheet_scope", "SheetLocal", "Sheet1!$B$1", "Sheet1"),
- ("named_constant", "TaxRate", "0.07", None),
- ("named_range_multi_cell", "Prices", "Sheet1!$A$1:$A$10", None),
- ("named_formula", "Doubled", "Sheet1!$A$1*2", None),
- ("named_with_unicode", "Mẹtá", "Sheet1!$A$1", None),
- ("named_long_identifier", "very_long_identifier_" + "x" * 50, "Sheet1!$A$1", None),
- ("named_escaped_sheet", "Quoted", "'Sheet 2'!$A$1", None), # needs 'Sheet 2'
- ("named_external_like", "ExternalLike", "[Budget.xlsx]Sheet1!$A$1", None),
- ("named_list_variation", "ChoiceList", "Sheet1!$D$1:$D$5", None),
- ("named_col_range", "FullColumn", "Sheet1!$A:$A", None),
- ("named_row_range", "FullRow", "Sheet1!$1:$1", None),
- ("named_cross_sheet", "CrossRef", "Other!$A$1", None), # needs Other sheet
- ("named_multi_area", "Islands", "Sheet1!$A$1,Sheet1!$C$3", None),
- ("named_with_hash_prefix", "_Prefix", "Sheet1!$A$1", None),
- ("named_digits", "X1", "Sheet1!$A$1", None),
- ("named_empty_formula_error", "ErrRef", "#REF!", None),
- ("named_boolean_constant", "IsOn", "TRUE", None),
- ("named_string_constant", "Greeting", '"hello"', None),
- ("named_table_column_ref", "TableCol", "Table1[Value]", None), # needs table
- ]
- for slug, name, ref, scope in recipes:
- wb = Workbook()
- ws = wb.active
- ws.title = "Sheet1"
- for i in range(1, 11):
- ws.cell(row=i, column=1, value=i)
- ws.cell(row=i, column=4, value=f"item{i}")
- if scope == "Sheet1":
- ws.defined_names.add(DefinedName(name, attr_text=ref))
- elif slug == "named_escaped_sheet":
- wb.create_sheet("Sheet 2")["A1"] = 42
- wb.defined_names.add(DefinedName(name, attr_text=ref))
- elif slug == "named_cross_sheet":
- wb.create_sheet("Other")["A1"] = 99
- wb.defined_names.add(DefinedName(name, attr_text=ref))
- elif slug == "named_table_column_ref":
- for c, h in enumerate(["ID", "Value"], 1):
- ws.cell(row=1, column=c, value=h)
- for r in range(2, 6):
- ws.cell(row=r, column=1, value=r)
- ws.cell(row=r, column=2, value=r * 10)
- ws.add_table(Table(displayName="Table1", ref="A1:B5"))
- wb.defined_names.add(DefinedName(name, attr_text=ref))
- else:
- wb.defined_names.add(DefinedName(name, attr_text=ref))
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(
- GeneratedFile(
- path=out,
- group="matrix/named_range",
- features=["named_range", slug],
- expected_cells=14,
- )
- )
- return files
-
-
-# --- data validation ------------------------------------------------------
-
-
-def build_data_validation_files() -> list[GeneratedFile]:
- files: list[GeneratedFile] = []
- recipes = [
- ("dv_list_literal", {"type": "list", "formula1": '"Red,Green,Blue"'}),
- ("dv_list_range", {"type": "list", "formula1": "=$D$1:$D$5"}),
- ("dv_whole_between", {"type": "whole", "operator": "between", "formula1": "1", "formula2": "100"}),
- ("dv_decimal_gt", {"type": "decimal", "operator": "greaterThan", "formula1": "0.5"}),
- ("dv_date_after", {"type": "date", "operator": "greaterThan", "formula1": "DATE(2024,1,1)"}),
- ("dv_time_before", {"type": "time", "operator": "lessThan", "formula1": "TIME(12,0,0)"}),
- ("dv_textlength", {"type": "textLength", "operator": "lessThan", "formula1": "10"}),
- ("dv_custom", {"type": "custom", "formula1": "=A1>0"}),
- ("dv_list_unicode", {"type": "list", "formula1": '"红,绿,蓝"'}),
- ("dv_list_one_item", {"type": "list", "formula1": '"Only"'}),
- ("dv_list_many_items", {"type": "list", "formula1": '"' + ",".join(f"opt{i}" for i in range(1, 31)) + '"'}),
- ("dv_with_error_message", {"type": "list", "formula1": '"A,B"', "error": "pick A or B", "errorTitle": "Err"}),
- ("dv_with_prompt", {"type": "list", "formula1": '"A,B"', "prompt": "select letter", "promptTitle": "Hint"}),
- ("dv_ignore_blank", {"type": "list", "formula1": '"A,B"', "allowBlank": True}),
- ("dv_multiple_ranges", {"type": "list", "formula1": '"A,B"'}), # will apply to multiple ranges
- ("dv_whole_equal", {"type": "whole", "operator": "equal", "formula1": "42"}),
- ("dv_date_between", {"type": "date", "operator": "between", "formula1": "DATE(2020,1,1)", "formula2": "DATE(2025,12,31)"}),
- ("dv_decimal_not_between", {"type": "decimal", "operator": "notBetween", "formula1": "0", "formula2": "1"}),
- ("dv_textlength_greater", {"type": "textLength", "operator": "greaterThan", "formula1": "3"}),
- ("dv_custom_cross_cell", {"type": "custom", "formula1": "=AND(A1>0,B1<100)"}),
- ]
- for slug, kwargs in recipes:
- wb = Workbook()
- ws = wb.active
- ws.title = "DV"
- for r in range(1, 6):
- ws.cell(row=r, column=4, value=f"Option{r}")
- dv_kwargs = {k: v for k, v in kwargs.items() if k not in {"error", "errorTitle", "prompt", "promptTitle", "allowBlank"}}
- dv = DataValidation(**dv_kwargs)
- if "error" in kwargs:
- dv.error = kwargs["error"]
- dv.errorTitle = kwargs.get("errorTitle", "Err")
- dv.showErrorMessage = True
- if "prompt" in kwargs:
- dv.prompt = kwargs["prompt"]
- dv.promptTitle = kwargs.get("promptTitle", "Hint")
- dv.showInputMessage = True
- if kwargs.get("allowBlank"):
- dv.allowBlank = True
- ws.add_data_validation(dv)
- if slug == "dv_multiple_ranges":
- dv.add("A1:A5")
- dv.add("C1:C5")
- else:
- dv.add("A1:A10")
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(
- GeneratedFile(
- path=out,
- group="matrix/data_validation",
- features=["data_validation", slug],
- expected_cells=5,
- )
- )
- return files
-
-
-# --- conditional formatting -----------------------------------------------
-
-
-def build_conditional_formatting_files() -> list[GeneratedFile]:
- files: list[GeneratedFile] = []
-
- def _seed_ws(ws):
- for r in range(1, 11):
- ws.cell(row=r, column=1, value=r)
- ws.cell(row=r, column=2, value=11 - r)
- ws.cell(row=r, column=3, value=(r * 7) % 10)
-
- recipes: list[tuple[str, Callable[[object], None]]] = [
- ("cf_cellis_greater", lambda ws: ws.conditional_formatting.add(
- "A1:A10",
- CellIsRule(operator="greaterThan", formula=["5"], fill=PatternFill("solid", start_color="FFC7CE")),
- )),
- ("cf_cellis_less", lambda ws: ws.conditional_formatting.add(
- "A1:A10",
- CellIsRule(operator="lessThan", formula=["3"], fill=PatternFill("solid", start_color="C6EFCE")),
- )),
- ("cf_cellis_between", lambda ws: ws.conditional_formatting.add(
- "A1:A10",
- CellIsRule(operator="between", formula=["3", "7"], fill=PatternFill("solid", start_color="FFEB9C")),
- )),
- ("cf_color_scale_2", lambda ws: ws.conditional_formatting.add(
- "A1:A10",
- ColorScaleRule(start_type="min", start_color="FFAA0000",
- end_type="max", end_color="FF00AA00"),
- )),
- ("cf_color_scale_3", lambda ws: ws.conditional_formatting.add(
- "B1:B10",
- ColorScaleRule(start_type="min", start_color="FFAA0000",
- mid_type="percentile", mid_value=50, mid_color="FFFFFFFF",
- end_type="max", end_color="FF00AA00"),
- )),
- ("cf_databar", lambda ws: ws.conditional_formatting.add(
- "C1:C10",
- DataBarRule(start_type="min", end_type="max", color="FF638EC6"),
- )),
- ("cf_iconset_3traffic", lambda ws: ws.conditional_formatting.add(
- "A1:A10",
- IconSetRule("3TrafficLights1", "percent", [0, 33, 67]),
- )),
- ("cf_iconset_5arrows", lambda ws: ws.conditional_formatting.add(
- "B1:B10",
- IconSetRule("5Arrows", "percent", [0, 20, 40, 60, 80]),
- )),
- ("cf_formula_rule", lambda ws: ws.conditional_formatting.add(
- "A1:A10",
- FormulaRule(formula=["MOD(ROW(),2)=0"], fill=PatternFill("solid", start_color="DDDDDD")),
- )),
- # Note: omit dxfId; openpyxl cannot round-trip Rule(dxfId=0) unless
- # the differential style table has a matching entry.
- ("cf_top10", lambda ws: ws.conditional_formatting.add(
- "A1:C10", Rule(type="top10", rank=3),
- )),
- ("cf_unique_values", lambda ws: ws.conditional_formatting.add(
- "A1:A10", Rule(type="uniqueValues"),
- )),
- ("cf_duplicate_values", lambda ws: ws.conditional_formatting.add(
- "A1:A10", Rule(type="duplicateValues"),
- )),
- ("cf_contains_text", lambda ws: ws.conditional_formatting.add(
- "A1:A10", Rule(type="containsText", operator="containsText", text="5"),
- )),
- ("cf_above_average", lambda ws: ws.conditional_formatting.add(
- "A1:A10", Rule(type="aboveAverage", aboveAverage=True),
- )),
- ("cf_below_average", lambda ws: ws.conditional_formatting.add(
- "A1:A10", Rule(type="aboveAverage", aboveAverage=False),
- )),
- ("cf_multiple_rules_same_range", lambda ws: (
- ws.conditional_formatting.add("A1:A10", CellIsRule(operator="greaterThan", formula=["7"], fill=PatternFill("solid", start_color="FF0000"))),
- ws.conditional_formatting.add("A1:A10", CellIsRule(operator="lessThan", formula=["3"], fill=PatternFill("solid", start_color="00FF00"))),
- )),
- ("cf_overlapping_ranges", lambda ws: (
- ws.conditional_formatting.add("A1:B5", ColorScaleRule(start_type="min", start_color="FFFF0000", end_type="max", end_color="FF00FF00")),
- ws.conditional_formatting.add("B3:C10", DataBarRule(start_type="min", end_type="max", color="FF0000FF")),
- )),
- ("cf_single_cell", lambda ws: ws.conditional_formatting.add(
- "A1", CellIsRule(operator="equal", formula=["1"], fill=PatternFill("solid", start_color="FFFF00")),
- )),
- ("cf_large_range", lambda ws: ws.conditional_formatting.add(
- "A1:Z100", CellIsRule(operator="greaterThan", formula=["0"], fill=PatternFill("solid", start_color="EEEEEE")),
- )),
- ("cf_entire_column", lambda ws: ws.conditional_formatting.add(
- "A1:A1048576", CellIsRule(operator="greaterThan", formula=["5"], fill=PatternFill("solid", start_color="FFC7CE")),
- )),
- ("cf_formula_complex", lambda ws: ws.conditional_formatting.add(
- "A1:A10",
- FormulaRule(formula=["AND(A1>3,A1<8)"], fill=PatternFill("solid", start_color="99FF99")),
- )),
- ("cf_iconset_3signs", lambda ws: ws.conditional_formatting.add(
- "A1:A10",
- IconSetRule("3Signs", "percent", [0, 33, 67]),
- )),
- ("cf_iconset_4ratings", lambda ws: ws.conditional_formatting.add(
- "A1:A10",
- IconSetRule("4Rating", "percent", [0, 25, 50, 75]),
- )),
- ("cf_color_scale_percentile", lambda ws: ws.conditional_formatting.add(
- "A1:A10",
- ColorScaleRule(start_type="percentile", start_value=10, start_color="FF0000FF",
- end_type="percentile", end_value=90, end_color="FFFF0000"),
- )),
- ("cf_databar_negative", lambda ws: ws.conditional_formatting.add(
- "C1:C10",
- DataBarRule(start_type="min", end_type="max", color="FFFF0000", showValue=False),
- )),
- ]
-
- for slug, apply in recipes:
- wb = Workbook()
- ws = wb.active
- ws.title = "CF"
- _seed_ws(ws)
- apply(ws)
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(
- GeneratedFile(
- path=out,
- group="matrix/conditional_formatting",
- features=["conditional_formatting", slug],
- expected_cells=30,
- )
- )
- return files
-
-
-# --- tables ---------------------------------------------------------------
-
-
-def build_table_files() -> list[GeneratedFile]:
- files: list[GeneratedFile] = []
- for idx, (rows, cols, style, totals) in enumerate([
- (3, 2, "TableStyleLight1", False),
- (10, 3, "TableStyleMedium2", False),
- (50, 5, "TableStyleMedium9", True),
- (100, 8, "TableStyleDark1", False),
- (5, 20, "TableStyleLight9", False),
- (30, 4, "TableStyleMedium1", True),
- (3, 1, "TableStyleLight5", False),
- (3, 26, "TableStyleMedium3", False),
- (3, 2, None, False),
- (10, 3, "TableStyleMedium4", True),
- (200, 6, "TableStyleMedium5", False),
- (3, 2, "TableStyleLight13", False),
- (3, 2, "TableStyleLight14", False),
- (3, 2, "TableStyleLight15", False),
- (3, 2, "TableStyleLight16", False),
- (3, 2, "TableStyleLight17", False),
- (3, 2, "TableStyleLight18", False),
- (3, 2, "TableStyleLight19", False),
- (3, 2, "TableStyleLight20", False),
- (3, 2, "TableStyleLight21", False),
- ]):
- slug = f"table_{idx:02d}_{rows}r_{cols}c"
- wb = Workbook()
- ws = wb.active
- ws.title = "Table"
- for c in range(1, cols + 1):
- ws.cell(row=1, column=c, value=f"H{c}")
- for r in range(2, rows + 2):
- for c in range(1, cols + 1):
- ws.cell(row=r, column=c, value=(r + c) % 97)
- ref = f"A1:{get_column_letter(cols)}{rows + 1}"
- tab = Table(displayName=f"Tbl{idx}", ref=ref)
- if style:
- tab.tableStyleInfo = TableStyleInfo(name=style, showRowStripes=True)
- if totals:
- tab.totalsRowShown = False # openpyxl can be finicky about totals; keep simple
- ws.add_table(tab)
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(
- GeneratedFile(
- path=out,
- group="matrix/table",
- features=["table", f"{rows}r{cols}c"],
- expected_cells=(rows + 1) * cols,
- )
- )
- return files
-
-
-# --- charts ---------------------------------------------------------------
-
-
-def build_chart_files() -> list[GeneratedFile]:
- files: list[GeneratedFile] = []
- chart_types = [
- ("chart_bar", BarChart, {"type": "col"}),
- ("chart_bar_stacked", BarChart, {"type": "col", "grouping": "stacked", "overlap": 100}),
- ("chart_bar_horizontal", BarChart, {"type": "bar"}),
- ("chart_line", LineChart, {}),
- ("chart_pie", PieChart, {}),
- ("chart_area", AreaChart, {}),
- ("chart_radar", RadarChart, {}),
- ("chart_scatter", ScatterChart, {}),
- ("chart_bubble", BubbleChart, {}),
- ("chart_with_title", BarChart, {"title": "Q1 Sales"}),
- ("chart_no_title", BarChart, {}),
- ("chart_many_series", BarChart, {"series_count": 6}),
- ("chart_one_datapoint", BarChart, {"rows": 2}),
- ("chart_long_labels", BarChart, {"long_labels": True}),
- ("chart_unicode_labels", BarChart, {"unicode": True}),
- ("chart_two_charts_one_sheet", BarChart, {"double": True}),
- ("chart_chart_plus_table", BarChart, {"with_table": True}),
- ("chart_line_dashed", LineChart, {"smooth": True}),
- ("chart_pie_exploded", PieChart, {}),
- ("chart_scatter_with_lines", ScatterChart, {"scatterStyle": "lineMarker"}),
- ]
- for slug, ChartCls, opts in chart_types:
- wb = Workbook()
- ws = wb.active
- ws.title = "Data"
- rows = opts.pop("rows", 6)
- series_count = opts.pop("series_count", 2)
- long_labels = opts.pop("long_labels", False)
- unicode_flag = opts.pop("unicode", False)
- double = opts.pop("double", False)
- with_table = opts.pop("with_table", False)
-
- ws.cell(row=1, column=1, value="Label")
- for s in range(1, series_count + 1):
- ws.cell(row=1, column=1 + s, value=f"Series{s}")
- for r in range(2, rows + 1):
- label = f"Item{r-1}"
- if long_labels:
- label = "A very long label " * 5 + str(r)
- if unicode_flag:
- label = f"标签{r} 🚀"
- ws.cell(row=r, column=1, value=label)
- for s in range(1, series_count + 1):
- ws.cell(row=r, column=1 + s, value=((r * s * 7) % 50) + 1)
-
- chart = ChartCls()
- for k, v in opts.items():
- try:
- setattr(chart, k, v)
- except Exception:
- pass
- data = Reference(ws, min_col=2, min_row=1, max_col=1 + series_count, max_row=rows)
- cats = Reference(ws, min_col=1, min_row=2, max_row=rows)
- chart.add_data(data, titles_from_data=True)
- try:
- chart.set_categories(cats)
- except Exception:
- pass
- ws.add_chart(chart, f"{get_column_letter(series_count + 3)}2")
-
- if double:
- chart2 = BarChart()
- chart2.add_data(data, titles_from_data=True)
- chart2.set_categories(cats)
- ws.add_chart(chart2, "H20")
- if with_table:
- ws.add_table(Table(displayName="ChartTable", ref=f"A1:{get_column_letter(series_count + 1)}{rows}"))
-
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(
- GeneratedFile(
- path=out,
- group="matrix/chart",
- features=["chart", slug],
- expected_cells=rows * (series_count + 1),
- )
- )
- return files
-
-
-# --- rich text / styles / fonts ------------------------------------------
-
-
-def build_style_files() -> list[GeneratedFile]:
- files: list[GeneratedFile] = []
- styles = [
- ("style_bold", lambda c: setattr(c, "font", Font(bold=True))),
- ("style_italic", lambda c: setattr(c, "font", Font(italic=True))),
- ("style_underline", lambda c: setattr(c, "font", Font(underline="single"))),
- ("style_strike", lambda c: setattr(c, "font", Font(strike=True))),
- ("style_color_red", lambda c: setattr(c, "font", Font(color="FF0000"))),
- ("style_font_size_24", lambda c: setattr(c, "font", Font(size=24))),
- ("style_font_family_courier", lambda c: setattr(c, "font", Font(name="Courier New"))),
- ("style_bg_yellow", lambda c: setattr(c, "fill", PatternFill("solid", start_color="FFFF00"))),
- ("style_bg_pattern_gray125", lambda c: setattr(c, "fill", PatternFill(patternType="gray125"))),
- ("style_border_thin_all", lambda c: setattr(c, "border", Border(left=Side(style="thin"), right=Side(style="thin"), top=Side(style="thin"), bottom=Side(style="thin")))),
- ("style_border_thick_bottom", lambda c: setattr(c, "border", Border(bottom=Side(style="thick")))),
- ("style_border_dashed", lambda c: setattr(c, "border", Border(top=Side(style="dashed")))),
- ("style_border_double", lambda c: setattr(c, "border", Border(bottom=Side(style="double")))),
- ("style_alignment_center", lambda c: setattr(c, "alignment", Alignment(horizontal="center", vertical="center"))),
- ("style_alignment_wrap", lambda c: setattr(c, "alignment", Alignment(wrap_text=True))),
- ("style_alignment_rotate_45", lambda c: setattr(c, "alignment", Alignment(text_rotation=45))),
- ("style_alignment_rotate_90", lambda c: setattr(c, "alignment", Alignment(text_rotation=90))),
- ("style_indent", lambda c: setattr(c, "alignment", Alignment(indent=3))),
- ("style_shrink_to_fit", lambda c: setattr(c, "alignment", Alignment(shrink_to_fit=True))),
- ("style_vertical_text", lambda c: setattr(c, "alignment", Alignment(text_rotation=255))),
- ("style_combined", lambda c: (
- setattr(c, "font", Font(bold=True, italic=True, size=16, color="0000FF")),
- setattr(c, "fill", PatternFill("solid", start_color="FFE0E0")),
- setattr(c, "alignment", Alignment(horizontal="center", vertical="center", wrap_text=True)),
- setattr(c, "border", Border(left=Side("thin"), right=Side("thin"), top=Side("medium"), bottom=Side("medium"))),
- )),
- ("style_number_format_currency", lambda c: setattr(c, "number_format", "$#,##0.00")),
- ("style_number_format_percent", lambda c: setattr(c, "number_format", "0.0%")),
- ("style_number_format_scientific", lambda c: setattr(c, "number_format", "0.00E+00")),
- ("style_number_format_date_iso", lambda c: setattr(c, "number_format", "yyyy-mm-dd")),
- ("style_number_format_date_long", lambda c: setattr(c, "number_format", "dddd, mmmm dd, yyyy")),
- ("style_number_format_time", lambda c: setattr(c, "number_format", "hh:mm:ss")),
- ("style_number_format_negative_red", lambda c: setattr(c, "number_format", "#,##0;[Red]-#,##0")),
- ("style_number_format_accounting", lambda c: setattr(c, "number_format", "_($* #,##0.00_)")),
- ("style_number_format_fraction", lambda c: setattr(c, "number_format", "# ?/?")),
- ]
- for slug, apply in styles:
- wb = Workbook()
- ws = wb.active
- ws.title = "Style"
- ws["A1"] = "Styled Cell"
- if "number_format" in slug:
- ws["A1"] = 1234.567
- if "date" in slug or "time" in slug:
- ws["A1"] = datetime(2024, 6, 15, 14, 30, 45)
- apply(ws["A1"])
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(
- GeneratedFile(
- path=out,
- group="matrix/style",
- features=["style", slug],
- expected_cells=1,
- )
- )
- return files
-
-
-# --- dates & times --------------------------------------------------------
-
-
-def build_date_files() -> list[GeneratedFile]:
- files: list[GeneratedFile] = []
- entries = [
- ("date_today", datetime.now()),
- ("date_epoch_1900", datetime(1900, 1, 1)),
- ("date_epoch_1904", datetime(1904, 1, 2)),
- ("date_y2k", datetime(2000, 1, 1)),
- ("date_future_2099", datetime(2099, 12, 31)),
- ("date_leap_year", datetime(2020, 2, 29)),
- ("date_weird_feb28", datetime(1900, 2, 28)),
- ("date_first_valid", datetime(1900, 3, 1)),
- ("date_midnight", datetime(2024, 6, 1, 0, 0, 0)),
- ("date_nearmidnight", datetime(2024, 6, 1, 23, 59, 59)),
- ("date_iso_string", "2024-06-15"),
- ("date_us_string", "06/15/2024"),
- ("date_eu_string", "15/06/2024"),
- ("date_just_time", time(13, 30, 0)),
- ("date_date_only", date(2024, 6, 15)),
- ("date_with_timedelta_format", datetime(2024, 6, 15)),
- ("date_mixed_formats_in_column", None),
- ("date_fractional_days", 44500.5), # excel serial
- ("date_negative_serial", -1), # invalid
- ("date_text_like_date", "2024-06-15 but not really"),
- ]
- for slug, val in entries:
- wb = Workbook()
- ws = wb.active
- ws.title = "Dates"
- if slug == "date_mixed_formats_in_column":
- ws["A1"] = datetime(2024, 1, 1)
- ws["A2"] = "2024-02-01"
- ws["A3"] = 44593
- ws["A4"] = date(2024, 4, 1)
- ws["A5"] = datetime(2024, 5, 1, 12, 30)
- else:
- ws["A1"] = val
- ws["A1"].number_format = "yyyy-mm-dd hh:mm:ss"
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(
- GeneratedFile(path=out, group="matrix/date", features=["date", slug], expected_cells=1),
- )
- return files
-
-
-# --- errors ---------------------------------------------------------------
-
-
-def build_error_files() -> list[GeneratedFile]:
- files: list[GeneratedFile] = []
- errors = [
- ("error_div_zero", "=1/0"),
- ("error_name", "=UNKNOWN_FN()"),
- ("error_ref", "=#REF!"),
- ("error_value", "=\"a\"+1"),
- ("error_num", "=SQRT(-1)"),
- ("error_null", "=A1 A2"), # intersection of disjoint ranges
- ("error_na", "=NA()"),
- ("error_getting_data", "=VLOOKUP(999,A1:B2,2,FALSE)"),
- ("error_mixed_with_text", "=IF(TRUE,1/0,\"ok\")"),
- ("error_chained", "=1/0+2"),
- ("error_deliberate_bad_ref", "=BadSheet!A1"),
- ("error_unclosed_paren", "=SUM(A1"), # may get rewritten by openpyxl
- ("error_bad_range", "=SUM(A1:)"),
- ("error_too_many_args", "=IF(1,2,3,4,5)"),
- ("error_circular_simple", "=A1"), # A1 refers to itself
- ]
- for slug, formula in errors:
- wb = Workbook()
- ws = wb.active
- ws.title = "Err"
- try:
- if slug == "error_circular_simple":
- ws["A1"] = formula
- else:
- ws["A2"] = 1
- ws["A1"] = formula
- except Exception:
- pass # a few are too malformed even for openpyxl to accept
- out = _matrix_path(slug)
- try:
- _finalize(wb, out)
- except Exception:
- continue
- files.append(
- GeneratedFile(path=out, group="matrix/error", features=["error", slug], expected_cells=2, expected_formulas=1),
- )
- return files
-
-
-# --- hidden rows/cols/sheets ---------------------------------------------
-
-
-def build_hidden_files() -> list[GeneratedFile]:
- files: list[GeneratedFile] = []
- specs = [
- ("hidden_single_row", "row", [3]),
- ("hidden_single_col", "col", ["B"]),
- ("hidden_many_rows", "row", list(range(2, 20, 2))),
- ("hidden_many_cols", "col", ["B", "D", "F", "H"]),
- ("hidden_first_row", "row", [1]),
- ("hidden_last_row", "row", [100]),
- ("hidden_row_at_boundary", "row", [50, 51, 52]),
- ("hidden_entire_block", "row", list(range(5, 15))),
- ("hidden_sheet_tab", "sheet", None),
- ("hidden_very_hidden_sheet", "veryhidden", None),
- ("hidden_with_outline_group", "outline", None),
- ("hidden_mixed_rows_cols", "mixed", None),
- ]
- for slug, kind, items in specs:
- wb = Workbook()
- ws = wb.active
- ws.title = "Main"
- for r in range(1, 30):
- for c in range(1, 10):
- ws.cell(row=r, column=c, value=(r + c) % 100)
- if kind == "row":
- for r in items:
- ws.row_dimensions[r].hidden = True
- elif kind == "col":
- for col in items:
- ws.column_dimensions[col].hidden = True
- elif kind == "sheet":
- hs = wb.create_sheet("HiddenSheet")
- hs["A1"] = "hidden content"
- hs.sheet_state = "hidden"
- elif kind == "veryhidden":
- hs = wb.create_sheet("VeryHidden")
- hs["A1"] = "very hidden"
- hs.sheet_state = "veryHidden"
- elif kind == "outline":
- for r in range(5, 15):
- ws.row_dimensions[r].outline_level = 1
- ws.row_dimensions[r].hidden = True
- elif kind == "mixed":
- ws.row_dimensions[3].hidden = True
- ws.row_dimensions[5].hidden = True
- ws.column_dimensions["C"].hidden = True
- ws.column_dimensions["E"].hidden = True
- hs = wb.create_sheet("MixedHidden")
- hs.sheet_state = "hidden"
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(
- GeneratedFile(path=out, group="matrix/hidden", features=["hidden", slug], expected_cells=29 * 9),
- )
- return files
-
-
-# --- edge addresses -------------------------------------------------------
-
-
-def build_edge_address_files() -> list[GeneratedFile]:
- files: list[GeneratedFile] = []
- specs = [
- ("addr_xfd1", "XFD1", "lastcol_row1"),
- ("addr_a1048576", "A1048576", "col_a_lastrow"),
- ("addr_xfd1048576", "XFD1048576", "last_cell"),
- ("addr_zz1000", "ZZ1000", "mid_extreme"),
- ("addr_aaa1", "AAA1", "col_aaa"),
- ("addr_aa500", "AA500", "col_aa_500"),
- ("addr_very_sparse", None, "sparse"),
- ("addr_column_1000", f"{get_column_letter(1000)}1", "col_1000"),
- ("addr_row_100000", "A100000", "row_100k"),
- ("addr_gaps", None, "gaps"),
- ]
- for slug, addr, kind in specs:
- wb = Workbook()
- ws = wb.active
- ws.title = "Edge"
- ws["A1"] = "anchor"
- if kind == "sparse":
- ws["A1"] = "tl"
- ws["XFD1048576"] = "br"
- elif kind == "gaps":
- for offset in [0, 100, 1000, 10000]:
- ws.cell(row=1 + offset, column=1 + min(offset // 100, 50), value=f"v{offset}")
- elif addr:
- ws[addr] = f"marker_{slug}"
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(
- GeneratedFile(path=out, group="matrix/edge_address", features=["edge_address", slug], expected_cells=2),
- )
- return files
-
-
-# --- sheet name variations ------------------------------------------------
-
-
-SHEET_NAME_VARIANTS = [
- ("sheetname_ascii", "Simple"),
- ("sheetname_spaces", "Has Spaces"),
- ("sheetname_quote", "Has'Quote"),
- ("sheetname_unicode_jp", "日本語シート"),
- ("sheetname_unicode_emoji", "📊 Sheet"),
- ("sheetname_leading_digits", "1stSheet"),
- ("sheetname_long_30chars", "X" * 30),
- ("sheetname_dash_underscore", "my-sheet_name"),
- ("sheetname_hash_unicode", "Résumé-2025"),
- ("sheetname_parens", "Sheet (copy)"),
-]
-
-
-def build_sheet_name_files() -> list[GeneratedFile]:
- files: list[GeneratedFile] = []
- for slug, name in SHEET_NAME_VARIANTS:
- wb = Workbook()
- ws = wb.active
- try:
- ws.title = name[:31] # Excel limit
- except Exception:
- ws.title = "Fallback"
- ws["A1"] = f"in {name!r}"
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(
- GeneratedFile(path=out, group="matrix/sheet_name", features=["sheet_name", slug], expected_cells=1),
- )
- return files
-
-
-# --- hyperlinks / comments / misc ----------------------------------------
-
-
-def build_misc_files() -> list[GeneratedFile]:
- files: list[GeneratedFile] = []
-
- # hyperlinks
- hl_specs = [
- ("hyperlink_external_http", "https://example.com"),
- ("hyperlink_external_https", "https://www.anthropic.com"),
- ("hyperlink_mailto", "mailto:test@example.com"),
- ("hyperlink_file", "file:///tmp/x.txt"),
- ("hyperlink_internal_cell", "#Sheet1!B5"),
- ("hyperlink_internal_named", "#NamedRng"),
- ("hyperlink_many_links", None),
- ]
- for slug, url in hl_specs:
- wb = Workbook()
- ws = wb.active
- ws.title = "Sheet1"
- if slug == "hyperlink_many_links":
- for i in range(1, 21):
- ws.cell(row=i, column=1, value=f"link{i}").hyperlink = f"https://example.com/page/{i}"
- else:
- ws["A1"].hyperlink = url
- ws["A1"].value = f"click ({slug})"
- if slug == "hyperlink_internal_named":
- wb.defined_names.add(DefinedName("NamedRng", attr_text="Sheet1!$A$1"))
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(GeneratedFile(path=out, group="matrix/hyperlink", features=["hyperlink", slug], expected_cells=20 if url is None else 1))
-
- # comments
- comment_specs = [
- ("comment_short", "Quick note"),
- ("comment_multiline", "line1\nline2\nline3"),
- ("comment_unicode", "注释 🔍 ملاحظة"),
- ("comment_long", "Note " * 500),
- ("comment_many_cells", None),
- ]
- for slug, text in comment_specs:
- wb = Workbook()
- ws = wb.active
- ws.title = "Comments"
- if slug == "comment_many_cells":
- for i in range(1, 21):
- ws.cell(row=i, column=1, value=f"c{i}").comment = Comment(f"comment on row {i}", "Builder")
- else:
- ws["A1"] = "Cell with comment"
- ws["A1"].comment = Comment(text, "Builder")
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(GeneratedFile(path=out, group="matrix/comment", features=["comment", slug], expected_cells=20 if text is None else 1))
-
- # freeze panes
- for slug, freeze in [
- ("freeze_row_1", "A2"),
- ("freeze_col_a", "B1"),
- ("freeze_both_a1", "B2"),
- ("freeze_mid_sheet", "C5"),
- ("freeze_deep", "E10"),
- ]:
- wb = Workbook()
- ws = wb.active
- ws.title = "Freeze"
- for r in range(1, 21):
- for c in range(1, 10):
- ws.cell(row=r, column=c, value=f"{r},{c}")
- ws.freeze_panes = freeze
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(GeneratedFile(path=out, group="matrix/freeze_panes", features=["freeze_panes", slug], expected_cells=20 * 9))
-
- # rich text (mixed fonts within a cell) — openpyxl exposes this via CellRichText
- try:
- from openpyxl.cell.rich_text import CellRichText, TextBlock
- from openpyxl.cell.text import InlineFont
- for slug, blocks in [
- ("rich_text_bold_plain", [TextBlock(InlineFont(b=True), "Bold "), TextBlock(InlineFont(), "plain")]),
- ("rich_text_colors", [TextBlock(InlineFont(color="FF0000"), "Red "), TextBlock(InlineFont(color="0000FF"), "Blue")]),
- ("rich_text_sizes", [TextBlock(InlineFont(sz="8"), "small "), TextBlock(InlineFont(sz="18"), "BIG")]),
- ]:
- wb = Workbook()
- ws = wb.active
- ws.title = "Rich"
- ws["A1"] = CellRichText(blocks)
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(GeneratedFile(path=out, group="matrix/rich_text", features=["rich_text", slug], expected_cells=1))
- except Exception:
- pass
-
- # 3D refs / cross-sheet
- for slug in ["threed_sum_across_sheets"]:
- wb = Workbook()
- ws = wb.active
- ws.title = "A"
- for r in range(1, 6):
- ws.cell(row=r, column=1, value=r)
- wb.create_sheet("B")
- for r in range(1, 6):
- wb["B"].cell(row=r, column=1, value=r * 10)
- summary = wb.create_sheet("Summary")
- summary["A1"] = "=SUM(A:B!A1:A5)" # Excel 3D ref syntax
- out = _matrix_path(slug)
- _finalize(wb, out)
- files.append(GeneratedFile(path=out, group="matrix/3d_ref", features=["3d_ref", slug], expected_cells=11, expected_formulas=1))
-
- return files
-
-
-MATRIX_BUILDERS: list[Callable[[], list[GeneratedFile]]] = [
- build_formula_files,
- build_merge_files,
- build_named_range_files,
- build_data_validation_files,
- build_conditional_formatting_files,
- build_table_files,
- build_chart_files,
- build_style_files,
- build_date_files,
- build_error_files,
- build_hidden_files,
- build_edge_address_files,
- build_sheet_name_files,
- build_misc_files,
-]
-
-
-# ----------------------------------------------------------------------------
-# Combinatoric group — randomised feature cocktails
-# ----------------------------------------------------------------------------
-
-
-COMBO_DIR = OUT_ROOT / "combo"
-DENSITIES = [5, 10, 25, 50, 100]
-SEEDS_PER_DENSITY = 80 # → 400 combo files
-
-
-def _rand_cell_value(rng: random.Random):
- kind = rng.choice(["int", "float", "str", "bool", "date", "blank"])
- if kind == "int":
- return rng.randint(-10_000, 10_000)
- if kind == "float":
- return rng.uniform(-1000.0, 1000.0)
- if kind == "str":
- return "".join(rng.choices(string.ascii_letters + string.digits + " ", k=rng.randint(1, 30)))
- if kind == "bool":
- return rng.choice([True, False])
- if kind == "date":
- return date(rng.randint(2000, 2030), rng.randint(1, 12), rng.randint(1, 28))
- return None
-
-
-def _safe_set(ws, row: int, col: int, value) -> bool:
- """Try to set ws cell; return True on success, False if cell is part of a merge."""
- try:
- ws.cell(row=row, column=col, value=value)
- return True
- except (AttributeError, TypeError):
- return False
-
-
-def build_combo_file(seed: int, density: int) -> GeneratedFile | None:
- rng = random.Random(seed * 10_000 + density)
- wb = Workbook()
- ws = wb.active
- ws.title = f"Main_{seed}_{density}"
- cells_written = 0
- formulas = 0
- features: set[str] = set()
-
- for _ in range(density):
- op = rng.choices(
- population=["cell", "formula", "merge", "style", "comment", "hyperlink", "validation", "table", "named"],
- weights=[45, 20, 8, 12, 3, 3, 3, 3, 3],
- k=1,
- )[0]
- r = rng.randint(1, 100)
- c = rng.randint(1, 30)
- if op == "cell":
- if _safe_set(ws, r, c, _rand_cell_value(rng)):
- cells_written += 1
- features.add("cells")
- elif op == "formula":
- if _safe_set(ws, r, c, f"=SUM({get_column_letter(c)}1:{get_column_letter(c)}{max(1, r-1)})"):
- formulas += 1
- features.add("formulas")
- elif op == "merge":
- try:
- r2 = min(r + rng.randint(0, 3), 100)
- c2 = min(c + rng.randint(0, 3), 30)
- if (r, c) != (r2, c2):
- _safe_set(ws, r, c, f"m{seed}") # write before merge
- ws.merge_cells(start_row=r, start_column=c, end_row=r2, end_column=c2)
- features.add("merge")
- except Exception:
- pass
- elif op == "style":
- try:
- cell = ws.cell(row=r, column=c)
- if cell.value is None:
- if _safe_set(ws, r, c, rng.randint(0, 99)):
- cells_written += 1
- cell = ws.cell(row=r, column=c)
- cell.font = Font(bold=rng.choice([True, False]), italic=rng.choice([True, False]), color=f"{rng.randint(0, 0xFFFFFF):06X}")
- cell.fill = PatternFill("solid", start_color=f"{rng.randint(0xAAAAAA, 0xFFFFFF):06X}")
- features.add("style")
- except AttributeError:
- pass
- elif op == "comment":
- try:
- if _safe_set(ws, r, c, "c"):
- ws.cell(row=r, column=c).comment = Comment(f"seed{seed}", "combo")
- cells_written += 1
- features.add("comment")
- except Exception:
- pass
- elif op == "hyperlink":
- try:
- if _safe_set(ws, r, c, "lnk"):
- ws.cell(row=r, column=c).hyperlink = f"https://example.com/{seed}/{r}-{c}"
- cells_written += 1
- features.add("hyperlink")
- except Exception:
- pass
- elif op == "validation":
- try:
- dv = DataValidation(type="list", formula1='"A,B,C"')
- ws.add_data_validation(dv)
- dv.add(f"{get_column_letter(c)}{r}")
- features.add("validation")
- except Exception:
- pass
- elif op == "table":
- try:
- r2 = min(r + 3, 100)
- c2 = min(c + 2, 30)
- if r2 > r and c2 > c:
- for rr in range(r, r2 + 1):
- for cc in range(c, c2 + 1):
- try:
- if ws.cell(row=rr, column=cc).value is None:
- _safe_set(ws, rr, cc, rr * cc)
- except AttributeError:
- pass
- for cc in range(c, c2 + 1):
- _safe_set(ws, r, cc, f"H{cc}")
- tab_name = f"T{seed}_{density}_{rng.randint(0, 99)}"
- ws.add_table(Table(displayName=tab_name, ref=f"{get_column_letter(c)}{r}:{get_column_letter(c2)}{r2}"))
- features.add("table")
- except Exception:
- pass
- elif op == "named":
- try:
- nm = f"N_{seed}_{density}_{rng.randint(0, 99)}"
- wb.defined_names.add(DefinedName(nm, attr_text=f"{ws.title}!${get_column_letter(c)}${r}"))
- features.add("named_range")
- except Exception:
- pass
-
- out = COMBO_DIR / f"combo_d{density:03d}_s{seed:03d}.xlsx"
- try:
- _finalize(wb, out)
- except Exception:
- return None
- return GeneratedFile(
- path=out,
- group="combo",
- features=sorted(features),
- expected_cells=cells_written,
- expected_formulas=formulas,
- notes=f"seed={seed} density={density}",
- )
-
-
-def build_combo_files(limit: int | None) -> list[GeneratedFile]:
- files: list[GeneratedFile] = []
- count = 0
- for density in DENSITIES:
- for seed in range(SEEDS_PER_DENSITY):
- if limit is not None and count >= limit:
- return files
- gf = build_combo_file(seed, density)
- if gf:
- files.append(gf)
- count += 1
- return files
-
-
-# ----------------------------------------------------------------------------
-# Adversarial group — try to break the parser
-# ----------------------------------------------------------------------------
-
-
-ADVERSARIAL_DIR = OUT_ROOT / "adversarial"
-
-
-def _adv_path(slug: str) -> Path:
- return ADVERSARIAL_DIR / f"{slug}.xlsx"
-
-
-def build_adversarial_files(limit: int | None) -> list[GeneratedFile]:
- files: list[GeneratedFile] = []
- specs: list[tuple[str, Callable[[Workbook], tuple[int, int, str]]]] = []
-
- def _mk(slug: str):
- def deco(fn: Callable[[Workbook], tuple[int, int, str]]):
- specs.append((slug, fn))
- return fn
- return deco
-
- @_mk("adv_empty_workbook")
- def _(wb):
- # openpyxl always has one sheet; clear it
- ws = wb.active
- ws.title = "Empty"
- return 0, 0, "no cells"
-
- @_mk("adv_one_cell_1e300")
- def _(wb):
- wb.active["A1"] = 1e300
- return 1, 0, "huge float"
-
- @_mk("adv_one_cell_neg_1e300")
- def _(wb):
- wb.active["A1"] = -1e300
- return 1, 0, "huge negative"
-
- @_mk("adv_one_cell_tiny")
- def _(wb):
- wb.active["A1"] = 1e-300
- return 1, 0, "tiny float"
-
- @_mk("adv_unicode_bomb")
- def _(wb):
- ws = wb.active
- emojis = "🚀🔥💀🎯🌀⚡️🌈🎨🧪💡" * 20
- rtl = "مرحبا بكم في اختبار التحليل" * 5
- cjk = "こんにちは世界 你好世界 안녕하세요" * 5
- ws["A1"] = emojis + " " + rtl + " " + cjk
- ws["A2"] = "\u200B\u200C\u200D\ufeff" # zero-width chars
- ws["A3"] = "a" * 32_000 # long string
- return 3, 0, "unicode stress"
-
- @_mk("adv_circular_chain_10")
- def _(wb):
- ws = wb.active
- for i in range(1, 10):
- ws.cell(row=i, column=1, value=f"=A{i+1}")
- ws["A10"] = "=A1"
- return 10, 10, "10-step cycle"
-
- @_mk("adv_formula_chain_deep_500")
- def _(wb):
- ws = wb.active
- ws["A1"] = 1
- for i in range(2, 501):
- ws.cell(row=i, column=1, value=f"=A{i-1}+1")
- return 500, 499, "500-deep chain"
-
- @_mk("adv_huge_merge_1000x100")
- def _(wb):
- ws = wb.active
- ws.merge_cells("A1:CV1000") # 100 cols × 1000 rows
- ws["A1"] = "one giant merge"
- return 1, 0, "100k-cell merge"
-
- @_mk("adv_many_merges_5000")
- def _(wb):
- ws = wb.active
- for i in range(5000):
- r = i // 50 + 1
- c = (i % 50) * 2 + 1
- try:
- ws.merge_cells(start_row=r, start_column=c, end_row=r, end_column=c + 1)
- ws.cell(row=r, column=c, value="m")
- except Exception:
- pass
- return 2500, 0, "5000 merges"
-
- @_mk("adv_100_sheets")
- def _(wb):
- wb.active.title = "S0"
- for i in range(1, 100):
- ws = wb.create_sheet(f"S{i}")
- ws["A1"] = i
- return 100, 0, "100 sheets"
-
- @_mk("adv_very_wide_2000_cols")
- def _(wb):
- ws = wb.active
- for c in range(1, 2001):
- ws.cell(row=1, column=c, value=c)
- return 2000, 0, "2000 cols in one row"
-
- @_mk("adv_very_tall_20k_rows")
- def _(wb):
- ws = wb.active
- for r in range(1, 20_001):
- ws.cell(row=r, column=1, value=r)
- return 20_000, 0, "20k rows"
-
- @_mk("adv_sparse_million")
- def _(wb):
- ws = wb.active
- for r in [1, 10, 100, 1000, 10_000, 100_000, 500_000, 1_000_000]:
- ws.cell(row=r, column=1, value=f"r{r}")
- ws["A1"].value = "start"
- return 8, 0, "sparse across 1M rows"
-
- @_mk("adv_all_error_types")
- def _(wb):
- ws = wb.active
- for i, formula in enumerate([
- "=1/0", "=SQRT(-1)", "=NA()", "=BAD_FN()", "=#REF!", '="a"+1',
- ], start=1):
- ws.cell(row=i, column=1, value=formula)
- return 6, 6, "errors galore"
-
- @_mk("adv_broken_refs")
- def _(wb):
- ws = wb.active
- ws["A1"] = "=MissingSheet!B5"
- ws["A2"] = "=OtherBook.xlsx!Sheet1!A1"
- ws["A3"] = "=#REF!+1"
- return 3, 3, "dangling references"
-
- @_mk("adv_long_formula")
- def _(wb):
- ws = wb.active
- ws["A1"] = 1
- long_expr = "=" + "+".join("A1" for _ in range(2000))
- ws["B1"] = long_expr
- return 2, 1, "very long formula"
-
- @_mk("adv_long_cell_string")
- def _(wb):
- ws = wb.active
- ws["A1"] = "X" * 32_767 # Excel limit
- return 1, 0, "32k char cell"
-
- @_mk("adv_all_formulas_sheet")
- def _(wb):
- ws = wb.active
- for r in range(1, 101):
- for c in range(1, 6):
- ws.cell(row=r, column=c, value=f"={get_column_letter(c)}{((r - 1) % 5) + 1}+1")
- return 500, 500, "500 formulas"
-
- @_mk("adv_massive_table")
- def _(wb):
- ws = wb.active
- for c in range(1, 51):
- ws.cell(row=1, column=c, value=f"C{c}")
- for r in range(2, 202):
- for c in range(1, 51):
- ws.cell(row=r, column=c, value=(r * c) % 997)
- ws.add_table(Table(displayName="Huge", ref=f"A1:{get_column_letter(50)}201"))
- return 10_050, 0, "50x200 table"
-
- @_mk("adv_cyclic_cross_sheet")
- def _(wb):
- a = wb.active
- a.title = "A"
- a["A1"] = "=B!A1"
- b = wb.create_sheet("B")
- b["A1"] = "=A!A1"
- return 2, 2, "cross-sheet cycle"
-
- @_mk("adv_many_named_ranges")
- def _(wb):
- ws = wb.active
- for i in range(1, 301):
- wb.defined_names.add(DefinedName(f"N{i}", attr_text=f"Sheet!${get_column_letter((i % 30) + 1)}${(i % 100) + 1}"))
- ws["A1"] = "seed"
- return 1, 0, "300 named ranges"
-
- @_mk("adv_duplicate_sheet_names_almost")
- def _(wb):
- wb.active.title = "Data"
- wb.create_sheet("data")
- wb.create_sheet("DATA")
- return 0, 0, "case-sensitive sheet names"
-
- @_mk("adv_rtl_sheet")
- def _(wb):
- ws = wb.active
- ws.sheet_view.rightToLeft = True
- ws["A1"] = "النص يقرأ من اليمين"
- return 1, 0, "RTL view"
-
- @_mk("adv_extreme_column_width")
- def _(wb):
- ws = wb.active
- ws.column_dimensions["A"].width = 255
- ws.row_dimensions[1].height = 409 # excel max
- ws["A1"] = "wide+tall"
- return 1, 0, "max col/row size"
-
- @_mk("adv_autofilter_large")
- def _(wb):
- ws = wb.active
- for c in range(1, 11):
- ws.cell(row=1, column=c, value=f"H{c}")
- for r in range(2, 301):
- for c in range(1, 11):
- ws.cell(row=r, column=c, value=r * c)
- ws.auto_filter.ref = "A1:J300"
- return 3000, 0, "autofilter 3k cells"
-
- @_mk("adv_mixed_types_same_column")
- def _(wb):
- ws = wb.active
- for r in range(1, 51):
- if r % 5 == 0:
- ws.cell(row=r, column=1, value=f"text_{r}")
- elif r % 5 == 1:
- ws.cell(row=r, column=1, value=r)
- elif r % 5 == 2:
- ws.cell(row=r, column=1, value=float(r) / 7.0)
- elif r % 5 == 3:
- ws.cell(row=r, column=1, value=date(2024, (r % 12) + 1, 1))
- else:
- ws.cell(row=r, column=1, value=(r % 2 == 0))
- return 50, 0, "mixed types in one column"
-
- _SAFE_STR_CHARS = string.ascii_letters + string.digits + " -_.,:;!?@#$%^&*()[]{}<>+=/|~"
-
- # adversarial via parametrised generator to pad counts to ~1000 total
- for i in range(1, 278): # 277 parametric adversarial files → 1000 total generated
- rng = random.Random(10_000 + i)
-
- @_mk(f"adv_param_{i:03d}")
- def _(wb, rng=rng, i=i):
- ws = wb.active
- # Keep sizes modest so the full bench runs under 10 min wall-clock.
- n_cells = rng.randint(100, 800)
- cells = 0
- formulas = 0
- for _ in range(n_cells):
- r = rng.randint(1, 300)
- c = rng.randint(1, 50)
- kind = rng.choice(["int", "str", "formula", "date", "bool"])
- try:
- if kind == "int":
- val = rng.randint(-1_000_000, 1_000_000)
- elif kind == "str":
- val = "".join(rng.choices(_SAFE_STR_CHARS, k=rng.randint(1, 50)))
- elif kind == "formula":
- val = f"={get_column_letter(max(1, c - 1))}{max(1, r - 1)}+1"
- elif kind == "date":
- val = date(rng.randint(1900, 2099), rng.randint(1, 12), rng.randint(1, 28))
- else:
- val = rng.choice([True, False])
- if _safe_set(ws, r, c, val):
- cells += 1
- if kind == "formula":
- formulas += 1
- except Exception:
- pass
- for _ in range(rng.randint(0, 20)):
- try:
- r0 = rng.randint(1, 100)
- c0 = rng.randint(1, 50)
- ws.merge_cells(start_row=r0, start_column=c0, end_row=r0 + rng.randint(0, 5), end_column=c0 + rng.randint(0, 5))
- except Exception:
- pass
- return cells, formulas, f"param seed {i}"
-
- files: list[GeneratedFile] = []
- count = 0
- for slug, fn in specs:
- if limit is not None and count >= limit:
- break
- wb = Workbook()
- try:
- cells, formulas, notes = fn(wb)
- except Exception as exc:
- # skip uncooperative generators
- print(f" ⚠ adversarial {slug} failed to build: {exc}", file=sys.stderr)
- continue
- out = _adv_path(slug)
- try:
- _finalize(wb, out)
- except Exception as exc:
- print(f" ⚠ adversarial {slug} failed to save: {exc}", file=sys.stderr)
- continue
- files.append(
- GeneratedFile(
- path=out,
- group="adversarial",
- features=["adversarial", slug],
- expected_cells=cells,
- expected_formulas=formulas,
- notes=notes,
- )
- )
- count += 1
- return files
-
-
-# ----------------------------------------------------------------------------
-# Entry point
-# ----------------------------------------------------------------------------
-
-
-def build_all(groups: set[str], force: bool, limit: int | None) -> list[GeneratedFile]:
- all_files: list[GeneratedFile] = []
- if "matrix" in groups:
- MATRIX_DIR.mkdir(parents=True, exist_ok=True)
- for builder in MATRIX_BUILDERS:
- for gf in builder():
- all_files.append(gf)
- if limit is not None and len(all_files) >= limit:
- return all_files
- if "combo" in groups:
- COMBO_DIR.mkdir(parents=True, exist_ok=True)
- remaining = None if limit is None else max(0, limit - len(all_files))
- all_files.extend(build_combo_files(remaining))
- if limit is not None and len(all_files) >= limit:
- return all_files
- if "adversarial" in groups:
- ADVERSARIAL_DIR.mkdir(parents=True, exist_ok=True)
- remaining = None if limit is None else max(0, limit - len(all_files))
- all_files.extend(build_adversarial_files(remaining))
- return all_files
-
-
-def write_manifest(files: list[GeneratedFile]) -> None:
- by_group: dict[str, int] = {}
- rows = []
- for gf in files:
- rows.append(gf.to_manifest_row())
- by_group[gf.group] = by_group.get(gf.group, 0) + 1
- manifest = {
- "version": 1,
- "generated_at": "deterministic",
- "total_files": len(files),
- "by_group": by_group,
- "files": rows,
- }
- MANIFEST_PATH.write_text(json.dumps(manifest, indent=2, sort_keys=False))
- print(f"✓ manifest written → {MANIFEST_PATH.relative_to(ROOT)}")
-
-
-def main() -> int:
- parser = argparse.ArgumentParser(description=__doc__)
- parser.add_argument("--group", choices=["matrix", "combo", "adversarial", "all"], default="all")
- parser.add_argument("--force", action="store_true", help="regenerate even if outputs exist")
- parser.add_argument("--limit", type=int, help="stop after N files (smoke mode)")
- parser.add_argument("--clean", action="store_true", help="wipe testBench/generated/ first")
- args = parser.parse_args()
-
- if args.clean and OUT_ROOT.exists():
- import shutil
- shutil.rmtree(OUT_ROOT)
- print(f"✓ cleaned {OUT_ROOT.relative_to(ROOT)}")
-
- groups = {"matrix", "combo", "adversarial"} if args.group == "all" else {args.group}
- OUT_ROOT.mkdir(parents=True, exist_ok=True)
-
- print(f"building testBench into {OUT_ROOT.relative_to(ROOT)} groups={sorted(groups)} limit={args.limit}")
- files = build_all(groups, args.force, args.limit)
- write_manifest(files)
-
- print(f"\n{'═' * 60}")
- print(f" Generated {len(files)} workbooks")
- by_group: dict[str, int] = {}
- for gf in files:
- by_group[gf.group] = by_group.get(gf.group, 0) + 1
- for g in sorted(by_group):
- print(f" {g:32s} {by_group[g]:4d}")
- print(f"{'═' * 60}")
- return 0
-
-
-if __name__ == "__main__":
- raise SystemExit(main())
diff --git a/scripts/generate_enterprise_fixtures.py b/scripts/generate_enterprise_fixtures.py
deleted file mode 100644
index 189bb78..0000000
--- a/scripts/generate_enterprise_fixtures.py
+++ /dev/null
@@ -1,150 +0,0 @@
-"""Generate small, deterministic enterprise-style Excel fixtures.
-
-These fixtures are used by enterprise scoring tests and corpus metrics.
-They are intentionally lightweight so they can be generated at test time
-without network access or large disk usage.
-"""
-
-
-
-from pathlib import Path
-from typing import Callable
-
-from openpyxl import Workbook
-from openpyxl.styles import Font
-from openpyxl.workbook.defined_name import DefinedName
-
-
-ROOT = Path(__file__).resolve().parent.parent
-TARGET_DIR = ROOT / "testBench" / "enterprise"
-
-
-def _prepare_target() -> None:
- TARGET_DIR.mkdir(parents=True, exist_ok=True)
-
-
-def create_financial_model() -> Workbook:
- wb = Workbook()
- ws = wb.active
- ws.title = "Model"
-
- ws.merge_cells("A1:D1")
- ws["A1"] = "Financial Model Q1 2026"
- ws["A1"].font = Font(bold=True, size=14)
-
- ws["A3"] = "ASSUMPTIONS"
- ws["A4"] = "Rent per unit"
- ws["B4"] = 2500
- ws["A5"] = "Units occupied"
- ws["B5"] = 42
-
- ws["A7"] = "RESULTS"
- ws["A8"] = "Total Revenue"
- ws["B8"] = "=B4*B5"
-
- wb.defined_names.add(DefinedName("UnitCount", attr_text="Model!$B$5"))
- wb.defined_names.add(DefinedName("RentPerUnit", attr_text="Model!$B$4"))
-
- return wb
-
-
-def create_inventory_tracker() -> Workbook:
- wb = Workbook()
- ws = wb.active
- ws.title = "Master"
-
- ws["A1"] = "SKU"
- ws["B1"] = "Description"
- ws["C1"] = "Qty"
- ws["D1"] = "Unit Cost"
-
- for i in range(2, 52):
- ws[f"A{i}"] = f"SKU-{i:04d}"
- ws[f"B{i}"] = f"Product {i}"
- ws[f"C{i}"] = i * 100
- ws[f"D{i}"] = i * 1.5
-
- tx = wb.create_sheet("Transactions")
- tx["A1"] = "SKU"
- tx["B1"] = "Qty"
- tx["C1"] = "Total"
-
- for i in range(2, 102):
- tx[f"A{i}"] = f"=Master!A{(i % 50) + 2}"
- tx[f"B{i}"] = (i % 10) + 1
- tx[f"C{i}"] = f"=VLOOKUP(A{i},Master!A:D,4,0)*B{i}"
-
- return wb
-
-
-def create_forecast_model() -> Workbook:
- wb = Workbook()
- base = wb.active
- base.title = "Base"
-
- for month in range(1, 13):
- base[f"A{month}"] = f"Month {month}"
- base[f"B{month}"] = 10000 * (1 + month * 0.05)
-
- pess = wb.create_sheet("Pessimistic")
- opt = wb.create_sheet("Optimistic")
- for month in range(1, 13):
- pess[f"B{month}"] = f"=Base!B{month}*0.8"
- opt[f"B{month}"] = f"=Base!B{month}*1.2"
-
- return wb
-
-
-def create_operations_tracker() -> Workbook:
- wb = Workbook()
- ws = wb.active
- ws.title = "Ops"
-
- ws["A1"] = "Project"
- ws["B1"] = "Status"
- ws["C1"] = "Budget"
- ws["D1"] = "Actual"
- ws["E1"] = "Variance %"
-
- statuses = ["Active", "Complete", "On Hold"]
- for i in range(2, 22):
- ws[f"A{i}"] = f"Project {i-1}"
- ws[f"B{i}"] = statuses[i % 3]
- ws[f"C{i}"] = i * 50000
- ws[f"D{i}"] = i * 50000 * (1 + (i % 5) * 0.1)
- ws[f"E{i}"] = f"=(D{i}-C{i})/C{i}"
-
- ref = wb.create_sheet("Reference", 1)
- ref.sheet_state = "hidden"
- ref["A1"] = "Rate"
- ref["A2"] = 1.05
-
- return wb
-
-
-def _write_workbook(name: str, builder: Callable[[], Workbook]) -> Path:
- _prepare_target()
- path = TARGET_DIR / name
- if path.exists():
- return path
- wb = builder()
- wb.save(path)
- return path
-
-
-def generate_all() -> list[Path]:
- """Generate all enterprise fixtures and return their paths."""
- fixtures = [
- ("financial_model.xlsx", create_financial_model),
- ("inventory_tracker.xlsx", create_inventory_tracker),
- ("forecast_model.xlsx", create_forecast_model),
- ("operations_tracker.xlsx", create_operations_tracker),
- ]
-
- return [_write_workbook(name, builder) for name, builder in fixtures]
-
-
-if __name__ == "__main__":
- paths = generate_all()
- for p in paths:
- print(f"✓ Generated {p.relative_to(ROOT)}")
diff --git a/site/index.html b/site/index.html
index 585f9af..37fca39 100644
--- a/site/index.html
+++ b/site/index.html
@@ -160,7 +160,7 @@
"name": "What file formats does ks-xlsx-parser support?",
"acceptedAnswer": {
"@type": "Answer",
- "text": "ks-xlsx-parser supports .xlsx and .xlsm (OOXML). Legacy .xls (BIFF) is not supported — convert those externally first. The parser handles unicode content, very wide sheets, very tall sheets, sparse workbooks, 250-sheet workbooks, circular formula chains, and files with 32k-character cells, all covered in the 1054-workbook testBench that runs in CI."
+ "text": "ks-xlsx-parser supports .xlsx and .xlsm (OOXML). Legacy .xls (BIFF) is not supported — convert those externally first. The parser handles unicode content, very wide sheets, very tall sheets, sparse workbooks, 250-sheet workbooks, circular formula chains, and files with 32k-character cells. It is benchmarked end-to-end on the 5,458-workbook SpreadsheetBench corpus."
}
},
{
@@ -168,7 +168,7 @@
"name": "How fast is ks-xlsx-parser?",
"acceptedAnswer": {
"@type": "Answer",
- "text": "The full 1054-workbook testBench round-trips in approximately 70 seconds on a single machine. A real-world 21k-cell, 13-sheet financial model parses in about 4.6 seconds (previously 307 seconds before a circular-ref caching fix). Sparse workbooks with extreme addresses parse in under 200 ms."
+ "text": "SpreadsheetBench's full 5,458-workbook corpus parses end-to-end in roughly 20 minutes on a single machine (low double-digit ms P50 parse time). A real-world 21k-cell, 13-sheet financial model parses in about 4.6 seconds (previously 307 seconds before a circular-ref caching fix). Sparse workbooks with extreme addresses parse in under 200 ms."
}
}
]
@@ -432,7 +432,7 @@
Features
Demo
Compare
- testBench
+ Benchmarks
Docs
⭐ Star on GitHub
@@ -551,14 +551,14 @@ What you get back
TESTED & FAST
-
1054-workbook stress corpus. Every commit.
-
testBench ships with the repo and runs in CI. One-feature-per-file matrix, randomised density cocktails, and engineered adversarial files — unicode bombs, circular refs, sparse 1M-row sheets, 250-sheet workbooks.
+
SpreadsheetBench: 5,458 real-world workbooks.
+
We benchmark against the public SpreadsheetBench v0.1 corpus — 912 instruction tasks, 5,458 unique xlsx files spanning financial models, project trackers, HR records, and a long tail of small-business spreadsheets.
-
1054/1054tests passing on every CI run
-
~70send-to-end bench wall time
-
66×Walbridge financial model speedup (0.1.1)
-
17 MBdataset zip attached to each release
+
5,455 / 5,458parsed cleanly (99.945%)
+
912instruction × retrieval tasks measured
+
66×21k-cell financial model speedup (0.1.1)
+
vs Doclingtied @1, +2.7pp @3, +1.8pp @5
@@ -684,7 +684,7 @@ Frequently asked questions
How fast is it?
- The full 1054-workbook testBench round-trips in about 70 seconds. A real 21k-cell, 13-sheet financial model parses in ~4.6 s. Sparse workbooks with extreme addresses parse in under 200 ms. Details in the CHANGELOG.
+ SpreadsheetBench's full 5,458-workbook corpus parses end-to-end in roughly 20 minutes on a single machine. A real 21k-cell, 13-sheet financial model parses in ~4.6 s. Sparse workbooks with extreme addresses parse in under 200 ms. Details in the CHANGELOG.
diff --git a/src/models/common.py b/src/models/common.py
index d199da8..0d4af5b 100644
--- a/src/models/common.py
+++ b/src/models/common.py
@@ -64,7 +64,7 @@ class CellCoord:
"""A single cell coordinate (1-indexed row and column).
**Not a Pydantic model** — frozen slotted dataclass. Profiling showed
- 339k Pydantic inits on Walbridge contributed ~0.65 s of parse time;
+ 339k Pydantic inits on a real-world workbook contributed ~0.65 s of parse time;
dataclass construction is ~2.2× faster with the same immutability
and equality semantics. Validation of ``row >= 1`` / ``col >= 1`` is
dropped: all producers in this codebase build coords from parsed
diff --git a/src/parsers/workbook_parser.py b/src/parsers/workbook_parser.py
index 899402f..bee0452 100644
--- a/src/parsers/workbook_parser.py
+++ b/src/parsers/workbook_parser.py
@@ -81,9 +81,9 @@ def __init__(
max_workers: Number of parallel workers.
build_dep_graph: Build the formula dependency graph + run cycle
detection. Fast mode sets this False — on formula-heavy
- workbooks (Walbridge: 17.6k formulas → 48k edges) the dep
- graph is one of the largest remaining costs and nothing in
- fast mode consumes it.
+ workbooks (e.g. ~17k formulas → 48k edges on one real
+ financial model) the dep graph is one of the largest
+ remaining costs and nothing in fast mode consumes it.
"""
if path is None and content is None:
raise ValueError("Either path or content must be provided")
@@ -249,8 +249,8 @@ def parse(self) -> WorkbookDTO:
# Build dependency graph (skippable in fast mode — this stage scans
# every formula, runs the parser, creates thousands of edges, and
- # then runs cycle detection; on Walbridge alone it accounts for
- # ~25% of the full-mode wall clock).
+ # then runs cycle detection; on a 17k-formula real-world workbook it
+ # accounts for ~25% of the full-mode wall clock).
if self._build_dep_graph:
try:
from formula.dependency_builder import DependencyBuilder
diff --git a/testBench/README.md b/testBench/README.md
deleted file mode 100644
index 68cc9eb..0000000
--- a/testBench/README.md
+++ /dev/null
@@ -1,62 +0,0 @@
-# testBench — the ks-xlsx-parser stress corpus
-
-A single, self-contained dataset of **1053 `.xlsx` workbooks** used to
-regression-test and stress-test [ks-xlsx-parser](https://github.com/knowledgestack/ks-xlsx-parser).
-
-It is MIT-licensed, free to reuse for any Excel parser research (commercial or
-otherwise). If it saves you time, please [star the repo](https://github.com/knowledgestack/ks-xlsx-parser) —
-that's the only signal we have that open-sourcing this was worth doing.
-
-## Layout
-
-| Directory | Files | What's in it |
-|-----------|------:|--------------|
-| `real_world/` | 8 | Real anonymised workbooks shipped as demos (financial models, project trackers, engineering calcs). |
-| `enterprise/` | 4 | Deterministic enterprise templates (financial / forecast / inventory / operations). |
-| `github_datasets/` | 10 | Public CSV→XLSX conversions (iris, titanic, superstore, apple stock, …). |
-| `stress/curated/` | 26 | 26 hand-authored progressive stress levels (`stress_level_0`…`stress_level_25`). |
-| `stress/merges/` | 5 | Pathological merge patterns that historically broke parsers. |
-| `generated/matrix/` | ~297 | **One feature per file** across 18 categories (formulas, merges, named ranges, data validation, conditional formatting, tables, charts, styles, dates, errors, hidden rows/cols, hyperlinks, comments, rich text, freeze panes, edge addresses, sheet names, 3D refs). |
-| `generated/combo/` | 400 | Deterministically randomised cocktails at 5 densities × 80 seeds. |
-| `generated/adversarial/`| 300 | Files engineered to break parsers: deep formula chains, 1M-row sparse sheets, 250-sheet workbooks, unicode bombs, huge merges, broken refs, 32 k-char cells, circular refs, long formulas. |
-| **Total** | **1053** | |
-
-The `generated/` tree is produced by [`scripts/build_testbench.py`](../scripts/build_testbench.py)
-and is deterministic — identical commits produce byte-identical files. The other
-directories are checked in as-is.
-
-## Manifest
-
-`generated/MANIFEST.json` lists every generated file with:
-
-* `group` — matrix category, combo, or adversarial
-* `features` — tags describing what the file exercises
-* `expected_cells` — sanity check count
-* `expected_formulas` — sanity check count
-* `sha256` / `size_bytes` — integrity + packaging info
-* `notes` — e.g. seed/density for combo files
-
-## How we use it
-
-```bash
-# regenerate the 1000-file generated tree (idempotent)
-make testbench-build
-
-# parse every file and record failures to metrics/testbench/failures.json
-make testbench
-
-# package for a GitHub release
-make testbench-zip
-```
-
-The round-trip test (`tests/test_testbench_roundtrip.py`) asserts every
-workbook parses without raising and produces a non-empty JSON result. The
-failure log is a first-class artifact — every parser regression shows up as a
-new entry.
-
-## Licensing
-
-All files generated by `build_testbench.py` are synthetic and released under
-MIT alongside the parser. The `real_world/`, `enterprise/`, and
-`github_datasets/` contents are either authored for this project or sourced
-from public-domain datasets; attribution is in the parent repo.
diff --git a/testBench/enterprise/financial_model.xlsx b/testBench/enterprise/financial_model.xlsx
deleted file mode 100644
index f84c12d..0000000
Binary files a/testBench/enterprise/financial_model.xlsx and /dev/null differ
diff --git a/testBench/enterprise/forecast_model.xlsx b/testBench/enterprise/forecast_model.xlsx
deleted file mode 100644
index 7f08d91..0000000
Binary files a/testBench/enterprise/forecast_model.xlsx and /dev/null differ
diff --git a/testBench/enterprise/inventory_tracker.xlsx b/testBench/enterprise/inventory_tracker.xlsx
deleted file mode 100644
index a13fcd1..0000000
Binary files a/testBench/enterprise/inventory_tracker.xlsx and /dev/null differ
diff --git a/testBench/enterprise/operations_tracker.xlsx b/testBench/enterprise/operations_tracker.xlsx
deleted file mode 100644
index a3997b3..0000000
Binary files a/testBench/enterprise/operations_tracker.xlsx and /dev/null differ
diff --git a/testBench/github_datasets/apple_stock.xlsx b/testBench/github_datasets/apple_stock.xlsx
deleted file mode 100644
index 62edeb6..0000000
Binary files a/testBench/github_datasets/apple_stock.xlsx and /dev/null differ
diff --git a/testBench/github_datasets/bestsellers.xlsx b/testBench/github_datasets/bestsellers.xlsx
deleted file mode 100644
index 665b312..0000000
Binary files a/testBench/github_datasets/bestsellers.xlsx and /dev/null differ
diff --git a/testBench/github_datasets/boston.xlsx b/testBench/github_datasets/boston.xlsx
deleted file mode 100644
index ab85439..0000000
Binary files a/testBench/github_datasets/boston.xlsx and /dev/null differ
diff --git a/testBench/github_datasets/breast_cancer.xlsx b/testBench/github_datasets/breast_cancer.xlsx
deleted file mode 100644
index adca3b2..0000000
Binary files a/testBench/github_datasets/breast_cancer.xlsx and /dev/null differ
diff --git a/testBench/github_datasets/iris.xlsx b/testBench/github_datasets/iris.xlsx
deleted file mode 100644
index fede151..0000000
Binary files a/testBench/github_datasets/iris.xlsx and /dev/null differ
diff --git a/testBench/github_datasets/superstore.xlsx b/testBench/github_datasets/superstore.xlsx
deleted file mode 100644
index c51783b..0000000
Binary files a/testBench/github_datasets/superstore.xlsx and /dev/null differ
diff --git a/testBench/github_datasets/titanic.xlsx b/testBench/github_datasets/titanic.xlsx
deleted file mode 100644
index 5cba13b..0000000
Binary files a/testBench/github_datasets/titanic.xlsx and /dev/null differ
diff --git a/testBench/github_datasets/winequality_red.xlsx b/testBench/github_datasets/winequality_red.xlsx
deleted file mode 100644
index 58ddf1e..0000000
Binary files a/testBench/github_datasets/winequality_red.xlsx and /dev/null differ
diff --git a/testBench/github_datasets/world_happiness_2019.xlsx b/testBench/github_datasets/world_happiness_2019.xlsx
deleted file mode 100644
index 6de5ad7..0000000
Binary files a/testBench/github_datasets/world_happiness_2019.xlsx and /dev/null differ
diff --git a/testBench/github_datasets/worldcups.xlsx b/testBench/github_datasets/worldcups.xlsx
deleted file mode 100644
index 4b122f5..0000000
Binary files a/testBench/github_datasets/worldcups.xlsx and /dev/null differ
diff --git a/testBench/real_world/Employee Sample Data.xlsx b/testBench/real_world/Employee Sample Data.xlsx
deleted file mode 100644
index 4cc5a38..0000000
Binary files a/testBench/real_world/Employee Sample Data.xlsx and /dev/null differ
diff --git a/testBench/real_world/Financials Sample Data.xlsx b/testBench/real_world/Financials Sample Data.xlsx
deleted file mode 100644
index 76bc6dd..0000000
Binary files a/testBench/real_world/Financials Sample Data.xlsx and /dev/null differ
diff --git a/testBench/real_world/data_inventory.xlsx b/testBench/real_world/data_inventory.xlsx
deleted file mode 100644
index 3371e0c..0000000
Binary files a/testBench/real_world/data_inventory.xlsx and /dev/null differ
diff --git a/testBench/real_world/engineering_calcs.xlsx b/testBench/real_world/engineering_calcs.xlsx
deleted file mode 100644
index 49e1fb0..0000000
Binary files a/testBench/real_world/engineering_calcs.xlsx and /dev/null differ
diff --git a/testBench/real_world/financial_model.xlsx b/testBench/real_world/financial_model.xlsx
deleted file mode 100644
index 276ea8a..0000000
Binary files a/testBench/real_world/financial_model.xlsx and /dev/null differ
diff --git a/testBench/real_world/project_tracker.xlsx b/testBench/real_world/project_tracker.xlsx
deleted file mode 100644
index bca638a..0000000
Binary files a/testBench/real_world/project_tracker.xlsx and /dev/null differ
diff --git a/testBench/real_world/sales_dashboard.xlsx b/testBench/real_world/sales_dashboard.xlsx
deleted file mode 100644
index fb05bb8..0000000
Binary files a/testBench/real_world/sales_dashboard.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_0.xlsx b/testBench/stress/curated/stress_level_0.xlsx
deleted file mode 100644
index 4a620f0..0000000
Binary files a/testBench/stress/curated/stress_level_0.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_1.xlsx b/testBench/stress/curated/stress_level_1.xlsx
deleted file mode 100644
index 76a7e01..0000000
Binary files a/testBench/stress/curated/stress_level_1.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_10.xlsx b/testBench/stress/curated/stress_level_10.xlsx
deleted file mode 100644
index 7578615..0000000
Binary files a/testBench/stress/curated/stress_level_10.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_11.xlsx b/testBench/stress/curated/stress_level_11.xlsx
deleted file mode 100644
index 72d5c8d..0000000
Binary files a/testBench/stress/curated/stress_level_11.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_12.xlsx b/testBench/stress/curated/stress_level_12.xlsx
deleted file mode 100644
index 56e10e4..0000000
Binary files a/testBench/stress/curated/stress_level_12.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_13.xlsx b/testBench/stress/curated/stress_level_13.xlsx
deleted file mode 100644
index 274c560..0000000
Binary files a/testBench/stress/curated/stress_level_13.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_14.xlsx b/testBench/stress/curated/stress_level_14.xlsx
deleted file mode 100644
index 7d69a4c..0000000
Binary files a/testBench/stress/curated/stress_level_14.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_15.xlsx b/testBench/stress/curated/stress_level_15.xlsx
deleted file mode 100644
index 50aa2a4..0000000
Binary files a/testBench/stress/curated/stress_level_15.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_16.xlsx b/testBench/stress/curated/stress_level_16.xlsx
deleted file mode 100644
index a22617a..0000000
Binary files a/testBench/stress/curated/stress_level_16.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_17.xlsx b/testBench/stress/curated/stress_level_17.xlsx
deleted file mode 100644
index 3e8fc4c..0000000
Binary files a/testBench/stress/curated/stress_level_17.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_18.xlsx b/testBench/stress/curated/stress_level_18.xlsx
deleted file mode 100644
index 56ae03b..0000000
Binary files a/testBench/stress/curated/stress_level_18.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_19.xlsx b/testBench/stress/curated/stress_level_19.xlsx
deleted file mode 100644
index 98c9f4a..0000000
Binary files a/testBench/stress/curated/stress_level_19.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_2.xlsx b/testBench/stress/curated/stress_level_2.xlsx
deleted file mode 100644
index 97fb325..0000000
Binary files a/testBench/stress/curated/stress_level_2.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_20.xlsx b/testBench/stress/curated/stress_level_20.xlsx
deleted file mode 100644
index 72154d7..0000000
Binary files a/testBench/stress/curated/stress_level_20.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_21.xlsx b/testBench/stress/curated/stress_level_21.xlsx
deleted file mode 100644
index 7df3bc8..0000000
Binary files a/testBench/stress/curated/stress_level_21.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_22.xlsx b/testBench/stress/curated/stress_level_22.xlsx
deleted file mode 100644
index 1dca4d7..0000000
Binary files a/testBench/stress/curated/stress_level_22.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_23.xlsx b/testBench/stress/curated/stress_level_23.xlsx
deleted file mode 100644
index 489bae5..0000000
Binary files a/testBench/stress/curated/stress_level_23.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_24.xlsx b/testBench/stress/curated/stress_level_24.xlsx
deleted file mode 100644
index 82f946e..0000000
Binary files a/testBench/stress/curated/stress_level_24.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_25.xlsx b/testBench/stress/curated/stress_level_25.xlsx
deleted file mode 100644
index 6ba2f67..0000000
Binary files a/testBench/stress/curated/stress_level_25.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_3.xlsx b/testBench/stress/curated/stress_level_3.xlsx
deleted file mode 100644
index e43c5d2..0000000
Binary files a/testBench/stress/curated/stress_level_3.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_4.xlsx b/testBench/stress/curated/stress_level_4.xlsx
deleted file mode 100644
index 0464f9d..0000000
Binary files a/testBench/stress/curated/stress_level_4.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_5.xlsx b/testBench/stress/curated/stress_level_5.xlsx
deleted file mode 100644
index f279818..0000000
Binary files a/testBench/stress/curated/stress_level_5.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_6.xlsx b/testBench/stress/curated/stress_level_6.xlsx
deleted file mode 100644
index e5b3f85..0000000
Binary files a/testBench/stress/curated/stress_level_6.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_7.xlsx b/testBench/stress/curated/stress_level_7.xlsx
deleted file mode 100644
index dff80f4..0000000
Binary files a/testBench/stress/curated/stress_level_7.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_8.xlsx b/testBench/stress/curated/stress_level_8.xlsx
deleted file mode 100644
index 780d0a3..0000000
Binary files a/testBench/stress/curated/stress_level_8.xlsx and /dev/null differ
diff --git a/testBench/stress/curated/stress_level_9.xlsx b/testBench/stress/curated/stress_level_9.xlsx
deleted file mode 100644
index a3a6650..0000000
Binary files a/testBench/stress/curated/stress_level_9.xlsx and /dev/null differ
diff --git a/testBench/stress/merges/merge_stress_across.xlsx b/testBench/stress/merges/merge_stress_across.xlsx
deleted file mode 100644
index 52db4d7..0000000
Binary files a/testBench/stress/merges/merge_stress_across.xlsx and /dev/null differ
diff --git a/testBench/stress/merges/merge_stress_dense_grid.xlsx b/testBench/stress/merges/merge_stress_dense_grid.xlsx
deleted file mode 100644
index 7c938bf..0000000
Binary files a/testBench/stress/merges/merge_stress_dense_grid.xlsx and /dev/null differ
diff --git a/testBench/stress/merges/merge_stress_empty_master.xlsx b/testBench/stress/merges/merge_stress_empty_master.xlsx
deleted file mode 100644
index 06713b0..0000000
Binary files a/testBench/stress/merges/merge_stress_empty_master.xlsx and /dev/null differ
diff --git a/testBench/stress/merges/merge_stress_table_header.xlsx b/testBench/stress/merges/merge_stress_table_header.xlsx
deleted file mode 100644
index 13d1092..0000000
Binary files a/testBench/stress/merges/merge_stress_table_header.xlsx and /dev/null differ
diff --git a/testBench/stress/merges/merge_stress_vertical.xlsx b/testBench/stress/merges/merge_stress_vertical.xlsx
deleted file mode 100644
index 1a44d8e..0000000
Binary files a/testBench/stress/merges/merge_stress_vertical.xlsx and /dev/null differ
diff --git a/tests/benchmarks/README.md b/tests/benchmarks/README.md
index 1cb056f..412102e 100644
--- a/tests/benchmarks/README.md
+++ b/tests/benchmarks/README.md
@@ -4,7 +4,7 @@ Two benchmarks, both reproducible:
| Benchmark | What it measures | Corpus | Cost |
|---|---|---|---|
-| `vs_hucre.py` (structural) | Parse-success rate + structural counts (cells, formulas, tables, merges, etc.) across many files | `testBench/` (53 curated) or `data/corpora/spreadsheetbench/` (5,458 real-world) | Cheap — 1–20 min |
+| `vs_hucre.py` (structural) | Parse-success rate + structural counts (cells, formulas, tables, merges, etc.) across many files | `data/corpora/spreadsheetbench/` (5,458 real-world) | Cheap — 1–20 min |
| `scripts/eval_retrieval.py` (chunk quality) | Recall@k for retrieving the relevant chunk given a natural-language instruction, + table-integrity fragmentation rate | SpreadsheetBench `dataset.json` (912 instruction + position pairs) | Medium — 10 min on 100 instances |
## 1. Structural benchmark — `vs_hucre.py`
@@ -18,9 +18,9 @@ Long-running NDJSON-protocol workers, per-file timeout, batch respawn, randomize
Supported parsers today: `ks` (ks-xlsx-parser), `hucre` (TypeScript, requires `pnpm install` under `hucre_node/`), `docling` (IBM Docling — `uv pip install docling`).
```bash
-# Quick smoke (50 random files from testBench)
+# Quick smoke (50 random files from SpreadsheetBench)
PYTHONPATH=src uv run python -m tests.benchmarks.vs_hucre \
- --corpus testBench --sample 50 --parsers ks
+ --corpus data/corpora/spreadsheetbench --sample 50 --parsers ks
# Robustness on full SpreadsheetBench (5,458 files, ~20 min)
PYTHONPATH=src uv run python -m tests.benchmarks.vs_hucre \
diff --git a/tests/benchmarks/__init__.py b/tests/benchmarks/__init__.py
index 4e77399..4558721 100644
--- a/tests/benchmarks/__init__.py
+++ b/tests/benchmarks/__init__.py
@@ -2,11 +2,11 @@
Local-only benchmark harness. Not part of the public test suite.
Runs `ks-xlsx-parser` head-to-head against external parsers (currently `hucre`,
-a TypeScript zero-dependency spreadsheet I/O library) across the `testBench/`
-corpus and produces per-file perf + feature-coverage records.
+a TypeScript zero-dependency spreadsheet I/O library) across the
+SpreadsheetBench corpus and produces per-file perf + feature-coverage records.
Not committed by default — reports and node_modules are git-ignored. Invoke
-via `python -m tests.benchmarks.vs_hucre --corpus testBench`.
+via `python -m tests.benchmarks.vs_hucre --corpus data/corpora/spreadsheetbench`.
Pitfalls this harness is designed to avoid (read before editing):
diff --git a/tests/benchmarks/_driver.py b/tests/benchmarks/_driver.py
index dad231d..a66acd5 100644
--- a/tests/benchmarks/_driver.py
+++ b/tests/benchmarks/_driver.py
@@ -257,13 +257,7 @@ def generate_summary(out_dir: Path) -> None:
continue
try:
rel = Path(r["file"]).resolve()
- # Find segment after 'testBench/' or use file's parent name.
- parts = rel.parts
- if "testBench" in parts:
- idx = parts.index("testBench")
- sub = "/".join(parts[idx + 1: idx + 3]) if idx + 2 < len(parts) else parts[idx + 1]
- else:
- sub = rel.parent.name
+ sub = rel.parent.name
except Exception: # noqa: BLE001
sub = "?"
by_sub[(r["parser"], sub)].append(r["parse_time_ms"])
diff --git a/tests/benchmarks/vs_hucre.py b/tests/benchmarks/vs_hucre.py
index 24e0e50..44ca561 100644
--- a/tests/benchmarks/vs_hucre.py
+++ b/tests/benchmarks/vs_hucre.py
@@ -4,7 +4,7 @@
Usage (from repo root, with venv active):
python -m tests.benchmarks.vs_hucre \\
- --corpus testBench \\
+ --corpus data/corpora/spreadsheetbench \\
--out tests/benchmarks/reports \\
[--subset real_world,enterprise] \\
[--sample 50] \\
@@ -33,7 +33,7 @@
def main(argv: list[str] | None = None) -> int:
parser = argparse.ArgumentParser(description=__doc__.splitlines()[1] if __doc__ else "")
- parser.add_argument("--corpus", type=Path, default=Path("testBench"),
+ parser.add_argument("--corpus", type=Path, default=Path("data/corpora/spreadsheetbench"),
help="Corpus directory containing .xlsx/.xlsm files.")
parser.add_argument("--out", type=Path, default=Path("tests/benchmarks/reports"),
help="Root directory for reports; a timestamped subdir is created.")
diff --git a/tests/conftest.py b/tests/conftest.py
index 85d21a0..d422b9b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -22,14 +22,9 @@
from openpyxl.worksheet.table import Table, TableStyleInfo
# ---------------------------------------------------------------------------
-# All-xlsx-files collection for cross-validation and invariant tests
+# Programmatic fixture collection for cross-validation and invariant tests
# ---------------------------------------------------------------------------
-_PROJECT_ROOT = Path(__file__).parent.parent
-_TESTBENCH_DIR = _PROJECT_ROOT / "testBench"
-_EXAMPLES_DIR = _TESTBENCH_DIR / "real_world"
-_DATASETS_DIR = _TESTBENCH_DIR / "github_datasets"
-
# Names of conftest fixtures that produce .xlsx files
PROGRAMMATIC_FIXTURE_NAMES = [
"simple_workbook",
@@ -69,33 +64,12 @@
]
-def collect_static_xlsx_files() -> list[Path]:
- """Collect all static .xlsx files from examples and github_datasets."""
- files = []
- for d in [_EXAMPLES_DIR, _DATASETS_DIR]:
- if d.exists():
- files.extend(sorted(d.glob("*.xlsx")))
- return files
-
-
-STATIC_XLSX_FILES = collect_static_xlsx_files()
-
-
@pytest.fixture(params=PROGRAMMATIC_FIXTURE_NAMES)
def programmatic_xlsx(request, tmp_dir) -> Path:
"""Yields each programmatic fixture as a Path (re-uses other fixtures)."""
return request.getfixturevalue(request.param)
-@pytest.fixture(
- params=STATIC_XLSX_FILES,
- ids=[f.stem for f in STATIC_XLSX_FILES],
-)
-def static_xlsx(request) -> Path:
- """Yields each static .xlsx file path."""
- return request.param
-
-
@pytest.fixture
def tmp_dir():
"""Provide a temporary directory for test workbooks."""
diff --git a/tests/test_cross_validation.py b/tests/test_cross_validation.py
deleted file mode 100644
index 94f73f1..0000000
--- a/tests/test_cross_validation.py
+++ /dev/null
@@ -1,334 +0,0 @@
-"""
-Cross-validation tests comparing parser output against python-calamine.
-
-Calamine is a Rust-based Excel reader, completely independent from openpyxl.
-These tests verify that our parser reads the same data that calamine does.
-"""
-
-
-
-import datetime
-
-import pytest
-
-from pipeline import parse_workbook
-
-from tests.helpers.calamine_reader import CalamineResult
-from tests.helpers.value_comparator import Mismatch, compare_cell_value, values_match
-
-
-# ---------------------------------------------------------------------------
-# Cross-validation on programmatic fixtures
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.crossval
-class TestSheetNamesCrossVal:
- """Verify sheet names match between parser and calamine."""
-
- def test_sheet_names_match(self, programmatic_xlsx):
- parser_result = parse_workbook(path=programmatic_xlsx)
- calamine = CalamineResult.from_path(programmatic_xlsx)
-
- parser_names = [s.sheet_name for s in parser_result.workbook.sheets]
- assert parser_names == calamine.sheet_names, (
- f"Sheet names differ:\n parser: {parser_names}\n"
- f" calamine: {calamine.sheet_names}"
- )
-
- def test_sheet_count_match(self, programmatic_xlsx):
- parser_result = parse_workbook(path=programmatic_xlsx)
- calamine = CalamineResult.from_path(programmatic_xlsx)
- assert len(parser_result.workbook.sheets) == len(calamine.sheet_names)
-
-
-@pytest.mark.crossval
-class TestCellValuesCrossVal:
- """Verify cell values match between parser and calamine."""
-
- def test_non_formula_values_match(self, programmatic_xlsx):
- parser_result = parse_workbook(path=programmatic_xlsx)
- calamine = CalamineResult.from_path(programmatic_xlsx)
- mismatches = _collect_mismatches(parser_result, calamine, formula_cells=False)
- assert len(mismatches) == 0, (
- f"{len(mismatches)} non-formula value mismatches:\n"
- + _format_mismatches(mismatches[:10])
- )
-
- def test_formula_computed_values_match(self, programmatic_xlsx):
- """For formula cells with cached values, parser's formula_value should
- match calamine's computed value. Programmatic fixtures often have no
- cached values, so we use a lenient threshold."""
- parser_result = parse_workbook(path=programmatic_xlsx)
- calamine = CalamineResult.from_path(programmatic_xlsx)
- mismatches = _collect_mismatches(parser_result, calamine, formula_cells=True)
-
- total_formulas = sum(
- 1 for s in parser_result.workbook.sheets
- for c in s.cells.values()
- if c.formula
- )
- # Allow up to 100% mismatch for programmatic fixtures (no cached values)
- # This test is more meaningful for real-world files
- if total_formulas > 0 and len(mismatches) > 0:
- rate = len(mismatches) / total_formulas
- # Only fail if we have actual cached values but they don't match
- hard_mismatches = [
- m for m in mismatches
- if m.parser_value is not None and m.calamine_value is not None
- ]
- assert len(hard_mismatches) == 0, (
- f"{len(hard_mismatches)} formula value mismatches "
- f"(with cached values):\n"
- + _format_mismatches(hard_mismatches[:10])
- )
-
-
-@pytest.mark.crossval
-class TestDimensionsCrossVal:
- """Verify dimensions roughly match between parser and calamine."""
-
- def test_row_count_similar(self, programmatic_xlsx):
- parser_result = parse_workbook(path=programmatic_xlsx)
- calamine = CalamineResult.from_path(programmatic_xlsx)
-
- for sheet in parser_result.workbook.sheets:
- cal_sheet = calamine.sheets.get(sheet.sheet_name)
- if not cal_sheet or not sheet.used_range:
- continue
- parser_rows = sheet.used_range.row_count()
- # calamine total_height is the total row count of the sheet
- # For comparison, use the data area (start/end)
- if cal_sheet.start is not None and cal_sheet.end is not None:
- cal_rows = cal_sheet.end[0] - cal_sheet.start[0] + 1
- # Allow ±2 row difference (calamine may include trailing empty rows)
- assert abs(parser_rows - cal_rows) <= 2, (
- f"Sheet '{sheet.sheet_name}' row count: "
- f"parser={parser_rows}, calamine={cal_rows}"
- )
-
- def test_column_count_similar(self, programmatic_xlsx):
- parser_result = parse_workbook(path=programmatic_xlsx)
- calamine = CalamineResult.from_path(programmatic_xlsx)
-
- for sheet in parser_result.workbook.sheets:
- cal_sheet = calamine.sheets.get(sheet.sheet_name)
- if not cal_sheet or not sheet.used_range:
- continue
- parser_cols = sheet.used_range.col_count()
- if cal_sheet.start is not None and cal_sheet.end is not None:
- cal_cols = cal_sheet.end[1] - cal_sheet.start[1] + 1
- assert abs(parser_cols - cal_cols) <= 2, (
- f"Sheet '{sheet.sheet_name}' col count: "
- f"parser={parser_cols}, calamine={cal_cols}"
- )
-
-
-@pytest.mark.crossval
-class TestMergedRegionsCrossVal:
- """Verify merged regions match between parser and calamine."""
-
- def test_merged_region_count(self, programmatic_xlsx):
- parser_result = parse_workbook(path=programmatic_xlsx)
- calamine = CalamineResult.from_path(programmatic_xlsx)
-
- for sheet in parser_result.workbook.sheets:
- cal_sheet = calamine.sheets.get(sheet.sheet_name)
- if not cal_sheet or cal_sheet.merged_ranges is None:
- continue
- parser_count = len(sheet.merged_regions)
- cal_count = len(cal_sheet.merged_ranges)
- assert parser_count == cal_count, (
- f"Sheet '{sheet.sheet_name}' merge count: "
- f"parser={parser_count}, calamine={cal_count}"
- )
-
- def test_merged_region_ranges(self, programmatic_xlsx):
- parser_result = parse_workbook(path=programmatic_xlsx)
- calamine = CalamineResult.from_path(programmatic_xlsx)
-
- for sheet in parser_result.workbook.sheets:
- cal_sheet = calamine.sheets.get(sheet.sheet_name)
- if not cal_sheet or cal_sheet.merged_ranges is None:
- continue
- # Convert calamine ranges to comparable format
- # calamine: ((start_row, start_col), (end_row, end_col)) 0-indexed
- cal_ranges = set()
- for (sr, sc), (er, ec) in cal_sheet.merged_ranges:
- cal_ranges.add((sr + 1, sc + 1, er + 1, ec + 1))
-
- parser_ranges = set()
- for region in sheet.merged_regions:
- parser_ranges.add((
- region.range.top_left.row,
- region.range.top_left.col,
- region.range.bottom_right.row,
- region.range.bottom_right.col,
- ))
-
- assert parser_ranges == cal_ranges, (
- f"Sheet '{sheet.sheet_name}' merge ranges differ:\n"
- f" parser: {sorted(parser_ranges)}\n"
- f" calamine: {sorted(cal_ranges)}"
- )
-
-
-@pytest.mark.crossval
-class TestMismatchRateCrossVal:
- """Overall mismatch rate must be below threshold."""
-
- def test_overall_mismatch_rate(self, programmatic_xlsx):
- parser_result = parse_workbook(path=programmatic_xlsx)
- calamine = CalamineResult.from_path(programmatic_xlsx)
- mismatches = _collect_mismatches(
- parser_result, calamine, formula_cells=False
- )
- total_cells = sum(
- s.cell_count() for s in parser_result.workbook.sheets
- )
- if total_cells > 0:
- rate = len(mismatches) / total_cells
- assert rate < 0.01, (
- f"Mismatch rate {rate:.1%} ({len(mismatches)}/{total_cells}) "
- f"exceeds 1% threshold"
- )
-
-
-# ---------------------------------------------------------------------------
-# Cross-validation on static files (examples + github datasets)
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.crossval
-class TestSheetNamesStatic:
- def test_sheet_names_match(self, static_xlsx):
- parser_result = parse_workbook(path=static_xlsx)
- calamine = CalamineResult.from_path(static_xlsx)
- parser_names = [s.sheet_name for s in parser_result.workbook.sheets]
- assert parser_names == calamine.sheet_names
-
-
-@pytest.mark.crossval
-class TestCellValuesStatic:
- def test_non_formula_values_match(self, static_xlsx):
- parser_result = parse_workbook(path=static_xlsx)
- calamine = CalamineResult.from_path(static_xlsx)
- mismatches = _collect_mismatches(parser_result, calamine, formula_cells=False)
- total_cells = sum(s.cell_count() for s in parser_result.workbook.sheets)
- if total_cells > 0:
- rate = len(mismatches) / total_cells
- assert rate < 0.01, (
- f"{static_xlsx.name}: {len(mismatches)}/{total_cells} "
- f"({rate:.1%}) mismatches:\n"
- + _format_mismatches(mismatches[:10])
- )
-
- def test_formula_cached_values_match(self, static_xlsx):
- """For real-world files, formula cached values should match calamine.
-
- Threshold: <5% mismatch overall. A handful of files with highly nested
- dynamic-array or volatile formulas are known to exceed this because
- openpyxl doesn't always surface the latest cached value Excel wrote —
- we allow up to 15% for those, tracked in docs/PARSER_KNOWN_ISSUES.md.
- """
- known_loose_files = {
- "Walbridge Coatings 8.9.23.xlsx", # openpyxl cached-value gap
- }
- threshold = 0.15 if static_xlsx.name in known_loose_files else 0.05
-
- parser_result = parse_workbook(path=static_xlsx)
- calamine = CalamineResult.from_path(static_xlsx)
- mismatches = _collect_mismatches(parser_result, calamine, formula_cells=True)
- hard_mismatches = [
- m for m in mismatches
- if m.parser_value is not None and m.calamine_value is not None
- ]
- total_formulas = sum(
- 1 for s in parser_result.workbook.sheets
- for c in s.cells.values()
- if c.formula
- )
- if total_formulas > 0 and len(hard_mismatches) > 0:
- rate = len(hard_mismatches) / total_formulas
- assert rate < threshold, (
- f"{static_xlsx.name}: {len(hard_mismatches)}/{total_formulas} "
- f"formula mismatches ({rate:.1%}, threshold {threshold:.0%}):\n"
- + _format_mismatches(hard_mismatches[:10])
- )
-
-
-@pytest.mark.crossval
-class TestDimensionsStatic:
- def test_dimensions_similar(self, static_xlsx):
- parser_result = parse_workbook(path=static_xlsx)
- calamine = CalamineResult.from_path(static_xlsx)
- for sheet in parser_result.workbook.sheets:
- cal_sheet = calamine.sheets.get(sheet.sheet_name)
- if not cal_sheet or not sheet.used_range:
- continue
- if cal_sheet.start is not None and cal_sheet.end is not None:
- parser_rows = sheet.used_range.row_count()
- cal_rows = cal_sheet.end[0] - cal_sheet.start[0] + 1
- # Allow ±5 for real-world files (empty trailing rows)
- assert abs(parser_rows - cal_rows) <= 5, (
- f"{static_xlsx.name} sheet '{sheet.sheet_name}' rows: "
- f"parser={parser_rows}, calamine={cal_rows}"
- )
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-
-def _collect_mismatches(
- parser_result,
- calamine: CalamineResult,
- formula_cells: bool,
-) -> list[Mismatch]:
- """Collect all mismatches between parser and calamine."""
- mismatches = []
- for sheet in parser_result.workbook.sheets:
- cal_sheet = calamine.sheets.get(sheet.sheet_name)
- if not cal_sheet:
- continue
-
- for cell in sheet.cells.values():
- # Filter by formula/non-formula
- if formula_cells and not cell.formula:
- continue
- if not formula_cells and cell.formula:
- continue
-
- # Skip merged slaves
- if cell.is_merged_slave:
- continue
-
- cal_val = cal_sheet.get_value(cell.coord.row, cell.coord.col)
-
- if not compare_cell_value(cell, cal_val):
- parser_val = (
- cell.formula_value if cell.formula else cell.raw_value
- )
- mismatches.append(Mismatch(
- sheet=sheet.sheet_name,
- row=cell.coord.row,
- col=cell.coord.col,
- a1_ref=cell.a1_ref,
- parser_value=parser_val,
- calamine_value=cal_val,
- category="formula" if cell.formula else "value",
- ))
-
- return mismatches
-
-
-def _format_mismatches(mismatches: list[Mismatch]) -> str:
- """Format mismatch list for error messages."""
- lines = []
- for m in mismatches:
- lines.append(
- f" {m.a1_ref}: parser={m.parser_value!r} ({type(m.parser_value).__name__}) "
- f"vs calamine={m.calamine_value!r} ({type(m.calamine_value).__name__})"
- )
- return "\n".join(lines)
diff --git a/tests/test_enterprise_scoring.py b/tests/test_enterprise_scoring.py
deleted file mode 100644
index 55b5cb4..0000000
--- a/tests/test_enterprise_scoring.py
+++ /dev/null
@@ -1,149 +0,0 @@
-"""Enterprise-focused scoring of parser output on synthetic fixtures.
-
-These tests provide lightweight, deterministic benchmarks that run without
-network access. They exercise formulas, tables, cross-sheet references,
-named ranges, hidden sheets, and simple calculation lineage.
-"""
-
-
-
-import json
-from pathlib import Path
-
-import pytest
-
-from ks_xlsx_parser import parse_workbook
-
-from scripts.generate_enterprise_fixtures import generate_all
-
-
-ROOT = Path(__file__).resolve().parents[1]
-FIXTURE_DIR = ROOT / "testBench" / "enterprise"
-
-
-@pytest.fixture(scope="session")
-def enterprise_workbooks() -> list[Path]:
- """Generate (or reuse) enterprise fixtures and return their paths."""
- return generate_all()
-
-
-class EnterpriseScorecard:
- def __init__(self, parse_result, expected_metadata=None):
- self.result = parse_result
- self.expected = expected_metadata or {}
-
- def formula_fidelity(self) -> float:
- workbook = self.result.workbook
- extracted = 0
- total = 0
- for sheet in workbook.sheets:
- for cell in sheet.cells.values():
- if cell.formula:
- total += 1
- if cell.formula_value is not None or cell.raw_value is not None:
- extracted += 1
- return extracted / total if total else 0.0
-
- def table_detection_f1(self) -> float:
- detected = len(self.result.workbook.tables)
- expected = self.expected.get("expected_tables", detected)
- if expected == 0 and detected == 0:
- return 1.0
- precision = detected / max(detected, 1)
- recall = detected / max(expected, 1)
- return 2 * (precision * recall) / (precision + recall + 1e-10)
-
- def lineage_accuracy(self) -> float:
- graph = self.result.workbook.dependency_graph
- edges = len(graph.edges)
- cycles = 0 # DependencyGraph does not expose cycles directly
- accuracy = 1.0 - (cycles / (edges + 1)) * 0.1
- return max(accuracy, 0.0)
-
- def chunk_quality(self) -> float:
- chunks = self.result.chunks
- tokens = [c.token_count for c in chunks]
- if not tokens:
- return 0.0
- mean_tokens = sum(tokens) / len(tokens)
- variance = sum((t - mean_tokens) ** 2 for t in tokens) / len(tokens)
- std_dev = variance ** 0.5
- cv = std_dev / (mean_tokens + 1e-10)
- return max(1.0 - cv, 0.0)
-
- def layout_recovery(self) -> float:
- blocks_by_type = {}
- for chunk in self.result.chunks:
- blocks_by_type[chunk.block_type] = blocks_by_type.get(chunk.block_type, 0) + 1
- type_count = len(blocks_by_type)
- return min(type_count / 3.0, 1.0)
-
- def composite_score(self):
- weights = {
- "formula_fidelity": 0.25,
- "table_detection": 0.20,
- "lineage_accuracy": 0.20,
- "chunk_quality": 0.20,
- "layout_recovery": 0.15,
- }
- scores = {
- "formula_fidelity": self.formula_fidelity(),
- "table_detection": self.table_detection_f1(),
- "lineage_accuracy": self.lineage_accuracy(),
- "chunk_quality": self.chunk_quality(),
- "layout_recovery": self.layout_recovery(),
- }
- composite = sum(scores[k] * weights[k] for k in weights)
- return scores, composite
-
- def metrics(self):
- scores, composite = self.composite_score()
- scores["composite"] = composite
- return scores
-
-
-@pytest.mark.enterprise
-@pytest.mark.parametrize(
- "filename,expected",
- [
- ("financial_model.xlsx", {"expected_tables": 0, "expected_formulas": 2}),
- ("inventory_tracker.xlsx", {"expected_tables": 0, "expected_formulas": 100}),
- ("forecast_model.xlsx", {"expected_tables": 0, "expected_formulas": 24}),
- ("operations_tracker.xlsx", {"expected_tables": 0, "expected_formulas": 20}),
- ],
-)
-def test_enterprise_scorecard(enterprise_workbooks, filename, expected):
- path = FIXTURE_DIR / filename
- assert path.exists(), f"Fixture missing: {path}"
-
- result = parse_workbook(path=path)
- scorecard = EnterpriseScorecard(result, expected_metadata=expected)
- scores, composite = scorecard.composite_score()
-
- metrics_dir = ROOT / "metrics" / "corpus"
- metrics_dir.mkdir(parents=True, exist_ok=True)
- with open(metrics_dir / f"{path.stem}_scorecard.json", "w") as f:
- json.dump(scorecard.metrics(), f, indent=2)
-
- print(scorecard.metrics())
- assert composite >= 0.45, f"Composite {composite:.2%} too low for {filename}"
-
-
-@pytest.mark.enterprise
-def test_enterprise_summary(enterprise_workbooks):
- paths = enterprise_workbooks
- results = []
- for p in paths:
- result = parse_workbook(path=p)
- scorecard = EnterpriseScorecard(result)
- scores = scorecard.metrics()
- scores["file"] = p.name
- results.append(scores)
-
- metrics_dir = ROOT / "metrics"
- metrics_dir.mkdir(parents=True, exist_ok=True)
- summary_path = metrics_dir / "corpus_summary.json"
- with open(summary_path, "w") as f:
- json.dump({"files": results}, f, indent=2)
-
- assert len(results) == len(paths)
diff --git a/tests/test_real_world_datasets.py b/tests/test_real_world_datasets.py
deleted file mode 100644
index d10905a..0000000
--- a/tests/test_real_world_datasets.py
+++ /dev/null
@@ -1,433 +0,0 @@
-"""
-Tests against real-world Excel datasets from GitHub.
-
-Source: https://github.com/rohanmistry231/Practice-Datasets-for-Excel
-
-Validates that the parser produces correct, complete JSON output for
-a variety of public datasets covering different shapes, sizes, and
-content types (numeric, text, dates, mixed).
-"""
-
-
-
-import json
-from pathlib import Path
-
-import pytest
-
-from chunking.segmenter import LayoutSegmenter
-from models import BlockType
-from parsers import WorkbookParser
-from pipeline import parse_workbook
-from storage.serializer import WorkbookSerializer
-
-
-FIXTURES_DIR = Path(__file__).parent.parent / "testBench" / "github_datasets"
-
-# Each entry: (filename, expected_sheets, expected_min_rows, expected_header_sample)
-DATASET_CATALOG = [
- ("iris.xlsx", 1, 150, ["sepal_length", "sepal_width", "petal_length"]),
- ("titanic.xlsx", 1, 891, ["PassengerId", "Survived", "Pclass"]),
- ("boston.xlsx", 1, 506, ["CRIM", "ZN", "INDUS"]),
- ("world_happiness_2019.xlsx", 1, 156, ["Overall rank", "Country or region", "Score"]),
- ("bestsellers.xlsx", 1, 550, ["Name", "Author", "User Rating"]),
- ("superstore.xlsx", 3, 1952, ["Row ID", "Order Priority", "Discount"]),
- ("worldcups.xlsx", 1, 20, ["Year", "Country", "Winner"]),
- ("breast_cancer.xlsx", 1, 569, ["id", "diagnosis", "radius_mean"]),
- ("apple_stock.xlsx", 1, 10016, ["Date", "Open", "High"]),
- ("winequality_red.xlsx", 1, 1599, None), # semicolon-separated header, skip header check
-]
-
-
-def _fixture_path(name: str) -> Path:
- return FIXTURES_DIR / name
-
-
-# ---------------------------------------------------------------------------
-# Parametrized: every dataset parses without error
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.parametrize(
- "filename,expected_sheets,expected_min_rows,expected_headers",
- DATASET_CATALOG,
- ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG],
-)
-class TestDatasetParsing:
- """Core parsing validation across all datasets."""
-
- def test_parses_without_error(self, filename, expected_sheets, expected_min_rows, expected_headers):
- """Parser completes without raising an exception."""
- result = parse_workbook(path=_fixture_path(filename))
- assert result.workbook is not None
-
- def test_correct_sheet_count(self, filename, expected_sheets, expected_min_rows, expected_headers):
- """Workbook has the expected number of sheets."""
- result = parse_workbook(path=_fixture_path(filename))
- assert len(result.workbook.sheets) == expected_sheets
-
- def test_minimum_data_rows(self, filename, expected_sheets, expected_min_rows, expected_headers):
- """First sheet has at least the expected number of data rows."""
- result = parse_workbook(path=_fixture_path(filename))
- sheet = result.workbook.sheets[0]
- if sheet.used_range:
- data_rows = sheet.used_range.row_count() - 1 # minus header row
- assert data_rows >= expected_min_rows
-
- def test_headers_detected(self, filename, expected_sheets, expected_min_rows, expected_headers):
- """First row contains the expected column headers."""
- if expected_headers is None:
- pytest.skip("Header check skipped for this dataset")
- result = parse_workbook(path=_fixture_path(filename))
- sheet = result.workbook.sheets[0]
- first_row = sheet.used_range.top_left.row
- actual_headers = []
- for col in range(sheet.used_range.top_left.col, sheet.used_range.bottom_right.col + 1):
- cell = sheet.get_cell(first_row, col)
- if cell and cell.raw_value is not None:
- actual_headers.append(str(cell.raw_value))
- for expected in expected_headers:
- assert expected in actual_headers, (
- f"Expected header '{expected}' not found in {actual_headers[:10]}"
- )
-
- def test_produces_chunks(self, filename, expected_sheets, expected_min_rows, expected_headers):
- """Pipeline produces at least one chunk per sheet."""
- result = parse_workbook(path=_fixture_path(filename))
- assert result.total_chunks >= expected_sheets
-
-
-# ---------------------------------------------------------------------------
-# JSON serialization
-# ---------------------------------------------------------------------------
-
-
-class TestJsonSerialization:
- """Verify JSON output is valid, complete, and contains expected fields."""
-
- @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG],
- ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG])
- def test_to_json_valid(self, filename):
- """to_json() returns a dict that round-trips through json.dumps/loads."""
- result = parse_workbook(path=_fixture_path(filename))
- data = result.to_json()
- json_str = json.dumps(data)
- roundtripped = json.loads(json_str)
- assert roundtripped["total_chunks"] == result.total_chunks
-
- @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG],
- ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG])
- def test_to_json_has_required_keys(self, filename):
- """JSON output contains all required top-level keys."""
- result = parse_workbook(path=_fixture_path(filename))
- data = result.to_json()
- assert "workbook" in data
- assert "chunks" in data
- assert "total_chunks" in data
- assert "total_tokens" in data
-
- @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG],
- ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG])
- def test_workbook_metadata_in_json(self, filename):
- """Workbook section has all required metadata fields."""
- result = parse_workbook(path=_fixture_path(filename))
- wb_json = result.to_json()["workbook"]
- assert wb_json["workbook_id"]
- assert wb_json["filename"]
- assert wb_json["workbook_hash"]
- assert isinstance(wb_json["total_sheets"], int)
- assert isinstance(wb_json["total_cells"], int)
- assert isinstance(wb_json["errors"], list)
-
- @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG],
- ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG])
- def test_chunk_json_has_required_keys(self, filename):
- """Each chunk in JSON has all required fields."""
- result = parse_workbook(path=_fixture_path(filename))
- for chunk in result.to_json()["chunks"]:
- assert "chunk_id" in chunk
- assert "source_uri" in chunk
- assert "sheet_name" in chunk
- assert "block_type" in chunk
- assert "top_left" in chunk
- assert "bottom_right" in chunk
- assert "render_text" in chunk
- assert chunk["render_text"] # not empty
-
- @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG],
- ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG])
- def test_chunk_render_text_contains_data(self, filename):
- """Rendered text in chunks contains actual cell data, not just structure."""
- result = parse_workbook(path=_fixture_path(filename))
- sheet = result.workbook.sheets[0]
- # Get a data value from the sheet (short values to avoid semicolon-delimited lines)
- if sheet.used_range:
- first_data_row = sheet.used_range.top_left.row + 1
- for col in range(sheet.used_range.top_left.col, sheet.used_range.bottom_right.col + 1):
- cell = sheet.get_cell(first_data_row, col)
- if cell and cell.display_value and 2 < len(str(cell.display_value)) <= 30:
- # At least one chunk should contain this value
- found = any(
- str(cell.display_value) in c.render_text
- for c in result.chunks
- )
- assert found, f"Value '{cell.display_value}' not found in any chunk render_text"
- return
- pytest.skip("No suitable data value found to check")
-
-
-# ---------------------------------------------------------------------------
-# Serializer records (Postgres-ready)
-# ---------------------------------------------------------------------------
-
-
-class TestSerializerRecords:
- """Verify WorkbookSerializer produces valid storage records."""
-
- @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG],
- ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG])
- def test_workbook_record(self, filename):
- """Workbook record has all required fields for Postgres."""
- result = parse_workbook(path=_fixture_path(filename))
- serializer = WorkbookSerializer(result.workbook, result.chunks)
- rec = serializer.to_workbook_record()
- assert rec["id"]
- assert rec["file_hash"]
- assert rec["filename"]
- assert isinstance(rec["total_sheets"], int)
- assert isinstance(rec["total_cells"], int)
- # Ensure JSON-serializable
- json.dumps(rec)
-
- @pytest.mark.parametrize(
- "filename,expected_sheets",
- [(d[0], d[1]) for d in DATASET_CATALOG],
- ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG],
- )
- def test_sheet_records_count(self, filename, expected_sheets):
- """Correct number of sheet records produced."""
- result = parse_workbook(path=_fixture_path(filename))
- serializer = WorkbookSerializer(result.workbook, result.chunks)
- sheets = serializer.to_sheet_records()
- assert len(sheets) == expected_sheets
- for s in sheets:
- assert s["sheet_name"]
- assert s["workbook_id"]
- json.dumps(s)
-
- @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG],
- ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG])
- def test_chunk_records(self, filename):
- """Chunk records are valid and JSON-serializable."""
- result = parse_workbook(path=_fixture_path(filename))
- serializer = WorkbookSerializer(result.workbook, result.chunks)
- chunks = serializer.to_chunk_records()
- assert len(chunks) >= 1
- for c in chunks:
- assert c["id"]
- assert c["sheet_name"]
- assert c["block_type"]
- assert c["render_text"]
- json.dumps(c)
-
- @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG],
- ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG])
- def test_vector_store_entries(self, filename):
- """Vector store entries have text and metadata for embedding."""
- result = parse_workbook(path=_fixture_path(filename))
- serializer = WorkbookSerializer(result.workbook, result.chunks)
- entries = serializer.to_vector_store_entries()
- assert len(entries) >= 1
- for e in entries:
- assert e["id"]
- assert e["text"]
- assert e["metadata"]["workbook_hash"]
- assert e["metadata"]["sheet_name"]
- assert e["metadata"]["source_uri"]
- json.dumps(e)
-
-
-# ---------------------------------------------------------------------------
-# Layout detection on real data
-# ---------------------------------------------------------------------------
-
-
-class TestRealWorldLayout:
- """Verify layout segmentation works correctly on real datasets."""
-
- @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG],
- ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG])
- def test_blocks_have_valid_ranges(self, filename):
- """All detected blocks have non-degenerate cell ranges."""
- result = WorkbookParser(path=_fixture_path(filename)).parse()
- for sheet in result.sheets:
- tables = [t for t in result.tables if t.sheet_name == sheet.sheet_name]
- segmenter = LayoutSegmenter(sheet, tables=tables)
- blocks = segmenter.segment()
- for block in blocks:
- assert block.cell_range is not None
- assert block.cell_range.row_count() >= 1
- assert block.cell_range.col_count() >= 1
- assert block.cell_count > 0
-
- @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG],
- ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG])
- def test_blocks_have_valid_types(self, filename):
- """All block types are valid BlockType enum values."""
- result = WorkbookParser(path=_fixture_path(filename)).parse()
- for sheet in result.sheets:
- tables = [t for t in result.tables if t.sheet_name == sheet.sheet_name]
- segmenter = LayoutSegmenter(sheet, tables=tables)
- blocks = segmenter.segment()
- valid_types = set(BlockType)
- for block in blocks:
- assert block.block_type in valid_types
-
- def test_superstore_multi_sheet_layout(self):
- """SuperStore has 3 sheets, each producing at least one block."""
- result = WorkbookParser(path=_fixture_path("superstore.xlsx")).parse()
- assert len(result.sheets) == 3
- for sheet in result.sheets:
- tables = [t for t in result.tables if t.sheet_name == sheet.sheet_name]
- segmenter = LayoutSegmenter(sheet, tables=tables)
- blocks = segmenter.segment()
- assert len(blocks) >= 1, f"Sheet '{sheet.sheet_name}' has no blocks"
-
- def test_world_happiness_has_table(self):
- """World Happiness dataset has an Excel ListObject table."""
- result = WorkbookParser(path=_fixture_path("world_happiness_2019.xlsx")).parse()
- assert len(result.tables) >= 1
- table = result.tables[0]
- assert table.table_name
- assert table.ref_range is not None
-
-
-# ---------------------------------------------------------------------------
-# Determinism on real data
-# ---------------------------------------------------------------------------
-
-
-class TestRealWorldDeterminism:
- """Parsing the same file twice produces identical output."""
-
- @pytest.mark.parametrize("filename", ["iris.xlsx", "worldcups.xlsx", "bestsellers.xlsx"],
- ids=["iris", "worldcups", "bestsellers"])
- def test_deterministic_json(self, filename):
- """Two parses of the same file produce identical JSON (excluding timing)."""
- r1 = parse_workbook(path=_fixture_path(filename))
- r2 = parse_workbook(path=_fixture_path(filename))
- j1 = r1.to_json()
- j2 = r2.to_json()
- # parse_duration_ms varies between runs; exclude from comparison
- j1["workbook"]["parse_duration_ms"] = 0
- j2["workbook"]["parse_duration_ms"] = 0
- assert json.dumps(j1, sort_keys=True) == json.dumps(j2, sort_keys=True)
-
- @pytest.mark.parametrize("filename", ["iris.xlsx", "worldcups.xlsx", "bestsellers.xlsx"],
- ids=["iris", "worldcups", "bestsellers"])
- def test_deterministic_hashes(self, filename):
- """Chunk IDs and content hashes are stable across runs."""
- r1 = parse_workbook(path=_fixture_path(filename))
- r2 = parse_workbook(path=_fixture_path(filename))
- assert r1.total_chunks == r2.total_chunks
- for c1, c2 in zip(r1.chunks, r2.chunks):
- assert c1.chunk_id == c2.chunk_id
- assert c1.content_hash == c2.content_hash
-
-
-# ---------------------------------------------------------------------------
-# Specific dataset content validation
-# ---------------------------------------------------------------------------
-
-
-class TestDatasetContent:
- """Spot-check specific known values in well-known datasets."""
-
- def test_iris_species_values(self):
- """Iris dataset contains known species names."""
- result = parse_workbook(path=_fixture_path("iris.xlsx"))
- sheet = result.workbook.sheets[0]
- species_col = None
- # Find the species column
- for col in range(1, 20):
- cell = sheet.get_cell(1, col)
- if cell and cell.raw_value == "species":
- species_col = col
- break
- assert species_col is not None, "species column not found"
- # Check known species
- species_values = set()
- for row in range(2, 152):
- cell = sheet.get_cell(row, species_col)
- if cell and cell.raw_value:
- species_values.add(cell.raw_value)
- assert "setosa" in species_values
- assert "versicolor" in species_values
- assert "virginica" in species_values
-
- def test_worldcups_has_known_winners(self):
- """WorldCups dataset contains known World Cup winners."""
- result = parse_workbook(path=_fixture_path("worldcups.xlsx"))
- sheet = result.workbook.sheets[0]
- winner_col = None
- for col in range(1, 20):
- cell = sheet.get_cell(1, col)
- if cell and cell.raw_value == "Winner":
- winner_col = col
- break
- assert winner_col is not None, "Winner column not found"
- winners = set()
- for row in range(2, 25):
- cell = sheet.get_cell(row, winner_col)
- if cell and cell.raw_value:
- winners.add(cell.raw_value)
- assert "Brazil" in winners
- assert "Germany" in winners
-
- def test_titanic_numeric_columns(self):
- """Titanic dataset has numeric columns (Survived, Pclass, Age)."""
- result = parse_workbook(path=_fixture_path("titanic.xlsx"))
- sheet = result.workbook.sheets[0]
- # Check Survived column has 0/1 values
- survived_col = None
- for col in range(1, 30):
- cell = sheet.get_cell(1, col)
- if cell and cell.raw_value == "Survived":
- survived_col = col
- break
- assert survived_col is not None
- cell_val = sheet.get_cell(2, survived_col)
- assert cell_val is not None
- assert cell_val.raw_value in (0, 1, 0.0, 1.0)
-
- def test_apple_stock_date_column(self):
- """Apple stock dataset has a Date column with date values."""
- result = parse_workbook(path=_fixture_path("apple_stock.xlsx"))
- sheet = result.workbook.sheets[0]
- date_col = None
- for col in range(1, 10):
- cell = sheet.get_cell(1, col)
- if cell and cell.raw_value == "Date":
- date_col = col
- break
- assert date_col is not None
- # Check that at least one date cell has a date-like display value
- date_cell = sheet.get_cell(2, date_col)
- assert date_cell is not None
- assert date_cell.display_value is not None
-
- def test_superstore_multiple_sheets_content(self):
- """SuperStore has Orders, Returns, and Users sheets with distinct content."""
- result = parse_workbook(path=_fixture_path("superstore.xlsx"))
- sheet_names = {s.sheet_name for s in result.workbook.sheets}
- assert "Orders" in sheet_names
- assert "Returns" in sheet_names
- assert "Users" in sheet_names
-
- # Orders sheet should be large
- orders = next(s for s in result.workbook.sheets if s.sheet_name == "Orders")
- assert orders.cell_count() > 40000
-
- # Users sheet should be small
- users = next(s for s in result.workbook.sheets if s.sheet_name == "Users")
- assert users.cell_count() <= 20
diff --git a/tests/test_structural_invariants.py b/tests/test_structural_invariants.py
index f11a467..612d11a 100644
--- a/tests/test_structural_invariants.py
+++ b/tests/test_structural_invariants.py
@@ -278,29 +278,3 @@ def test_sheet_ids_populated(self, programmatic_xlsx):
)
-# ---------------------------------------------------------------------------
-# Same invariants on static files (examples + github datasets)
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.invariant
-class TestAllInvariantsStatic:
- """Run full invariant checker against each static xlsx file."""
-
- def test_all_invariants_pass(self, static_xlsx):
- result = parse_workbook(path=static_xlsx)
- violations = check_invariants(result.workbook)
- assert len(violations) == 0, (
- f"{len(violations)} violations in {static_xlsx.name}:\n"
- + "\n".join(violations[:10])
- )
-
- def test_deterministic_hashes(self, static_xlsx):
- r1 = parse_workbook(path=static_xlsx)
- r2 = parse_workbook(path=static_xlsx)
- assert r1.workbook.workbook_hash == r2.workbook.workbook_hash
-
- def test_json_serializable(self, static_xlsx):
- result = parse_workbook(path=static_xlsx)
- data = result.to_json()
- json.dumps(data) # must not raise
diff --git a/tests/test_testbench_roundtrip.py b/tests/test_testbench_roundtrip.py
deleted file mode 100644
index bfa4fd7..0000000
--- a/tests/test_testbench_roundtrip.py
+++ /dev/null
@@ -1,132 +0,0 @@
-"""
-testBench round-trip tests.
-
-Parses every .xlsx under ``testBench/`` and asserts:
-
-* ``parse_workbook()`` returns without raising.
-* ``result.to_json()`` produces non-empty JSON (> 100 bytes).
-* ``result.workbook`` has at least one sheet.
-
-Failures are collected into ``metrics/testbench/failures.json`` so parser
-regressions across the whole bench are easy to diff.
-
-Runs under the ``testbench`` marker only (skipped by default). Invoke with:
-
- pytest tests/test_testbench_roundtrip.py -m testbench -q
- make testbench # convenience wrapper
-"""
-
-
-import json
-import os
-import traceback
-from pathlib import Path
-
-import pytest
-
-from ks_xlsx_parser import parse_workbook
-
-ROOT = Path(__file__).resolve().parent.parent
-TESTBENCH_DIR = ROOT / "testBench"
-METRICS_DIR = ROOT / "metrics" / "testbench"
-FAILURES_PATH = METRICS_DIR / "failures.json"
-FAILURES_JSONL = METRICS_DIR / "failures.jsonl" # append-only, xdist-safe
-
-
-def _collect_files() -> list[Path]:
- if not TESTBENCH_DIR.exists():
- return []
- return sorted(TESTBENCH_DIR.rglob("*.xlsx"))
-
-
-ALL_FILES = _collect_files()
-
-pytestmark = [pytest.mark.testbench, pytest.mark.timeout(60)]
-
-
-def _record_failure(entry: dict) -> None:
- """Append one failure row to the JSONL log. Safe under xdist parallelism."""
- METRICS_DIR.mkdir(parents=True, exist_ok=True)
- entry["worker"] = os.environ.get("PYTEST_XDIST_WORKER", "main")
- with FAILURES_JSONL.open("a", encoding="utf-8") as f:
- f.write(json.dumps(entry) + "\n")
-
-
-@pytest.fixture(scope="session", autouse=True)
-def _reset_log():
- """Reset the append log at the start of the session (master worker only)."""
- # Under xdist, PYTEST_XDIST_WORKER is set for workers but not the master.
- # The master is responsible for cleanup before workers start writing.
- if os.environ.get("PYTEST_XDIST_WORKER") is None:
- METRICS_DIR.mkdir(parents=True, exist_ok=True)
- if FAILURES_JSONL.exists():
- FAILURES_JSONL.unlink()
- yield
- # After session, aggregate JSONL → JSON summary (master only)
- if os.environ.get("PYTEST_XDIST_WORKER") is None:
- failures: list[dict] = []
- if FAILURES_JSONL.exists():
- for line in FAILURES_JSONL.read_text().splitlines():
- if line.strip():
- failures.append(json.loads(line))
- FAILURES_PATH.write_text(
- json.dumps(
- {"total": len(ALL_FILES), "failure_count": len(failures), "failures": failures},
- indent=2,
- )
- )
-
-
-def _relpath(p: Path) -> str:
- return str(p.relative_to(ROOT))
-
-
-@pytest.mark.parametrize("path", ALL_FILES, ids=lambda p: _relpath(p))
-def test_parse_roundtrip(path: Path):
- """Each workbook must parse, serialize to JSON, and report ≥1 sheet."""
- try:
- result = parse_workbook(path=path)
- except Exception as exc:
- _record_failure({
- "file": _relpath(path),
- "stage": "parse",
- "error": f"{type(exc).__name__}: {exc}",
- "traceback": traceback.format_exc(limit=5),
- })
- raise
-
- assert result.workbook is not None, f"no workbook DTO for {path}"
- assert result.workbook.total_sheets >= 1, f"{path} reports zero sheets"
-
- try:
- js = result.to_json()
- except Exception as exc:
- _record_failure({
- "file": _relpath(path),
- "stage": "to_json",
- "error": f"{type(exc).__name__}: {exc}",
- "traceback": traceback.format_exc(limit=5),
- })
- raise
-
- assert isinstance(js, dict), f"to_json returned non-dict for {path}"
- assert "workbook" in js, f"to_json result missing 'workbook' key for {path}"
- try:
- encoded = json.dumps(js, default=str)
- except Exception as exc:
- _record_failure({
- "file": _relpath(path),
- "stage": "json_encode",
- "error": f"{type(exc).__name__}: {exc}",
- "traceback": traceback.format_exc(limit=5),
- })
- raise
- assert len(encoded) > 100, f"encoded JSON suspiciously short ({len(encoded)} chars) for {path}"
-
-
-def test_testbench_has_files():
- """Guard against an empty testBench (e.g. missing dataset zip)."""
- assert ALL_FILES, (
- f"No .xlsx files found under {TESTBENCH_DIR}. "
- "Run `make testbench-build` or download the dataset zip from the GitHub release."
- )