knowledgestack · arnav2 · May 11, 2026 · May 11, 2026
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -56,10 +56,10 @@ body:
       label: Traceback (if any)
       render: shell
   - type: checkboxes
-    id: testbench
+    id: benchmark
     attributes:
-      label: testBench check
-      description: Does your file already fail `make testbench`? If so, please note which group.
+      label: Benchmark check
+      description: Did your file fail under `make bench-robust` (the SpreadsheetBench robustness run)?
       options:
-        - label: "I ran `make testbench` and my file failed (attach `metrics/testbench/failures.json`)."
-        - label: "The file is not in the bench; I can contribute it as a new fixture."
+        - label: "I ran `make bench-robust` and my file failed (attach the row from `results.csv` if you can)."
+        - label: "The file is from outside SpreadsheetBench; I can attach a minimal reproducer."
diff --git a/.github/ISSUE_TEMPLATE/parser_edge_case.yml b/.github/ISSUE_TEMPLATE/parser_edge_case.yml
@@ -6,8 +6,8 @@ body:
   - type: markdown
     attributes:
       value: |
-        Every edge-case report ideally becomes a new fixture in `testBench/`. Bonus points
-        for a minimal generator in `scripts/build_testbench.py`.
+        Every edge-case report ideally becomes a regression test. Bonus points
+        for a minimal `openpyxl` generator that reproduces it.
   - type: textarea
     id: pattern
     attributes:
@@ -33,6 +33,6 @@ body:
     attributes:
       label: What would you like next?
       options:
-        - label: "Add it to `testBench/` as a new stress fixture."
+        - label: "Land it as a new regression test in `tests/`."
         - label: "Open a PR fixing the parser."
         - label: "Triage help — I'm stuck."
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -6,15 +6,15 @@
 
 - [ ] 🐞 Bug fix
 - [ ] ✨ New feature
-- [ ] 🧪 Parser edge case / new `testBench/` fixture
+- [ ] 🧪 Parser edge case / new regression test
 - [ ] 📚 Docs
 - [ ] 🧹 Refactor / chore
 - [ ] 🚀 Performance
 
 ## Checklist
 
 - [ ] `make test` passes locally
-- [ ] `make testbench` still shows 1054/1054 (or the delta is explained below)
+- [ ] If parser/chunker internals changed: ran `make bench-robust` against SpreadsheetBench (call out any regressions below)
 - [ ] Added/updated tests covering the change
 - [ ] `ruff check` is clean
 - [ ] Updated docs if user-facing behaviour changed

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -50,34 +50,3 @@ jobs:
           name: junit-${{ matrix.os }}-py${{ matrix.python-version }}
           path: reports/junit.xml
           if-no-files-found: ignore
-
-  testbench:
-    name: testBench round-trip (ubuntu / py3.12)
-    runs-on: ubuntu-latest
-    needs: test
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.12"
-          cache: pip
-          cache-dependency-path: pyproject.toml
-
-      - name: Install
-        run: |
-          python -m pip install --upgrade pip
-          pip install -e ".[dev,api]"
-
-      - name: Build generated testBench
-        run: make testbench-build
-
-      - name: Run round-trip tests
-        run: make testbench
-
-      - name: Upload failure log
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: testbench-failures
-          path: metrics/testbench/failures.json
-          if-no-files-found: ignore
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -37,17 +37,13 @@ jobs:
       - name: Build wheel + sdist
         run: python -m build
 
-      - name: Build testBench zip
-        run: make testbench-zip
-
       - name: Upload distribution artifacts
         uses: actions/upload-artifact@v4
         with:
           name: dist
           path: |
             dist/*.whl
             dist/*.tar.gz
-            dist/testBench-v*.zip
 
   github-release:
     needs: build
@@ -81,7 +77,6 @@ jobs:
           files: |
             dist/*.whl
             dist/*.tar.gz
-            dist/testBench-v*.zip
           body_path: ${{ steps.notes.outputs.path }}
           generate_release_notes: ${{ steps.notes.outputs.auto == 'true' }}
 
@@ -97,9 +92,6 @@ jobs:
           name: dist
           path: dist
 
-      - name: Strip non-PyPI artifacts
-        run: rm -f dist/testBench-v*.zip
-
       - name: Publish to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
         with:

diff --git a/.gitignore b/.gitignore
@@ -51,17 +51,12 @@ tests/fixtures/corpus/
 # Corpus & metrics outputs
 metrics/corpus/
 metrics/corpus_summary.json
-metrics/testbench/
 
-# Generated stress test artifacts — the 1000-file bench is re-built on demand
-testBench/generated/
+# Generated stress test artifacts
 examples/stress_test/stress_results.json
 examples/stress_test/built_reference.json
 examples/stress_test/STRESS_TEST_RESULTS.md
 
-# Packaged dataset (produced by `make testbench-zip`)
-dist/testBench*.zip
-
 # Local benchmark harness (private, not pushed)
 tests/benchmarks/reports/
 tests/benchmarks/hucre_node/node_modules/

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -4,12 +4,10 @@ repos:
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer
-        exclude: "^testBench/"
       - id: check-yaml
       - id: check-toml
       - id: check-added-large-files
-        args: ["--maxkb=5120"]   # 5 MB ceiling per file — testBench fixtures are larger, excluded below
-        exclude: "^testBench/"
+        args: ["--maxkb=5120"]
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.6.9

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -45,7 +45,28 @@ Template for a new release (copy this block, fill in, move Unreleased items in):
 
 ## [Unreleased]
 
-Nothing yet. Open a PR and add your entry under the appropriate heading.
+### ⚠️ BREAKING
+- Retired the in-tree `testBench/` corpus. The 1054-workbook stress dataset
+  and `make testbench*` targets are gone — benchmarks now run against the
+  public SpreadsheetBench v0.1 corpus, downloaded on demand to `data/corpora/`
+  (gitignored). See `docs/corpora.md`.
+
+### Removed
+- `testBench/` directory and all bundled real-world / generated workbooks.
+- `make testbench-build`, `make testbench`, `make testbench-zip` targets.
+- `testbench` job in `.github/workflows/ci.yml`.
+- `testBench-vX.Y.Z.zip` release asset from the release workflow.
+- `tests/test_testbench_roundtrip.py`, `tests/test_enterprise_scoring.py`,
+  `tests/test_real_world_datasets.py`, `tests/test_cross_validation.py`.
+- `scripts/build_testbench.py`, `scripts/generate_enterprise_fixtures.py`.
+- `static_xlsx` pytest fixture (the test bench it iterated is gone).
+
+### Changed
+- README, wiki, examples, and contributor docs now point at SpreadsheetBench
+  (`make bench-robust` / `make bench-retrieval`) as the canonical benchmark.
+- `examples/demo.py` + `examples/generate_examples.py` now write/read fixtures
+  under `examples/fixtures/` instead of the (removed) `testBench/real_world/`.
+
 
 ## [0.2.0] — 2026-05-11
 
@@ -173,7 +194,7 @@ announcement: [`docs/launch/RELEASE_NOTES_v0.1.1.md`](docs/launch/RELEASE_NOTES_
 
 ### Performance
 - Chunk builder caches `detect_circular_refs()` per workbook instead of
-  re-running it per block. Real 21k-cell financial model (Walbridge):
+  re-running it per block. Real 21k-cell financial model:
   **307 s → 4.6 s (66×)**.
 - Sheet parser iterates openpyxl's `_cells` dict instead of `iter_rows()`
   over the full bounding box. Workbooks with extreme sparse addresses
@@ -185,9 +206,8 @@ announcement: [`docs/launch/RELEASE_NOTES_v0.1.1.md`](docs/launch/RELEASE_NOTES_
   non-existent `dxfId=0` in generated fixtures, so openpyxl can load them
   back without an `IndexError`.
 - `test_formula_cached_values_match` now applies a 15 % threshold for
-  workbooks with known openpyxl `data_only` caching gaps (Walbridge),
-  5 % everywhere else. See
-  [`docs/PARSER_KNOWN_ISSUES.md`](docs/PARSER_KNOWN_ISSUES.md).
+  workbooks with known openpyxl `data_only` caching gaps, 5 % everywhere
+  else. See [`docs/PARSER_KNOWN_ISSUES.md`](docs/PARSER_KNOWN_ISSUES.md).
 
 ### Docs
 - New README positioned as *"Make XLSX LLM Ready"* with architecture

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -13,12 +13,12 @@ bug or send a small PR. If that's you, thank you.
 
 ## Ways to help (in order of preference for first-time contributors)
 
-1. **Run `make testbench` and report a file that breaks.** We actively want
-   edge-case `.xlsx` fixtures — use the
+1. **Run `make bench-robust` on SpreadsheetBench and report a file that
+   breaks.** We actively want edge-case `.xlsx` fixtures — use the
    [Parser edge case issue template](https://github.com/knowledgestack/ks-xlsx-parser/issues/new?template=parser_edge_case.yml).
-2. **Add a new workbook to `testBench/`.** Either drop a file under
-   `testBench/stress/` or add a builder to `scripts/build_testbench.py`. If
-   the parser crashes on it, even better.
+2. **Submit an adversarial workbook.** Attach a `.xlsx` (or a generator
+   that builds one) to a Parser edge case issue. If the parser crashes
+   on it, even better.
 3. **Fix one of the flagged issues** in [`docs/PARSER_KNOWN_ISSUES.md`](docs/PARSER_KNOWN_ISSUES.md).
 4. **Improve docs.** The README, the architecture diagram, the examples —
    if something confused you, it confuses everyone.
@@ -32,8 +32,9 @@ git clone https://github.com/knowledgestack/ks-xlsx-parser.git
 cd ks-xlsx-parser
 make install               # pip install -e ".[dev,api]"
 make test                  # fast, default suite
-make testbench-build       # regenerate 1000-file stress corpus (~1 min)
-make testbench             # round-trip every workbook; parallel
+make corpus-download       # fetch SpreadsheetBench (5,458 real-world xlsx)
+make bench-robust          # parse-success + structural counts vs Docling
+make bench-retrieval       # retrieval recall@k vs Docling
 ```
 
 Prerequisites: Python 3.10+, `pip`, optionally `make`. We use `ruff` for
@@ -44,7 +45,8 @@ linting/formatting — install it with the `[dev]` extra.
 Your PR should:
 
 1. Have tests. `pytest` must stay green: `make test`.
-2. Keep `make testbench` at 1054/1054 (or explain the delta in the PR description).
+2. If touching parser or chunker internals, run `make bench-robust` against
+   SpreadsheetBench and call out any regressions in the PR description.
 3. Pass `ruff check` (`make lint`) and be formatted with `make format`.
 4. Include one sentence in the PR description that starts with *"This change…"*.
 5. Use [conventional-commit style](https://www.conventionalcommits.org/)
@@ -74,7 +76,7 @@ Helpful things to include:
 - Type hints everywhere that's practical.
 - Tests live in `tests/`; programmatic workbook fixtures live in `tests/conftest.py`.
 - Cross-validation against calamine uses the `crossval` marker.
-- Long-running bench tests use `@pytest.mark.testbench` and are skipped by default.
+- The benchmark harness (`tests/benchmarks/`) runs outside `pytest` — invoke it via `make bench-robust` / `make bench-retrieval`.
 - Keep public-API changes additive; if you can't, note it in the PR and the
   maintainers will line up the deprecation.
 

diff --git a/Makefile b/Makefile
@@ -1,25 +1,20 @@
-.PHONY: help install test test-ci testbench testbench-build testbench-zip lint format typecheck clean corpus-download bench-robust bench-retrieval bench
+.PHONY: help install test test-ci lint format typecheck clean corpus-download bench-robust bench-retrieval bench
 
 PYTHON ?= python
 PKG_VERSION := $(shell $(PYTHON) -c "import tomllib, pathlib; print(tomllib.loads(pathlib.Path('pyproject.toml').read_text())['project']['version'])")
-TESTBENCH_ZIP := dist/testBench-v$(PKG_VERSION).zip
 
 help:
 	@echo "ks-xlsx-parser — common targets"
 	@echo ""
 	@echo "  make install         Install package and dev deps (editable)"
-	@echo "  make test            Run the default test suite (skips corpus + testbench)"
+	@echo "  make test            Run the default test suite"
 	@echo "  make test-ci         Run the suite with verbose output for CI"
 	@echo ""
-	@echo "  make testbench-build Generate the 1000-file testBench dataset"
-	@echo "  make testbench       Run parser round-trip across the full testBench"
-	@echo "  make testbench-zip   Package testBench into $(TESTBENCH_ZIP) for GitHub release"
-	@echo ""
 	@echo "  make lint            Ruff lint"
 	@echo "  make format          Ruff format"
 	@echo "  make typecheck       mypy"
 	@echo ""
-	@echo "  make corpus-download Fetch public XLSX corpora for extended robustness"
+	@echo "  make corpus-download Fetch SpreadsheetBench for benchmark runs"
 	@echo ""
 	@echo "  make bench-robust    Robustness on SpreadsheetBench (ks vs docling, ~20 min)"
 	@echo "  make bench-retrieval Retrieval recall on SpreadsheetBench (ks vs docling, ~40 min)"
@@ -34,23 +29,6 @@ test:
 test-ci:
 	$(PYTHON) -m pytest tests/ -v --tb=short -W ignore::UserWarning --junitxml=reports/junit.xml
 
-testbench-build:
-	$(PYTHON) scripts/build_testbench.py --clean
-
-testbench:
-	@test -d testBench/generated || (echo "testBench/generated missing. Run 'make testbench-build' first." && exit 1)
-	$(PYTHON) -m pytest tests/test_testbench_roundtrip.py -m testbench --tb=short -W ignore::UserWarning
-
-testbench-zip: testbench-build
-	@mkdir -p dist
-	@echo "→ packaging testBench into $(TESTBENCH_ZIP)"
-	@rm -f $(TESTBENCH_ZIP)
-	@cd . && zip -qr $(TESTBENCH_ZIP) testBench \
-		-x "testBench/**/__pycache__/*" \
-		-x "testBench/**/.DS_Store"
-	@ls -lh $(TESTBENCH_ZIP)
-	@echo "→ attach with: gh release create v$(PKG_VERSION) $(TESTBENCH_ZIP) --generate-notes"
-
 lint:
 	$(PYTHON) -m ruff check src/ tests/ scripts/