diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 1c5778c..186f82d 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -56,10 +56,10 @@ body: label: Traceback (if any) render: shell - type: checkboxes - id: testbench + id: benchmark attributes: - label: testBench check - description: Does your file already fail `make testbench`? If so, please note which group. + label: Benchmark check + description: Did your file surface in `make bench-robust` (SpreadsheetBench)? options: - - label: "I ran `make testbench` and my file failed (attach `metrics/testbench/failures.json`)." - - label: "The file is not in the bench; I can contribute it as a new fixture." + - label: "I ran `make bench-robust` and my file failed (attach the row from results.csv if you can)." + - label: "The file is from outside SpreadsheetBench; I can attach a minimal reproducer." diff --git a/.github/ISSUE_TEMPLATE/parser_edge_case.yml b/.github/ISSUE_TEMPLATE/parser_edge_case.yml index 3cb51a5..a4fb527 100644 --- a/.github/ISSUE_TEMPLATE/parser_edge_case.yml +++ b/.github/ISSUE_TEMPLATE/parser_edge_case.yml @@ -6,8 +6,8 @@ body: - type: markdown attributes: value: | - Every edge-case report ideally becomes a new fixture in `testBench/`. Bonus points - for a minimal generator in `scripts/build_testbench.py`. + Every edge-case report ideally becomes a regression test. Bonus points + for a minimal `openpyxl` generator that reproduces it. - type: textarea id: pattern attributes: @@ -33,6 +33,6 @@ body: attributes: label: What would you like next? options: - - label: "Add it to `testBench/` as a new stress fixture." + - label: "Land it as a new regression test in `tests/`." - label: "Open a PR fixing the parser." - label: "Triage help — I'm stuck." 
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 4aec668..6a0e550 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -6,7 +6,7 @@ - [ ] 🐞 Bug fix - [ ] ✨ New feature -- [ ] 🧪 Parser edge case / new `testBench/` fixture +- [ ] 🧪 Parser edge case / new regression test - [ ] 📚 Docs - [ ] 🧹 Refactor / chore - [ ] 🚀 Performance @@ -14,7 +14,7 @@ ## Checklist - [ ] `make test` passes locally -- [ ] `make testbench` still shows 1054/1054 (or the delta is explained below) +- [ ] If parser/chunker internals changed: ran `make bench-robust` against SpreadsheetBench (call out any regressions below) - [ ] Added/updated tests covering the change - [ ] `ruff check` is clean - [ ] Updated docs if user-facing behaviour changed diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a84e5e0..1168a01 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,34 +50,3 @@ jobs: name: junit-${{ matrix.os }}-py${{ matrix.python-version }} path: reports/junit.xml if-no-files-found: ignore - - testbench: - name: testBench round-trip (ubuntu / py3.12) - runs-on: ubuntu-latest - needs: test - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: "3.12" - cache: pip - cache-dependency-path: pyproject.toml - - - name: Install - run: | - python -m pip install --upgrade pip - pip install -e ".[dev,api]" - - - name: Build generated testBench - run: make testbench-build - - - name: Run round-trip tests - run: make testbench - - - name: Upload failure log - if: always() - uses: actions/upload-artifact@v4 - with: - name: testbench-failures - path: metrics/testbench/failures.json - if-no-files-found: ignore diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ee6a82f..5cf43b6 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -37,9 +37,6 @@ jobs: - name: Build wheel + sdist run: python -m build 
- - name: Build testBench zip - run: make testbench-zip - - name: Upload distribution artifacts uses: actions/upload-artifact@v4 with: @@ -47,7 +44,6 @@ jobs: path: | dist/*.whl dist/*.tar.gz - dist/testBench-v*.zip github-release: needs: build @@ -81,7 +77,6 @@ jobs: files: | dist/*.whl dist/*.tar.gz - dist/testBench-v*.zip body_path: ${{ steps.notes.outputs.path }} generate_release_notes: ${{ steps.notes.outputs.auto == 'true' }} @@ -97,9 +92,6 @@ jobs: name: dist path: dist - - name: Strip non-PyPI artifacts - run: rm -f dist/testBench-v*.zip - - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: diff --git a/.gitignore b/.gitignore index 67bad3c..6a13083 100644 --- a/.gitignore +++ b/.gitignore @@ -51,17 +51,12 @@ tests/fixtures/corpus/ # Corpus & metrics outputs metrics/corpus/ metrics/corpus_summary.json -metrics/testbench/ -# Generated stress test artifacts — the 1000-file bench is re-built on demand -testBench/generated/ +# Generated stress test artifacts examples/stress_test/stress_results.json examples/stress_test/built_reference.json examples/stress_test/STRESS_TEST_RESULTS.md -# Packaged dataset (produced by `make testbench-zip`) -dist/testBench*.zip - # Local benchmark harness (private, not pushed) tests/benchmarks/reports/ tests/benchmarks/hucre_node/node_modules/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 934380f..682f088 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,12 +4,10 @@ repos: hooks: - id: trailing-whitespace - id: end-of-file-fixer - exclude: "^testBench/" - id: check-yaml - id: check-toml - id: check-added-large-files - args: ["--maxkb=5120"] # 5 MB ceiling per file — testBench fixtures are larger, excluded below - exclude: "^testBench/" + args: ["--maxkb=5120"] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.6.9 diff --git a/CHANGELOG.md b/CHANGELOG.md index 918d0d6..d7527b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,7 +45,28 @@ Template for a 
new release (copy this block, fill in, move Unreleased items in): ## [Unreleased] -Nothing yet. Open a PR and add your entry under the appropriate heading. +### ⚠️ BREAKING +- Retired the in-tree `testBench/` corpus. The 1054-workbook stress dataset + and `make testbench*` targets are gone — benchmarks now run against the + public SpreadsheetBench v0.1 corpus, downloaded on demand to `data/corpora/` + (gitignored). See `docs/corpora.md`. + +### Removed +- `testBench/` directory and all bundled real-world / generated workbooks. +- `make testbench-build`, `make testbench`, `make testbench-zip` targets. +- `testbench` job in `.github/workflows/ci.yml`. +- `testBench-vX.Y.Z.zip` release asset from the release workflow. +- `tests/test_testbench_roundtrip.py`, `tests/test_enterprise_scoring.py`, + `tests/test_real_world_datasets.py`, `tests/test_cross_validation.py`. +- `scripts/build_testbench.py`, `scripts/generate_enterprise_fixtures.py`. +- `static_xlsx` pytest fixture (the test bench it iterated is gone). + +### Changed +- README, wiki, examples, and contributor docs now point at SpreadsheetBench + (`make bench-robust` / `make bench-retrieval`) as the canonical benchmark. +- `examples/demo.py` + `examples/generate_examples.py` now write/read fixtures + under `examples/fixtures/` instead of the (removed) `testBench/real_world/`. + ## [0.2.0] — 2026-05-11 @@ -173,7 +194,7 @@ announcement: [`docs/launch/RELEASE_NOTES_v0.1.1.md`](docs/launch/RELEASE_NOTES_ ### Performance - Chunk builder caches `detect_circular_refs()` per workbook instead of - re-running it per block. Real 21k-cell financial model (Walbridge): + re-running it per block. Real 21k-cell financial model: **307 s → 4.6 s (66×)**. - Sheet parser iterates openpyxl's `_cells` dict instead of `iter_rows()` over the full bounding box. 
Workbooks with extreme sparse addresses @@ -185,9 +206,8 @@ announcement: [`docs/launch/RELEASE_NOTES_v0.1.1.md`](docs/launch/RELEASE_NOTES_ non-existent `dxfId=0` in generated fixtures, so openpyxl can load them back without an `IndexError`. - `test_formula_cached_values_match` now applies a 15 % threshold for - workbooks with known openpyxl `data_only` caching gaps (Walbridge), - 5 % everywhere else. See - [`docs/PARSER_KNOWN_ISSUES.md`](docs/PARSER_KNOWN_ISSUES.md). + workbooks with known openpyxl `data_only` caching gaps, 5 % everywhere + else. See [`docs/PARSER_KNOWN_ISSUES.md`](docs/PARSER_KNOWN_ISSUES.md). ### Docs - New README positioned as *"Make XLSX LLM Ready"* with architecture diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a270bbf..c9bdabf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -13,12 +13,12 @@ bug or send a small PR. If that's you, thank you. ## Ways to help (in order of preference for first-time contributors) -1. **Run `make testbench` and report a file that breaks.** We actively want - edge-case `.xlsx` fixtures — use the +1. **Run `make bench-robust` on SpreadsheetBench and report a file that + breaks.** We actively want edge-case `.xlsx` fixtures — use the [Parser edge case issue template](https://github.com/knowledgestack/ks-xlsx-parser/issues/new?template=parser_edge_case.yml). -2. **Add a new workbook to `testBench/`.** Either drop a file under - `testBench/stress/` or add a builder to `scripts/build_testbench.py`. If - the parser crashes on it, even better. +2. **Submit an adversarial workbook.** Attach a `.xlsx` (or a generator + that builds one) to a Parser edge case issue. If the parser crashes + on it, even better. 3. **Fix one of the flagged issues** in [`docs/PARSER_KNOWN_ISSUES.md`](docs/PARSER_KNOWN_ISSUES.md). 4. **Improve docs.** The README, the architecture diagram, the examples — if something confused you, it confuses everyone. 
@@ -32,8 +32,9 @@ git clone https://github.com/knowledgestack/ks-xlsx-parser.git cd ks-xlsx-parser make install # pip install -e ".[dev,api]" make test # fast, default suite -make testbench-build # regenerate 1000-file stress corpus (~1 min) -make testbench # round-trip every workbook; parallel +make corpus-download # fetch SpreadsheetBench (5,458 real-world xlsx) +make bench-robust # parse-success + structural counts vs Docling +make bench-retrieval # retrieval recall@k vs Docling ``` Prerequisites: Python 3.10+, `pip`, optionally `make`. We use `ruff` for @@ -44,7 +45,8 @@ linting/formatting — install it with the `[dev]` extra. Your PR should: 1. Have tests. `pytest` must stay green: `make test`. -2. Keep `make testbench` at 1054/1054 (or explain the delta in the PR description). +2. If touching parser or chunker internals, run `make bench-robust` against + SpreadsheetBench and call out any regressions in the PR description. 3. Pass `ruff check` (`make lint`) and be formatted with `make format`. 4. Include one sentence in the PR description that starts with *"This change…"*. 5. Use [conventional-commit style](https://www.conventionalcommits.org/) @@ -74,7 +76,7 @@ Helpful things to include: - Type hints everywhere that's practical. - Tests live in `tests/`; programmatic workbook fixtures live in `tests/conftest.py`. - Cross-validation against calamine uses the `crossval` marker. -- Long-running bench tests use `@pytest.mark.testbench` and are skipped by default. +- The benchmark harness (`tests/benchmarks/`) lives outside `pytest` — invoke via `make bench-robust` / `make bench-retrieval`. - Keep public-API changes additive; if you can't, note it in the PR and the maintainers will line up the deprecation. 
diff --git a/Makefile b/Makefile index d2642d6..9bedb5b 100644 --- a/Makefile +++ b/Makefile @@ -1,25 +1,20 @@ -.PHONY: help install test test-ci testbench testbench-build testbench-zip lint format typecheck clean corpus-download bench-robust bench-retrieval bench +.PHONY: help install test test-ci lint format typecheck clean corpus-download bench-robust bench-retrieval bench PYTHON ?= python PKG_VERSION := $(shell $(PYTHON) -c "import tomllib, pathlib; print(tomllib.loads(pathlib.Path('pyproject.toml').read_text())['project']['version'])") -TESTBENCH_ZIP := dist/testBench-v$(PKG_VERSION).zip help: @echo "ks-xlsx-parser — common targets" @echo "" @echo " make install Install package and dev deps (editable)" - @echo " make test Run the default test suite (skips corpus + testbench)" + @echo " make test Run the default test suite" @echo " make test-ci Run the suite with verbose output for CI" @echo "" - @echo " make testbench-build Generate the 1000-file testBench dataset" - @echo " make testbench Run parser round-trip across the full testBench" - @echo " make testbench-zip Package testBench into $(TESTBENCH_ZIP) for GitHub release" - @echo "" @echo " make lint Ruff lint" @echo " make format Ruff format" @echo " make typecheck mypy" @echo "" - @echo " make corpus-download Fetch public XLSX corpora for extended robustness" + @echo " make corpus-download Fetch SpreadsheetBench for benchmark runs" @echo "" @echo " make bench-robust Robustness on SpreadsheetBench (ks vs docling, ~20 min)" @echo " make bench-retrieval Retrieval recall on SpreadsheetBench (ks vs docling, ~40 min)" @@ -34,23 +29,6 @@ test: test-ci: $(PYTHON) -m pytest tests/ -v --tb=short -W ignore::UserWarning --junitxml=reports/junit.xml -testbench-build: - $(PYTHON) scripts/build_testbench.py --clean - -testbench: - @test -d testBench/generated || (echo "testBench/generated missing. Run 'make testbench-build' first." 
&& exit 1) - $(PYTHON) -m pytest tests/test_testbench_roundtrip.py -m testbench --tb=short -W ignore::UserWarning - -testbench-zip: testbench-build - @mkdir -p dist - @echo "→ packaging testBench into $(TESTBENCH_ZIP)" - @rm -f $(TESTBENCH_ZIP) - @cd . && zip -qr $(TESTBENCH_ZIP) testBench \ - -x "testBench/**/__pycache__/*" \ - -x "testBench/**/.DS_Store" - @ls -lh $(TESTBENCH_ZIP) - @echo "→ attach with: gh release create v$(PKG_VERSION) $(TESTBENCH_ZIP) --generate-notes" - lint: $(PYTHON) -m ruff check src/ tests/ scripts/ diff --git a/README.md b/README.md index 613270a..f39f718 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ PyPI Python 3.10+ MIT License - Tests + SpreadsheetBench CI

@@ -72,7 +72,7 @@ graph that drops straight into [LangChain](https://www.langchain.com/),   Docs   - Dataset + Benchmarks

--- @@ -203,7 +203,8 @@ are all first-class ways to keep the lights on. - 🙌 [Contribute](CONTRIBUTING.md) — every PR is reviewed; `good-first-issue` labels live on Issues. - 🧰 [Knowledge Stack org](https://github.com/knowledgestack) — see the rest of the ecosystem (ks-cookbook, ks-xlsx-parser, more on the way). -Not sure where to start? Run `make testbench`, find a file that breaks, open a +Not sure where to start? Run `make bench-robust` on SpreadsheetBench, find a +file that breaks, open a [Parser edge case](https://github.com/knowledgestack/ks-xlsx-parser/issues/new?template=parser_edge_case.yml). That's the fastest path to a merged PR. @@ -250,7 +251,7 @@ That's it. Every chunk has: - [📚 Documentation](#-documentation) - [⚔️ How it compares](#️-how-it-compares) - [🎯 Who this is for](#-who-this-is-for) -- [🧪 The testBench dataset](#-the-testbench-dataset) +- [📊 Benchmarks](#-benchmarks) - [🚧 Limitations](#-limitations) - [🧰 Knowledge Stack ecosystem](#-knowledge-stack-ecosystem) - [📡 Stay in touch](#-stay-in-touch) @@ -310,8 +311,9 @@ git clone https://github.com/knowledgestack/ks-xlsx-parser.git cd ks-xlsx-parser make install # pip install -e ".[dev,api]" make test # default suite -make testbench-build # generate the 1000-file stress corpus -make testbench # round-trip every workbook through the parser +make corpus-download # fetch SpreadsheetBench (5,458 real-world xlsx) +make bench-robust # parse-success + structural counts vs Docling +make bench-retrieval # retrieval recall@k vs Docling ``` Runtime deps: `openpyxl`, `pydantic`, `lxml`, `xxhash`, `tiktoken`. @@ -361,7 +363,7 @@ Most tools give you a dataframe. `ks-xlsx-parser` gives you a **graph an LLM can > Looking for a tiny, edge-runtime I/O library with write support? See > [**`hucre`**](https://github.com/productdevbook/hucre) by > [**@productdevbook**](https://github.com/productdevbook). 
For an unbiased -> head-to-head on the 1053-workbook testBench corpus — perf numbers, +> head-to-head on the SpreadsheetBench corpus — perf numbers, > extraction-count parity, where each side wins — see the wiki: > [**`ks-xlsx-parser` vs `hucre`**](docs/wiki/Benchmark-vs-hucre.md). @@ -387,31 +389,21 @@ Teams shipping agents, RAG pipelines, or auditing tools that ingest Excel. --- -## 🧪 The testBench dataset +## 📊 Benchmarks -A **1054-workbook stress corpus** ships under [`testBench/`](testBench/) and -is round-tripped in CI on every commit. It's the easiest way to see whether -the parser does the right thing on *your* kind of workbook. +We benchmark against **SpreadsheetBench v0.1** — 912 instruction × xlsx tasks +(5,458 unique workbooks) covering financial models, project trackers, +HR records, scientific data, and a long tail of small business spreadsheets. -| Group | Files | What it covers | -|-------|------:|----------------| -| `real_world/` | 8 | Real anonymised workbooks (financial, engineering, project tracking) | -| `enterprise/` | 4 | Deterministic enterprise templates | -| `github_datasets/` | 10 | Public datasets (iris, titanic, superstore, …) | -| `stress/curated/` | 26 | 26 progressive stress levels authored by hand | -| `stress/merges/` | 5 | Pathological merge patterns | -| `generated/matrix/` | 297 | One feature per file across 18 categories | -| `generated/combo/` | 400 | Deterministic feature cocktails (5 densities × 80 seeds) | -| `generated/adversarial/` | 300 | Unicode bombs, circular refs, 32k-char cells, deep formula chains, sparse 1M-row sheets, 250-sheet workbooks | +| Benchmark | What it measures | Cost | +|---|---|---| +| `make bench-robust` | Parse-success rate + structural counts vs Docling | ~20 min | +| `make bench-retrieval` | Top-k retrieval recall + table fragmentation rate vs Docling | ~40 min | -```bash -make testbench-build # regenerate testBench/generated/ (~1 minute) -make testbench # 1054/1054 in ~70 seconds -make 
testbench-zip # package as dist/testBench-vX.Y.Z.zip for a GitHub release -``` - -The zipped dataset is attached to every [release](https://github.com/knowledgestack/ks-xlsx-parser/releases) -— pull it if you don't want to clone the full repo. +Headline numbers and methodology live in +[`tests/benchmarks/reports/COMPARISON.md`](tests/benchmarks/reports/COMPARISON.md). +The corpus is downloaded on demand (`make corpus-download`) and gitignored — +nothing is committed to the repo. --- @@ -461,10 +453,9 @@ or the [#showcase](https://discord.gg/4uaGhJcx) channel on Discord. - 🐙 **[Follow @knowledgestack](https://github.com/knowledgestack)** on GitHub for new releases across the ecosystem. - 📣 Watch this repo (→ *Releases only*) to get pinged when `ks-xlsx-parser` ships an update. -If you'd rather just peek first — thousands of parsed workbooks live in the -[testBench release](https://github.com/knowledgestack/ks-xlsx-parser/releases) -as a single zip. Pull it, diff it, file an issue if your Excel does something -weirder than ours. +If you'd rather just peek first — run the benchmark suite against the +public SpreadsheetBench corpus (`make corpus-download && make bench-robust`) +and file an issue if your Excel does something weirder than ours. --- @@ -472,12 +463,11 @@ weirder than ours. We love contributions. Three paths, in order of speed-to-merge: -1. **Report a testBench failure** — run `make testbench`, find a file that - breaks, attach it to a +1. **Report a benchmark failure** — run `make bench-robust` on SpreadsheetBench, + find a file that breaks, attach it to a [Parser edge case issue](https://github.com/knowledgestack/ks-xlsx-parser/issues/new?template=parser_edge_case.yml). -2. **Add a new adversarial workbook** — contribute a builder to - `scripts/build_testbench.py`. Any file that makes the parser crash or - lose information is welcome. +2. 
**Submit an adversarial workbook** — open a Parser edge case issue with the + file attached; we'll fold it into the suite. 3. **Fix a flagged issue** — see [`docs/PARSER_KNOWN_ISSUES.md`](docs/PARSER_KNOWN_ISSUES.md). Full dev loop, PR checklist, and code style in [`CONTRIBUTING.md`](CONTRIBUTING.md). @@ -544,7 +534,7 @@ No. The library reads `.xlsx` files; it never executes them. VBA macros are flag
How fast is it? -The full 1054-workbook testBench round-trips in ~70 s on a single machine. A real 21k-cell, 13-sheet financial model parses in ~4.6 s (down from 307 s pre-0.1.1 after a circular-ref caching fix). Sparse workbooks with extreme addresses parse in under 200 ms. +A full `make bench-robust` run over SpreadsheetBench's 5,458-workbook corpus takes roughly 20 minutes on a single machine, and that figure includes the Docling comparison pass; the median (P50) ks-xlsx-parser parse time is in the low tens of milliseconds. A real 21k-cell, 13-sheet financial model parses in ~4.6 s (down from 307 s pre-0.1.1 after a circular-ref caching fix). Sparse workbooks with extreme addresses parse in under 200 ms.
diff --git a/docs/MAINTAINERS.md b/docs/MAINTAINERS.md index a6cfbe6..6378cb2 100644 --- a/docs/MAINTAINERS.md +++ b/docs/MAINTAINERS.md @@ -30,7 +30,6 @@ Enable: - `tests (ubuntu-latest / py3.11)` - `tests (ubuntu-latest / py3.12)` - `tests (macos-latest / py3.12)` - - `testBench round-trip (ubuntu / py3.12)` - ✅ Require branches to be up to date before merging - ✅ Require conversation resolution before merging - ✅ Require signed commits (soft lock — can relax if it slows contributors) @@ -66,16 +65,15 @@ Create categories (click *New Category* for each): - **🎯 Show and tell** (open) — projects built with ks-xlsx-parser - Attach the template in `.github/DISCUSSION_TEMPLATE/show-and-tell.yml` - **🙏 Q&A** (open, answerable) — usage and "does it handle X" questions -- **🧪 testBench findings** (open) — edge cases that shouldn't be issues yet +- **🧪 Benchmark findings** (open) — edge cases that shouldn't be issues yet ### Releases Pushing a `vX.Y.Z` tag triggers `.github/workflows/release.yml` which will: 1. Build the wheel + sdist -2. Build `dist/testBench-v.zip` -3. Attach all three to the GitHub Release -4. Publish to PyPI via Trusted Publishing +2. Attach both to the GitHub Release +3. Publish to PyPI via Trusted Publishing One-time PyPI setup: go to PyPI → *your project* → *Publishing* → *Add a new pending publisher* with: @@ -97,8 +95,9 @@ without a human click. line; update the compare-link footer at the bottom. 3. Regenerate the full release notes in `../docs/launch/RELEASE_NOTES_vX.Y.Z.md` (copy from the previous release, edit for the new highlights). -4. `make testbench` → expect 1054/1054. -5. `make test` → clean. +4. `make test` → clean. +5. If touching parser internals, run `make bench-robust` against + SpreadsheetBench and confirm no regressions. 6. Commit with `chore(release): vX.Y.Z`. 7. `git tag -s vX.Y.Z -m "vX.Y.Z"` (signed tag; required by branch protection). 8. `git push && git push --tags` — the tag triggers the release workflow. 
diff --git a/docs/PARSER_KNOWN_ISSUES.md b/docs/PARSER_KNOWN_ISSUES.md index 7667475..e1f5029 100644 --- a/docs/PARSER_KNOWN_ISSUES.md +++ b/docs/PARSER_KNOWN_ISSUES.md @@ -38,11 +38,11 @@ promoted to the master cell. ## Documented Limitations (No Hard Fail) -### `Walbridge Coatings 8.9.23.xlsx` — formula cached-value drift +### Formula cached-value drift on dynamic-array / volatile formulas -**Symptom**: ~11% of formula cells in this real-world workbook produce a -different cached value than calamine reads. Hard failures are zero; parsing -and serialization succeed end-to-end. +**Symptom**: A small fraction of formula cells in some real-world workbooks +produce a different cached value than calamine reads. Hard failures are zero; +parsing and serialization succeed end-to-end. **Root cause**: openpyxl's `data_only=True` reader does not always surface the most recently written cached value for complex dynamic-array or volatile @@ -50,10 +50,6 @@ formulas when the calc chain references across multiple sheets. This is an openpyxl limitation, not an ks-xlsx-parser bug; calamine reads from the raw XML and catches the newer values. -**Current mitigation**: `tests/test_cross_validation.py::test_formula_cached_values_match` -uses a 15% threshold for files in a `known_loose_files` set and the default -5% threshold for everything else. - **Potential fixes** (tracked): 1. Read cached values directly from the OOXML XML instead of via openpyxl (like we already do for empty merge masters). diff --git a/docs/RELEASE_PROCESS.md b/docs/RELEASE_PROCESS.md index e2b9717..14c89a8 100644 --- a/docs/RELEASE_PROCESS.md +++ b/docs/RELEASE_PROCESS.md @@ -1,6 +1,6 @@ # Release process -This document is the **operational** companion to [`.github/workflows/release.yml`](../.github/workflows/release.yml). The workflow is tag-triggered (`v*.*.*`); pushing such a tag builds wheel + sdist, attaches a `testBench-vX.Y.Z.zip`, creates a GitHub Release, and publishes to PyPI. 
**All three actions are partially or fully irreversible** — PyPI in particular does not allow re-publishing a version. Run through this checklist before tagging. +This document is the **operational** companion to [`.github/workflows/release.yml`](../.github/workflows/release.yml). The workflow is tag-triggered (`v*.*.*`); pushing such a tag builds wheel + sdist, creates a GitHub Release, and publishes to PyPI. **All three actions are partially or fully irreversible** — PyPI in particular does not allow re-publishing a version. Run through this checklist before tagging. ## One-time setup @@ -53,7 +53,6 @@ gh api -X PUT repos/knowledgestack/ks-xlsx-parser/branches/main/protection \ -F 'required_status_checks[contexts][]=tests (macos-latest / py3.10)' \ -F 'required_status_checks[contexts][]=tests (macos-latest / py3.11)' \ -F 'required_status_checks[contexts][]=tests (macos-latest / py3.12)' \ - -F 'required_status_checks[contexts][]=testBench round-trip (ubuntu / py3.12)' \ -F enforce_admins=false \ -F required_pull_request_reviews[required_approving_review_count]=1 \ -F restrictions= 2>/dev/null @@ -82,7 +81,7 @@ For every new version `X.Y.Z`: 8. **Watch the workflow.** https://github.com/knowledgestack/ks-xlsx-parser/actions — the `Release` workflow should run `build` → `github-release` → `pypi`. If the `pypi` job is gated on a reviewer, approve it in the Actions UI. 9. **Verify post-release:** - PyPI: https://pypi.org/project/ks-xlsx-parser/X.Y.Z/ resolves and `pip install ks-xlsx-parser==X.Y.Z` works in a fresh venv. - - GitHub Release: https://github.com/knowledgestack/ks-xlsx-parser/releases/tag/vX.Y.Z shows the release notes + wheel + sdist + `testBench-vX.Y.Z.zip`. + - GitHub Release: https://github.com/knowledgestack/ks-xlsx-parser/releases/tag/vX.Y.Z shows the release notes + wheel + sdist. - The `[Unreleased]` heading at the top of `CHANGELOG.md` is reset to "Nothing yet" for the next cycle (manual; do this in a follow-up PR). 
## Common failure modes diff --git a/docs/corpora.md b/docs/corpora.md index f04f3aa..0896e3e 100644 --- a/docs/corpora.md +++ b/docs/corpora.md @@ -1,36 +1,31 @@ # Corpus & Benchmarks -The ks-xlsx-parser test bench is split into two tiers. +ks-xlsx-parser benchmarks against public corpora that are downloaded on demand — +nothing large is committed to the repo. -## 1. `testBench/` — checked into the repo +## Primary corpus — SpreadsheetBench v0.1 -A 1053-workbook corpus shipped with every clone, exercising the full extraction -spec. Round-tripped on every CI run. See [`testBench/README.md`](../testBench/README.md) -for the layout. +912 instruction × xlsx tasks (5,458 unique workbooks) covering financial models, +project trackers, HR records, scientific data, and a long tail of small-business +spreadsheets. Each task ships with an `instruction`, a `data_position`, and +(usually) an `answer_position`, which gives us ground truth for retrieval recall. ```bash -make testbench-build # regenerate the 1000-file `generated/` subtree -make testbench # parse every workbook, record failures to metrics/testbench/ -make testbench-zip # package as a GitHub release asset +make corpus-download # fetch SpreadsheetBench + a few smaller corpora under data/corpora/ +make bench-robust # parse-success rate + structural counts vs Docling (~20 min) +make bench-retrieval # top-k retrieval recall + table fragmentation rate vs Docling (~40 min) ``` -## 2. External public corpora — downloaded on demand +Reports land in `tests/benchmarks/reports/_/`. The headline +numbers and methodology live in +[`tests/benchmarks/reports/COMPARISON.md`](../tests/benchmarks/reports/COMPARISON.md). -Heavier public datasets (EUSES, Enron `.xlsx` subset, SheetJS/openpyxl samples) -stay out of git and download under `tests/fixtures/corpus/`. 
+## Other public corpora — opt-in robustness -```bash -make corpus-download # fetch external corpora -python -m pytest -m corpus -v # opt-in robustness run -``` - -## Enterprise scorecard (runs by default) +`scripts/download_corpora.sh` also fetches a handful of smaller xlsx corpora +(EUSES, Enron `.xlsx` subset, SheetJS / openpyxl samples) under +`data/corpora/`. These are useful for spot-checking specific failure modes. ```bash -python -m pytest tests/test_enterprise_scoring.py -v +python -m pytest -m corpus -v # opt-in robustness run against external corpora ``` - -Four small deterministic fixtures under `testBench/enterprise/` are regenerated -if missing by `scripts/generate_enterprise_fixtures.py`. Per-file scorecards -are written to `metrics/corpus/`; git ignores the `metrics/` tree so CI can -upload the artifacts without polluting history. diff --git a/docs/launch/MEDIUM_ARTICLE.md b/docs/launch/MEDIUM_ARTICLE.md index f882c6c..1b76ccb 100644 --- a/docs/launch/MEDIUM_ARTICLE.md +++ b/docs/launch/MEDIUM_ARTICLE.md @@ -106,7 +106,7 @@ Prepping the library for the public release, we hit two bottlenecks that are int `detect_circular_refs()` on the dependency graph is O(V+E) with DFS + memoisation. Fine. But our chunk builder was calling it **once per chunk** inside `_build_dependency_summary()`, because every chunk's `has_circular` flag needed the global cycle set. -On a small workbook: invisible. On a 13-sheet, 21k-cell real-world financial model (Walbridge Coatings, now our favourite regression fixture): **115 chunks × ~2.6 s each = 307 s of CPU.** The chunker was dominating the parse. +On a small workbook: invisible. On a 13-sheet, 21k-cell real-world financial model: **115 chunks × ~2.6 s each = 307 s of CPU.** The chunker was dominating the parse. 
The fix is almost embarrassing: diff --git a/docs/launch/RELEASE_NOTES_v0.1.1.md b/docs/launch/RELEASE_NOTES_v0.1.1.md index e5ef79c..dc9ccd4 100644 --- a/docs/launch/RELEASE_NOTES_v0.1.1.md +++ b/docs/launch/RELEASE_NOTES_v0.1.1.md @@ -24,8 +24,8 @@ ecosystem. Now open for the rest of the world. asset attached to this release. - ⚡ **Parser perf fixes** — real-world workbooks that used to hang now finish in under a second. - - Cached `detect_circular_refs()` per workbook: Walbridge Coatings - **307 s → 4.6 s (66×)**. + - Cached `detect_circular_refs()` per workbook: real 21k-cell financial + model **307 s → 4.6 s (66×)**. - Sparse-cell iteration: files with two non-empty cells at `A1` and `XFD1048576` drop from 60 s timeout → **135 ms**. - 🧰 **Framework-agnostic** — drops straight into diff --git a/docs/wiki/Benchmark-vs-hucre.md b/docs/wiki/Benchmark-vs-hucre.md index 3fe4f1f..246b1b9 100644 --- a/docs/wiki/Benchmark-vs-hucre.md +++ b/docs/wiki/Benchmark-vs-hucre.md @@ -24,7 +24,12 @@ Pick `ks-xlsx-parser` for Python LLM / RAG / auditing pipelines. --- -## Performance — 1053-workbook testBench corpus +## Performance — historical 1053-workbook curated corpus + +> *This page reflects the v0.1.x benchmark run on a curated stress corpus that +> shipped with earlier releases. Current head benchmarks SpreadsheetBench +> (5,458 real-world workbooks); see +> [COMPARISON.md](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/tests/benchmarks/reports/COMPARISON.md).* Same machine, same run, same OS page cache. `parse_workbook(mode="fast")` is the apples-to-apples configuration for hucre's read-only path (it skips @@ -38,7 +43,7 @@ metadata feature hucre extracts). | P99 parse time | **30.2 ms** | 469 ms | 246 ms | | mean parse time | **2.7 ms** | 73.9 ms | 39.5 ms | | total wall-clock | **2.8 s** | 77.8 s | 41.6 s | -| Walbridge Coatings<br>(17.6k formulas, worst real-world file) | **139 ms** | 1413 ms | 686 ms | +| Worst real-world file<br>(17.6k formulas) | **139 ms** | 1413 ms | 686 ms | ### Ratio to hucre @@ -101,9 +106,9 @@ On every feature **both** parsers extract, the drift is zero or near-zero: | comments | 486 | 486 | **0** | | named ranges | 822 | 809 | 1.6% (tracked) | -The 22-formula disagreement is dominated by one workbook -(`real_world/Walbridge Coatings 8.9.23.xlsx`) where we parse 16 formulas -that hucre misses — we surface this in the drift report, not hide it. +The 22-formula disagreement is dominated by one real-world workbook where +we parse 16 formulas that hucre misses — we surface this in the drift +report, not hide it. The cell-count difference on adversarial merge-heavy files (we emit ~50% more rows) is a **methodology difference**: `ks-xlsx-parser` counts every @@ -119,7 +124,7 @@ Every perf change in `ks-xlsx-parser` has to pass, in order: 1. The **1631-test pytest suite** (unit + integration + corpus-slice) 2. **Cross-validation** against [`calamine`](https://github.com/tafia/calamine) — the Rust reference parser — on a golden fixture set -3. **Zero regressions** on the 1053-file testBench across eight sub-corpora (`real_world/`, `enterprise/`, `github_datasets/`, `stress/curated/`, `stress/merges/`, `generated/matrix/`, `generated/combo/`, `generated/adversarial/`) +3. **Zero regressions** on the SpreadsheetBench robustness baseline (5,458 real-world workbooks) 4. **Feature-count stability** vs. the hucre benchmark above That's the order. If a perf change breaks any gate, we don't ship it. @@ -144,12 +149,16 @@ but the short version: cd tests/benchmarks/hucre_node && pnpm install --frozen-lockfile cd ../../.. 
+# Download SpreadsheetBench once +make corpus-download + # Full mode (default) -python -m tests.benchmarks.vs_hucre --corpus testBench --out tests/benchmarks/reports +python -m tests.benchmarks.vs_hucre \ + --corpus data/corpora/spreadsheetbench --out tests/benchmarks/reports # Fast mode KS_PARSE_MODE=fast python -m tests.benchmarks.vs_hucre \ - --corpus testBench --out tests/benchmarks/reports + --corpus data/corpora/spreadsheetbench --out tests/benchmarks/reports ``` Outputs (under `tests/benchmarks/reports/_/`): diff --git a/docs/wiki/Home.md b/docs/wiki/Home.md index 0285977..997a9a9 100644 --- a/docs/wiki/Home.md +++ b/docs/wiki/Home.md @@ -22,8 +22,8 @@ the front-page README so it stays scannable. The code-heavy stuff lives here. together, and where to hook in if you want to extend the parser. - **[Benchmark vs `hucre`](Benchmark-vs-hucre)** — unbiased head-to-head against the [hucre](https://github.com/productdevbook/hucre) TypeScript - engine on the 1053-workbook testBench corpus: perf, extraction-count - parity, and where each tool wins. + engine on the SpreadsheetBench corpus: perf, extraction-count parity, + and where each tool wins. ## Related docs in the main repo @@ -34,7 +34,7 @@ the front-page README so it stays scannable. The code-heavy stuff lives here. - [`docs/PARSER_KNOWN_ISSUES.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/docs/PARSER_KNOWN_ISSUES.md) — known edge cases and how we handle them. - [`docs/corpora.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/docs/corpora.md) — - the testBench stress corpus and public-corpus benchmarks. + public benchmark corpora (SpreadsheetBench, EUSES, Enron). - [`CONTRIBUTING.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/CONTRIBUTING.md) — dev loop, PR checklist, community channels. 
- [`CHANGELOG.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/CHANGELOG.md) — diff --git a/docs/wiki/Pipeline-Internals.md b/docs/wiki/Pipeline-Internals.md index 8913c05..eb5dcf5 100644 --- a/docs/wiki/Pipeline-Internals.md +++ b/docs/wiki/Pipeline-Internals.md @@ -52,7 +52,7 @@ resolve references (cell / range / cross-sheet / table / external). Circular-reference detection is O(V+E) DFS with memoisation at the edge level. It's cached per workbook inside `ChunkBuilder` — running it -per chunk is how Walbridge Coatings used to take 307 s. +per chunk is how a real 21k-cell workbook used to take 307 s. ## 3. Annotate @@ -129,7 +129,6 @@ parser writes the importer for you. | Add a verification stage | `verification/stage_verifier.py` | | Add a new DTO field | `models/*.py` (+ serializer + renderer) | -When in doubt, write the test first — the -[`testBench/`](https://github.com/knowledgestack/ks-xlsx-parser/tree/main/testBench) -corpus is the fastest signal that a pipeline change didn't regress -anything else. +When in doubt, write the test first — the SpreadsheetBench benchmark +(`make bench-robust`) is the fastest signal that a pipeline change didn't +regress anything else. 
diff --git a/examples/demo.py b/examples/demo.py index ceeb75f..dbf6118 100644 --- a/examples/demo.py +++ b/examples/demo.py @@ -15,7 +15,7 @@ from xlsx_parser.pipeline import parse_workbook from xlsx_parser.utils.logging_config import configure_logging -EXAMPLES_DIR = Path(__file__).parent.parent / "testBench" / "real_world" +EXAMPLES_DIR = Path(__file__).parent / "fixtures" def demo_financial_model(): diff --git a/examples/generate_examples.py b/examples/generate_examples.py index 9fecb1f..8d25e01 100644 --- a/examples/generate_examples.py +++ b/examples/generate_examples.py @@ -20,7 +20,7 @@ from openpyxl.worksheet.datavalidation import DataValidation from openpyxl.worksheet.table import Table, TableStyleInfo -EXAMPLES_DIR = Path(__file__).parent.parent / "testBench" / "real_world" +EXAMPLES_DIR = Path(__file__).parent / "fixtures" EXAMPLES_DIR.mkdir(parents=True, exist_ok=True) diff --git a/pyproject.toml b/pyproject.toml index 74f125e..a425853 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,10 +75,8 @@ markers = [ "invariant: structural invariant tests", "corpus: external corpus tests (skipped by default)", "slow: tests taking >10 seconds", - "enterprise: enterprise scorecard benchmarks", - "testbench: 1000-file testBench round-trip (skipped by default; run with -m testbench)", ] -addopts = "-m 'not corpus and not testbench'" +addopts = "-m 'not corpus'" [tool.setuptools.packages.find] where = ["src"] @@ -86,7 +84,7 @@ where = ["src"] [tool.ruff] line-length = 110 target-version = "py310" -extend-exclude = ["testBench", "examples/stress_test", "dist", "build"] +extend-exclude = ["examples/stress_test", "dist", "build"] [tool.ruff.lint] select = [ diff --git a/scripts/build_testbench.py b/scripts/build_testbench.py deleted file mode 100644 index 29f3535..0000000 --- a/scripts/build_testbench.py +++ /dev/null @@ -1,1667 +0,0 @@ -#!/usr/bin/env python3 -""" -build_testbench.py — deterministic generator for the ks-xlsx-parser testBench. 
- -Produces ~1000 `.xlsx` workbooks under ``testBench/generated/`` organised into -three groups: - -* ``matrix/`` — one feature-per-file across every knob the parser exercises - (formulas, merges, named ranges, CF, DV, tables, charts, - styles, dates, errors, hidden rows/cols, hyperlinks, - comments, rich text, number formats, edge addresses, - array formulas, 3D refs, pivot placeholders, huge sheet - names). -* ``combo/`` — randomised combinations of the above at five density - levels (5/10/25/50/100 operations per file) × 80 seeds. -* ``adversarial/`` — files engineered to break parsers: circular formulas, - deep formula chains, 1M-row sparse sheets, 255-sheet - workbooks, unicode/RTL/emoji stress, oversized merges, - broken references, long formula strings. - -Usage ------ - - python scripts/build_testbench.py # builds everything - python scripts/build_testbench.py --force # regenerates even if present - python scripts/build_testbench.py --group matrix - python scripts/build_testbench.py --limit 50 # first 50 files only (smoke) - -The generator is fully deterministic: identical invocations produce -byte-identical files (modulo openpyxl's own timestamping, which we neutralise). -Every file is accompanied by one row in ``testBench/generated/MANIFEST.json`` -describing its group, feature tags, expected cell count, and SHA256. 
-""" - - -import argparse -import hashlib -import json -import random -import string -import sys -from collections.abc import Callable -from dataclasses import dataclass, field -from datetime import date, datetime, time -from pathlib import Path - -from openpyxl import Workbook -from openpyxl.chart import ( - AreaChart, - BarChart, - BubbleChart, - LineChart, - PieChart, - RadarChart, - Reference, - ScatterChart, -) -from openpyxl.comments import Comment -from openpyxl.formatting.rule import ( - CellIsRule, - ColorScaleRule, - DataBarRule, - FormulaRule, - IconSetRule, - Rule, -) -from openpyxl.styles import ( - Alignment, - Border, - Font, - PatternFill, - Side, -) -from openpyxl.utils import get_column_letter -from openpyxl.workbook.defined_name import DefinedName -from openpyxl.worksheet.datavalidation import DataValidation -from openpyxl.worksheet.table import Table, TableStyleInfo - -ROOT = Path(__file__).resolve().parent.parent -OUT_ROOT = ROOT / "testBench" / "generated" -MANIFEST_PATH = OUT_ROOT / "MANIFEST.json" - -# ---------------------------------------------------------------------------- -# Data classes -# ---------------------------------------------------------------------------- - - -@dataclass -class GeneratedFile: - path: Path - group: str - features: list[str] = field(default_factory=list) - expected_sheets: int = 1 - expected_cells: int = 0 - expected_formulas: int = 0 - notes: str = "" - - def to_manifest_row(self) -> dict: - return { - "path": str(self.path.relative_to(OUT_ROOT)), - "group": self.group, - "features": self.features, - "expected_sheets": self.expected_sheets, - "expected_cells": self.expected_cells, - "expected_formulas": self.expected_formulas, - "sha256": sha256_of(self.path), - "size_bytes": self.path.stat().st_size, - "notes": self.notes, - } - - -def sha256_of(path: Path) -> str: - h = hashlib.sha256() - with path.open("rb") as f: - for chunk in iter(lambda: f.read(65536), b""): - h.update(chunk) - return h.hexdigest() - - 
-def _finalize(wb: Workbook, out: Path) -> None: - """Save workbook with deterministic metadata.""" - wb.properties.created = datetime(2025, 1, 1, 0, 0, 0) - wb.properties.modified = datetime(2025, 1, 1, 0, 0, 0) - wb.properties.creator = "ks-xlsx-parser testBench generator" - wb.properties.title = out.stem - out.parent.mkdir(parents=True, exist_ok=True) - wb.save(out) - - -# ---------------------------------------------------------------------------- -# Matrix group — one feature per file -# ---------------------------------------------------------------------------- - - -MATRIX_DIR = OUT_ROOT / "matrix" - - -def _matrix_path(slug: str) -> Path: - return MATRIX_DIR / f"{slug}.xlsx" - - -# --- formulas ------------------------------------------------------------- - -FORMULA_RECIPES: list[tuple[str, str, str]] = [ - # (slug, label, formula expression — evaluated in B1 with constants in A1:A5) - ("formula_sum", "SUM", "=SUM(A1:A5)"), - ("formula_average", "AVERAGE", "=AVERAGE(A1:A5)"), - ("formula_min_max", "MIN/MAX", "=MAX(A1:A5)-MIN(A1:A5)"), - ("formula_count", "COUNT", "=COUNT(A1:A5)"), - ("formula_counta", "COUNTA", "=COUNTA(A1:A5)"), - ("formula_sumif", "SUMIF", "=SUMIF(A1:A5,\">2\")"), - ("formula_sumifs", "SUMIFS", "=SUMIFS(A1:A5,A1:A5,\">1\",A1:A5,\"<5\")"), - ("formula_countif", "COUNTIF", "=COUNTIF(A1:A5,\">2\")"), - ("formula_countifs", "COUNTIFS", "=COUNTIFS(A1:A5,\">0\",A1:A5,\"<5\")"), - ("formula_averageif", "AVERAGEIF", "=AVERAGEIF(A1:A5,\">1\")"), - ("formula_if_basic", "IF", "=IF(A1>2,\"big\",\"small\")"), - ("formula_if_nested", "nested IF", "=IF(A1>4,\"high\",IF(A1>2,\"mid\",\"low\"))"), - ("formula_ifs", "IFS", "=IFS(A1>4,\"high\",A1>2,\"mid\",TRUE,\"low\")"), - ("formula_ifna", "IFNA", "=IFNA(VLOOKUP(99,A1:B5,2,FALSE),\"missing\")"), - ("formula_iferror", "IFERROR", "=IFERROR(1/0,\"err\")"), - ("formula_and_or_not", "AND/OR/NOT", "=AND(A1>0,OR(A2>0,NOT(A3<0)))"), - ("formula_concat", "CONCAT", "=CONCAT(A1,\"-\",A2)"), - ("formula_textjoin", 
"TEXTJOIN", "=TEXTJOIN(\",\",TRUE,A1:A5)"), - ("formula_left_right_mid", "LEFT/RIGHT/MID", "=LEFT(\"abcdef\",3)&RIGHT(\"abcdef\",2)&MID(\"abcdef\",3,2)"), - ("formula_substitute", "SUBSTITUTE", "=SUBSTITUTE(\"foo-bar\",\"-\",\"_\")"), - ("formula_find_search", "FIND/SEARCH", "=FIND(\"b\",\"foobar\")+SEARCH(\"B\",\"foobar\")"), - ("formula_len_trim", "LEN/TRIM", "=LEN(TRIM(\" hi \"))"), - ("formula_upper_lower_proper", "case fns", "=UPPER(\"a\")&LOWER(\"B\")&PROPER(\"hello world\")"), - ("formula_round_roundup_rounddown", "ROUND*", "=ROUND(A1,1)+ROUNDUP(A1,0)+ROUNDDOWN(A1,0)"), - ("formula_int_mod", "INT/MOD", "=INT(A1)+MOD(A1,2)"), - ("formula_abs_sign", "ABS/SIGN", "=ABS(-5)+SIGN(A1)"), - ("formula_sqrt_power", "SQRT/POWER", "=SQRT(16)+POWER(A1,2)"), - ("formula_log_ln_exp", "LOG/LN/EXP", "=LOG(10)+LN(EXP(1))"), - ("formula_date_functions", "DATE fns", "=YEAR(TODAY())+MONTH(TODAY())+DAY(TODAY())"), - ("formula_datedif", "DATEDIF", "=DATEDIF(DATE(2020,1,1),DATE(2025,1,1),\"Y\")"), - ("formula_edate_eomonth", "EDATE/EOMONTH", "=EDATE(DATE(2020,1,1),12)+EOMONTH(DATE(2020,1,1),3)"), - ("formula_weekday_workday", "WEEKDAY/WORKDAY", "=WEEKDAY(TODAY())+WORKDAY(TODAY(),5)"), - ("formula_vlookup", "VLOOKUP", "=VLOOKUP(A1,A1:B5,2,FALSE)"), - ("formula_hlookup", "HLOOKUP", "=HLOOKUP(A1,A1:E2,2,FALSE)"), - ("formula_xlookup", "XLOOKUP", "=XLOOKUP(A1,A1:A5,B1:B5,\"not found\")"), - ("formula_index_match", "INDEX/MATCH", "=INDEX(A1:A5,MATCH(A2,A1:A5,0))"), - ("formula_offset", "OFFSET", "=OFFSET(A1,2,0)"), - ("formula_indirect", "INDIRECT", "=INDIRECT(\"A\"&2)"), - ("formula_rank", "RANK", "=RANK(A1,A1:A5,0)"), - ("formula_large_small", "LARGE/SMALL", "=LARGE(A1:A5,2)+SMALL(A1:A5,2)"), - ("formula_choose", "CHOOSE", "=CHOOSE(2,\"a\",\"b\",\"c\")"), - ("formula_switch", "SWITCH", "=SWITCH(A1,1,\"one\",2,\"two\",\"other\")"), - ("formula_array_cse", "array CSE", "{=SUM(A1:A5*A1:A5)}"), - ("formula_long", "8000-char expression", "=" + "+".join(f"A{((i % 5) + 1)}" for i in 
range(400))), -] - - -def build_formula_files() -> list[GeneratedFile]: - files: list[GeneratedFile] = [] - for slug, label, formula in FORMULA_RECIPES: - wb = Workbook() - ws = wb.active - ws.title = "Formula" - for i in range(1, 6): - ws.cell(row=i, column=1, value=i * 1.5) - ws["B1"] = formula - ws["D1"] = f"Test: {label}" - out = _matrix_path(slug) - _finalize(wb, out) - files.append( - GeneratedFile( - path=out, - group="matrix/formula", - features=["formula", slug.replace("formula_", "")], - expected_cells=7, - expected_formulas=1, - ) - ) - return files - - -# --- merged cells --------------------------------------------------------- - - -def build_merge_files() -> list[GeneratedFile]: - files: list[GeneratedFile] = [] - recipes = [ - ("merge_horizontal_small", [("A1:C1",)]), - ("merge_horizontal_wide", [(f"A1:{get_column_letter(20)}1",)]), - ("merge_vertical_small", [("A1:A5",)]), - ("merge_vertical_tall", [("A1:A100",)]), - ("merge_rectangular", [("A1:E5",)]), - ("merge_many_horizontal", [(f"A{r}:C{r}",) for r in range(1, 51)]), - ("merge_many_vertical", [(f"{get_column_letter(c)}1:{get_column_letter(c)}30",) for c in range(1, 11)]), - ("merge_grid_5x5", [(f"{get_column_letter(2*c-1)}{2*r-1}:{get_column_letter(2*c)}{2*r}",) for r in range(1, 6) for c in range(1, 6)]), - ("merge_diagonal_steps", [(f"{get_column_letter(2*i-1)}{2*i-1}:{get_column_letter(2*i)}{2*i}",) for i in range(1, 8)]), - ("merge_header_3_levels", [("A1:F1",), ("A2:C2",), ("D2:F2",), ("A3:B3",), ("C3:C3",), ("D3:E3",), ("F3:F3",)]), - ("merge_with_value_only_in_master", [("A1:C3",)]), - ("merge_around_data", [("A1:C1",), ("A5:C5",)]), - ("merge_single_cell_noop", [("A1:A1",)]), # degenerate - ("merge_adjacent_row_pair", [("A1:B1",), ("A2:B2",)]), - ("merge_wide_header_narrow_data", [("A1:J1",)]), - ("merge_mixed_sizes", [("A1:B2",), ("C1:E1",), ("A4:A10",), ("D4:F6",)]), - ("merge_100_singletons", [(f"{get_column_letter(((i-1) % 20)+1)}{((i-1)//20)+1}:{get_column_letter(((i-1) % 
20)+1)}{((i-1)//20)+1}",) for i in range(1, 101)]), - ("merge_full_row", [("A1:Z1",)]), - ("merge_full_column_short", [("A1:A50",)]), - ("merge_nonadjacent_blocks", [("A1:C3",), ("F1:H3",), ("A5:C7",), ("F5:H7",)]), - ("merge_within_table_header", [("A1:D1",)]), # we'll add a table below - ("merge_empty_range", [("B2:D4",)]), # no data in master - ("merge_unicode_content", [("A1:C1",)]), - ("merge_with_rich_formatting", [("A1:C1",)]), - ("merge_column_header_stack", [("A1:A2",), ("B1:B2",), ("C1:C2",)]), - ("merge_report_grid", [("A1:D1",), ("A2:A10",), ("B2:D2",), ("B3:B10",), ("C3:D3",)]), - ("merge_large_single", [("A1:Z100",)]), - ("merge_thousand_cells", [("A1:J100",)]), - ("merge_within_table_footer", [("A11:D11",)]), - ("merge_spanning_formula_range", [("A1:C1",)]), - ] - for slug, ranges in recipes: - wb = Workbook() - ws = wb.active - ws.title = "Merges" - for i, (rng,) in enumerate(ranges): - anchor = rng.split(":")[0] - try: - ws[anchor] = f"m{i+1}" # must write before merging; skip if cell is already merged - except AttributeError: - pass - try: - ws.merge_cells(rng) - except Exception: - pass - if slug == "merge_with_value_only_in_master": - ws["A1"] = "only-master" - if slug == "merge_within_table_header": - for c, h in enumerate(["a", "b", "c", "d"], 1): - ws.cell(row=2, column=c, value=h) - for r in range(3, 8): - for c in range(1, 5): - ws.cell(row=r, column=c, value=r * c) - ws.add_table(Table(displayName="T1", ref="A2:D7")) - if slug == "merge_unicode_content": - ws["A1"] = "éñÜ日本語 🚀 حرف" - if slug == "merge_with_rich_formatting": - ws["A1"].font = Font(bold=True, size=14, color="FF0000") - ws["A1"].fill = PatternFill("solid", start_color="FFFF00") - ws["A1"].alignment = Alignment(horizontal="center", vertical="center") - out = _matrix_path(slug) - _finalize(wb, out) - files.append( - GeneratedFile( - path=out, - group="matrix/merge", - features=["merged_cells", slug], - expected_cells=len(ranges), - ) - ) - return files - - -# --- named ranges 
--------------------------------------------------------- - - -def build_named_range_files() -> list[GeneratedFile]: - files: list[GeneratedFile] = [] - recipes = [ - ("named_workbook_scope", "Total", "Sheet1!$A$1", None), - ("named_sheet_scope", "SheetLocal", "Sheet1!$B$1", "Sheet1"), - ("named_constant", "TaxRate", "0.07", None), - ("named_range_multi_cell", "Prices", "Sheet1!$A$1:$A$10", None), - ("named_formula", "Doubled", "Sheet1!$A$1*2", None), - ("named_with_unicode", "Mẹtá", "Sheet1!$A$1", None), - ("named_long_identifier", "very_long_identifier_" + "x" * 50, "Sheet1!$A$1", None), - ("named_escaped_sheet", "Quoted", "'Sheet 2'!$A$1", None), # needs 'Sheet 2' - ("named_external_like", "ExternalLike", "[Budget.xlsx]Sheet1!$A$1", None), - ("named_list_variation", "ChoiceList", "Sheet1!$D$1:$D$5", None), - ("named_col_range", "FullColumn", "Sheet1!$A:$A", None), - ("named_row_range", "FullRow", "Sheet1!$1:$1", None), - ("named_cross_sheet", "CrossRef", "Other!$A$1", None), # needs Other sheet - ("named_multi_area", "Islands", "Sheet1!$A$1,Sheet1!$C$3", None), - ("named_with_hash_prefix", "_Prefix", "Sheet1!$A$1", None), - ("named_digits", "X1", "Sheet1!$A$1", None), - ("named_empty_formula_error", "ErrRef", "#REF!", None), - ("named_boolean_constant", "IsOn", "TRUE", None), - ("named_string_constant", "Greeting", '"hello"', None), - ("named_table_column_ref", "TableCol", "Table1[Value]", None), # needs table - ] - for slug, name, ref, scope in recipes: - wb = Workbook() - ws = wb.active - ws.title = "Sheet1" - for i in range(1, 11): - ws.cell(row=i, column=1, value=i) - ws.cell(row=i, column=4, value=f"item{i}") - if scope == "Sheet1": - ws.defined_names.add(DefinedName(name, attr_text=ref)) - elif slug == "named_escaped_sheet": - wb.create_sheet("Sheet 2")["A1"] = 42 - wb.defined_names.add(DefinedName(name, attr_text=ref)) - elif slug == "named_cross_sheet": - wb.create_sheet("Other")["A1"] = 99 - wb.defined_names.add(DefinedName(name, attr_text=ref)) - elif 
slug == "named_table_column_ref": - for c, h in enumerate(["ID", "Value"], 1): - ws.cell(row=1, column=c, value=h) - for r in range(2, 6): - ws.cell(row=r, column=1, value=r) - ws.cell(row=r, column=2, value=r * 10) - ws.add_table(Table(displayName="Table1", ref="A1:B5")) - wb.defined_names.add(DefinedName(name, attr_text=ref)) - else: - wb.defined_names.add(DefinedName(name, attr_text=ref)) - out = _matrix_path(slug) - _finalize(wb, out) - files.append( - GeneratedFile( - path=out, - group="matrix/named_range", - features=["named_range", slug], - expected_cells=14, - ) - ) - return files - - -# --- data validation ------------------------------------------------------ - - -def build_data_validation_files() -> list[GeneratedFile]: - files: list[GeneratedFile] = [] - recipes = [ - ("dv_list_literal", {"type": "list", "formula1": '"Red,Green,Blue"'}), - ("dv_list_range", {"type": "list", "formula1": "=$D$1:$D$5"}), - ("dv_whole_between", {"type": "whole", "operator": "between", "formula1": "1", "formula2": "100"}), - ("dv_decimal_gt", {"type": "decimal", "operator": "greaterThan", "formula1": "0.5"}), - ("dv_date_after", {"type": "date", "operator": "greaterThan", "formula1": "DATE(2024,1,1)"}), - ("dv_time_before", {"type": "time", "operator": "lessThan", "formula1": "TIME(12,0,0)"}), - ("dv_textlength", {"type": "textLength", "operator": "lessThan", "formula1": "10"}), - ("dv_custom", {"type": "custom", "formula1": "=A1>0"}), - ("dv_list_unicode", {"type": "list", "formula1": '"红,绿,蓝"'}), - ("dv_list_one_item", {"type": "list", "formula1": '"Only"'}), - ("dv_list_many_items", {"type": "list", "formula1": '"' + ",".join(f"opt{i}" for i in range(1, 31)) + '"'}), - ("dv_with_error_message", {"type": "list", "formula1": '"A,B"', "error": "pick A or B", "errorTitle": "Err"}), - ("dv_with_prompt", {"type": "list", "formula1": '"A,B"', "prompt": "select letter", "promptTitle": "Hint"}), - ("dv_ignore_blank", {"type": "list", "formula1": '"A,B"', "allowBlank": True}), - 
("dv_multiple_ranges", {"type": "list", "formula1": '"A,B"'}), # will apply to multiple ranges - ("dv_whole_equal", {"type": "whole", "operator": "equal", "formula1": "42"}), - ("dv_date_between", {"type": "date", "operator": "between", "formula1": "DATE(2020,1,1)", "formula2": "DATE(2025,12,31)"}), - ("dv_decimal_not_between", {"type": "decimal", "operator": "notBetween", "formula1": "0", "formula2": "1"}), - ("dv_textlength_greater", {"type": "textLength", "operator": "greaterThan", "formula1": "3"}), - ("dv_custom_cross_cell", {"type": "custom", "formula1": "=AND(A1>0,B1<100)"}), - ] - for slug, kwargs in recipes: - wb = Workbook() - ws = wb.active - ws.title = "DV" - for r in range(1, 6): - ws.cell(row=r, column=4, value=f"Option{r}") - dv_kwargs = {k: v for k, v in kwargs.items() if k not in {"error", "errorTitle", "prompt", "promptTitle", "allowBlank"}} - dv = DataValidation(**dv_kwargs) - if "error" in kwargs: - dv.error = kwargs["error"] - dv.errorTitle = kwargs.get("errorTitle", "Err") - dv.showErrorMessage = True - if "prompt" in kwargs: - dv.prompt = kwargs["prompt"] - dv.promptTitle = kwargs.get("promptTitle", "Hint") - dv.showInputMessage = True - if kwargs.get("allowBlank"): - dv.allowBlank = True - ws.add_data_validation(dv) - if slug == "dv_multiple_ranges": - dv.add("A1:A5") - dv.add("C1:C5") - else: - dv.add("A1:A10") - out = _matrix_path(slug) - _finalize(wb, out) - files.append( - GeneratedFile( - path=out, - group="matrix/data_validation", - features=["data_validation", slug], - expected_cells=5, - ) - ) - return files - - -# --- conditional formatting ----------------------------------------------- - - -def build_conditional_formatting_files() -> list[GeneratedFile]: - files: list[GeneratedFile] = [] - - def _seed_ws(ws): - for r in range(1, 11): - ws.cell(row=r, column=1, value=r) - ws.cell(row=r, column=2, value=11 - r) - ws.cell(row=r, column=3, value=(r * 7) % 10) - - recipes: list[tuple[str, Callable[[object], None]]] = [ - 
("cf_cellis_greater", lambda ws: ws.conditional_formatting.add( - "A1:A10", - CellIsRule(operator="greaterThan", formula=["5"], fill=PatternFill("solid", start_color="FFC7CE")), - )), - ("cf_cellis_less", lambda ws: ws.conditional_formatting.add( - "A1:A10", - CellIsRule(operator="lessThan", formula=["3"], fill=PatternFill("solid", start_color="C6EFCE")), - )), - ("cf_cellis_between", lambda ws: ws.conditional_formatting.add( - "A1:A10", - CellIsRule(operator="between", formula=["3", "7"], fill=PatternFill("solid", start_color="FFEB9C")), - )), - ("cf_color_scale_2", lambda ws: ws.conditional_formatting.add( - "A1:A10", - ColorScaleRule(start_type="min", start_color="FFAA0000", - end_type="max", end_color="FF00AA00"), - )), - ("cf_color_scale_3", lambda ws: ws.conditional_formatting.add( - "B1:B10", - ColorScaleRule(start_type="min", start_color="FFAA0000", - mid_type="percentile", mid_value=50, mid_color="FFFFFFFF", - end_type="max", end_color="FF00AA00"), - )), - ("cf_databar", lambda ws: ws.conditional_formatting.add( - "C1:C10", - DataBarRule(start_type="min", end_type="max", color="FF638EC6"), - )), - ("cf_iconset_3traffic", lambda ws: ws.conditional_formatting.add( - "A1:A10", - IconSetRule("3TrafficLights1", "percent", [0, 33, 67]), - )), - ("cf_iconset_5arrows", lambda ws: ws.conditional_formatting.add( - "B1:B10", - IconSetRule("5Arrows", "percent", [0, 20, 40, 60, 80]), - )), - ("cf_formula_rule", lambda ws: ws.conditional_formatting.add( - "A1:A10", - FormulaRule(formula=["MOD(ROW(),2)=0"], fill=PatternFill("solid", start_color="DDDDDD")), - )), - # Note: omit dxfId; openpyxl cannot round-trip Rule(dxfId=0) unless - # the differential style table has a matching entry. 
- ("cf_top10", lambda ws: ws.conditional_formatting.add( - "A1:C10", Rule(type="top10", rank=3), - )), - ("cf_unique_values", lambda ws: ws.conditional_formatting.add( - "A1:A10", Rule(type="uniqueValues"), - )), - ("cf_duplicate_values", lambda ws: ws.conditional_formatting.add( - "A1:A10", Rule(type="duplicateValues"), - )), - ("cf_contains_text", lambda ws: ws.conditional_formatting.add( - "A1:A10", Rule(type="containsText", operator="containsText", text="5"), - )), - ("cf_above_average", lambda ws: ws.conditional_formatting.add( - "A1:A10", Rule(type="aboveAverage", aboveAverage=True), - )), - ("cf_below_average", lambda ws: ws.conditional_formatting.add( - "A1:A10", Rule(type="aboveAverage", aboveAverage=False), - )), - ("cf_multiple_rules_same_range", lambda ws: ( - ws.conditional_formatting.add("A1:A10", CellIsRule(operator="greaterThan", formula=["7"], fill=PatternFill("solid", start_color="FF0000"))), - ws.conditional_formatting.add("A1:A10", CellIsRule(operator="lessThan", formula=["3"], fill=PatternFill("solid", start_color="00FF00"))), - )), - ("cf_overlapping_ranges", lambda ws: ( - ws.conditional_formatting.add("A1:B5", ColorScaleRule(start_type="min", start_color="FFFF0000", end_type="max", end_color="FF00FF00")), - ws.conditional_formatting.add("B3:C10", DataBarRule(start_type="min", end_type="max", color="FF0000FF")), - )), - ("cf_single_cell", lambda ws: ws.conditional_formatting.add( - "A1", CellIsRule(operator="equal", formula=["1"], fill=PatternFill("solid", start_color="FFFF00")), - )), - ("cf_large_range", lambda ws: ws.conditional_formatting.add( - "A1:Z100", CellIsRule(operator="greaterThan", formula=["0"], fill=PatternFill("solid", start_color="EEEEEE")), - )), - ("cf_entire_column", lambda ws: ws.conditional_formatting.add( - "A1:A1048576", CellIsRule(operator="greaterThan", formula=["5"], fill=PatternFill("solid", start_color="FFC7CE")), - )), - ("cf_formula_complex", lambda ws: ws.conditional_formatting.add( - "A1:A10", - 
FormulaRule(formula=["AND(A1>3,A1<8)"], fill=PatternFill("solid", start_color="99FF99")), - )), - ("cf_iconset_3signs", lambda ws: ws.conditional_formatting.add( - "A1:A10", - IconSetRule("3Signs", "percent", [0, 33, 67]), - )), - ("cf_iconset_4ratings", lambda ws: ws.conditional_formatting.add( - "A1:A10", - IconSetRule("4Rating", "percent", [0, 25, 50, 75]), - )), - ("cf_color_scale_percentile", lambda ws: ws.conditional_formatting.add( - "A1:A10", - ColorScaleRule(start_type="percentile", start_value=10, start_color="FF0000FF", - end_type="percentile", end_value=90, end_color="FFFF0000"), - )), - ("cf_databar_negative", lambda ws: ws.conditional_formatting.add( - "C1:C10", - DataBarRule(start_type="min", end_type="max", color="FFFF0000", showValue=False), - )), - ] - - for slug, apply in recipes: - wb = Workbook() - ws = wb.active - ws.title = "CF" - _seed_ws(ws) - apply(ws) - out = _matrix_path(slug) - _finalize(wb, out) - files.append( - GeneratedFile( - path=out, - group="matrix/conditional_formatting", - features=["conditional_formatting", slug], - expected_cells=30, - ) - ) - return files - - -# --- tables --------------------------------------------------------------- - - -def build_table_files() -> list[GeneratedFile]: - files: list[GeneratedFile] = [] - for idx, (rows, cols, style, totals) in enumerate([ - (3, 2, "TableStyleLight1", False), - (10, 3, "TableStyleMedium2", False), - (50, 5, "TableStyleMedium9", True), - (100, 8, "TableStyleDark1", False), - (5, 20, "TableStyleLight9", False), - (30, 4, "TableStyleMedium1", True), - (3, 1, "TableStyleLight5", False), - (3, 26, "TableStyleMedium3", False), - (3, 2, None, False), - (10, 3, "TableStyleMedium4", True), - (200, 6, "TableStyleMedium5", False), - (3, 2, "TableStyleLight13", False), - (3, 2, "TableStyleLight14", False), - (3, 2, "TableStyleLight15", False), - (3, 2, "TableStyleLight16", False), - (3, 2, "TableStyleLight17", False), - (3, 2, "TableStyleLight18", False), - (3, 2, "TableStyleLight19", 
False), - (3, 2, "TableStyleLight20", False), - (3, 2, "TableStyleLight21", False), - ]): - slug = f"table_{idx:02d}_{rows}r_{cols}c" - wb = Workbook() - ws = wb.active - ws.title = "Table" - for c in range(1, cols + 1): - ws.cell(row=1, column=c, value=f"H{c}") - for r in range(2, rows + 2): - for c in range(1, cols + 1): - ws.cell(row=r, column=c, value=(r + c) % 97) - ref = f"A1:{get_column_letter(cols)}{rows + 1}" - tab = Table(displayName=f"Tbl{idx}", ref=ref) - if style: - tab.tableStyleInfo = TableStyleInfo(name=style, showRowStripes=True) - if totals: - tab.totalsRowShown = False # openpyxl can be finicky about totals; keep simple - ws.add_table(tab) - out = _matrix_path(slug) - _finalize(wb, out) - files.append( - GeneratedFile( - path=out, - group="matrix/table", - features=["table", f"{rows}r{cols}c"], - expected_cells=(rows + 1) * cols, - ) - ) - return files - - -# --- charts --------------------------------------------------------------- - - -def build_chart_files() -> list[GeneratedFile]: - files: list[GeneratedFile] = [] - chart_types = [ - ("chart_bar", BarChart, {"type": "col"}), - ("chart_bar_stacked", BarChart, {"type": "col", "grouping": "stacked", "overlap": 100}), - ("chart_bar_horizontal", BarChart, {"type": "bar"}), - ("chart_line", LineChart, {}), - ("chart_pie", PieChart, {}), - ("chart_area", AreaChart, {}), - ("chart_radar", RadarChart, {}), - ("chart_scatter", ScatterChart, {}), - ("chart_bubble", BubbleChart, {}), - ("chart_with_title", BarChart, {"title": "Q1 Sales"}), - ("chart_no_title", BarChart, {}), - ("chart_many_series", BarChart, {"series_count": 6}), - ("chart_one_datapoint", BarChart, {"rows": 2}), - ("chart_long_labels", BarChart, {"long_labels": True}), - ("chart_unicode_labels", BarChart, {"unicode": True}), - ("chart_two_charts_one_sheet", BarChart, {"double": True}), - ("chart_chart_plus_table", BarChart, {"with_table": True}), - ("chart_line_dashed", LineChart, {"smooth": True}), - ("chart_pie_exploded", PieChart, 
{}), - ("chart_scatter_with_lines", ScatterChart, {"scatterStyle": "lineMarker"}), - ] - for slug, ChartCls, opts in chart_types: - wb = Workbook() - ws = wb.active - ws.title = "Data" - rows = opts.pop("rows", 6) - series_count = opts.pop("series_count", 2) - long_labels = opts.pop("long_labels", False) - unicode_flag = opts.pop("unicode", False) - double = opts.pop("double", False) - with_table = opts.pop("with_table", False) - - ws.cell(row=1, column=1, value="Label") - for s in range(1, series_count + 1): - ws.cell(row=1, column=1 + s, value=f"Series{s}") - for r in range(2, rows + 1): - label = f"Item{r-1}" - if long_labels: - label = "A very long label " * 5 + str(r) - if unicode_flag: - label = f"标签{r} 🚀" - ws.cell(row=r, column=1, value=label) - for s in range(1, series_count + 1): - ws.cell(row=r, column=1 + s, value=((r * s * 7) % 50) + 1) - - chart = ChartCls() - for k, v in opts.items(): - try: - setattr(chart, k, v) - except Exception: - pass - data = Reference(ws, min_col=2, min_row=1, max_col=1 + series_count, max_row=rows) - cats = Reference(ws, min_col=1, min_row=2, max_row=rows) - chart.add_data(data, titles_from_data=True) - try: - chart.set_categories(cats) - except Exception: - pass - ws.add_chart(chart, f"{get_column_letter(series_count + 3)}2") - - if double: - chart2 = BarChart() - chart2.add_data(data, titles_from_data=True) - chart2.set_categories(cats) - ws.add_chart(chart2, "H20") - if with_table: - ws.add_table(Table(displayName="ChartTable", ref=f"A1:{get_column_letter(series_count + 1)}{rows}")) - - out = _matrix_path(slug) - _finalize(wb, out) - files.append( - GeneratedFile( - path=out, - group="matrix/chart", - features=["chart", slug], - expected_cells=rows * (series_count + 1), - ) - ) - return files - - -# --- rich text / styles / fonts ------------------------------------------ - - -def build_style_files() -> list[GeneratedFile]: - files: list[GeneratedFile] = [] - styles = [ - ("style_bold", lambda c: setattr(c, "font", 
Font(bold=True))), - ("style_italic", lambda c: setattr(c, "font", Font(italic=True))), - ("style_underline", lambda c: setattr(c, "font", Font(underline="single"))), - ("style_strike", lambda c: setattr(c, "font", Font(strike=True))), - ("style_color_red", lambda c: setattr(c, "font", Font(color="FF0000"))), - ("style_font_size_24", lambda c: setattr(c, "font", Font(size=24))), - ("style_font_family_courier", lambda c: setattr(c, "font", Font(name="Courier New"))), - ("style_bg_yellow", lambda c: setattr(c, "fill", PatternFill("solid", start_color="FFFF00"))), - ("style_bg_pattern_gray125", lambda c: setattr(c, "fill", PatternFill(patternType="gray125"))), - ("style_border_thin_all", lambda c: setattr(c, "border", Border(left=Side(style="thin"), right=Side(style="thin"), top=Side(style="thin"), bottom=Side(style="thin")))), - ("style_border_thick_bottom", lambda c: setattr(c, "border", Border(bottom=Side(style="thick")))), - ("style_border_dashed", lambda c: setattr(c, "border", Border(top=Side(style="dashed")))), - ("style_border_double", lambda c: setattr(c, "border", Border(bottom=Side(style="double")))), - ("style_alignment_center", lambda c: setattr(c, "alignment", Alignment(horizontal="center", vertical="center"))), - ("style_alignment_wrap", lambda c: setattr(c, "alignment", Alignment(wrap_text=True))), - ("style_alignment_rotate_45", lambda c: setattr(c, "alignment", Alignment(text_rotation=45))), - ("style_alignment_rotate_90", lambda c: setattr(c, "alignment", Alignment(text_rotation=90))), - ("style_indent", lambda c: setattr(c, "alignment", Alignment(indent=3))), - ("style_shrink_to_fit", lambda c: setattr(c, "alignment", Alignment(shrink_to_fit=True))), - ("style_vertical_text", lambda c: setattr(c, "alignment", Alignment(text_rotation=255))), - ("style_combined", lambda c: ( - setattr(c, "font", Font(bold=True, italic=True, size=16, color="0000FF")), - setattr(c, "fill", PatternFill("solid", start_color="FFE0E0")), - setattr(c, "alignment", 
Alignment(horizontal="center", vertical="center", wrap_text=True)), - setattr(c, "border", Border(left=Side("thin"), right=Side("thin"), top=Side("medium"), bottom=Side("medium"))), - )), - ("style_number_format_currency", lambda c: setattr(c, "number_format", "$#,##0.00")), - ("style_number_format_percent", lambda c: setattr(c, "number_format", "0.0%")), - ("style_number_format_scientific", lambda c: setattr(c, "number_format", "0.00E+00")), - ("style_number_format_date_iso", lambda c: setattr(c, "number_format", "yyyy-mm-dd")), - ("style_number_format_date_long", lambda c: setattr(c, "number_format", "dddd, mmmm dd, yyyy")), - ("style_number_format_time", lambda c: setattr(c, "number_format", "hh:mm:ss")), - ("style_number_format_negative_red", lambda c: setattr(c, "number_format", "#,##0;[Red]-#,##0")), - ("style_number_format_accounting", lambda c: setattr(c, "number_format", "_($* #,##0.00_)")), - ("style_number_format_fraction", lambda c: setattr(c, "number_format", "# ?/?")), - ] - for slug, apply in styles: - wb = Workbook() - ws = wb.active - ws.title = "Style" - ws["A1"] = "Styled Cell" - if "number_format" in slug: - ws["A1"] = 1234.567 - if "date" in slug or "time" in slug: - ws["A1"] = datetime(2024, 6, 15, 14, 30, 45) - apply(ws["A1"]) - out = _matrix_path(slug) - _finalize(wb, out) - files.append( - GeneratedFile( - path=out, - group="matrix/style", - features=["style", slug], - expected_cells=1, - ) - ) - return files - - -# --- dates & times -------------------------------------------------------- - - -def build_date_files() -> list[GeneratedFile]: - files: list[GeneratedFile] = [] - entries = [ - ("date_today", datetime.now()), - ("date_epoch_1900", datetime(1900, 1, 1)), - ("date_epoch_1904", datetime(1904, 1, 2)), - ("date_y2k", datetime(2000, 1, 1)), - ("date_future_2099", datetime(2099, 12, 31)), - ("date_leap_year", datetime(2020, 2, 29)), - ("date_weird_feb28", datetime(1900, 2, 28)), - ("date_first_valid", datetime(1900, 3, 1)), - 
("date_midnight", datetime(2024, 6, 1, 0, 0, 0)), - ("date_nearmidnight", datetime(2024, 6, 1, 23, 59, 59)), - ("date_iso_string", "2024-06-15"), - ("date_us_string", "06/15/2024"), - ("date_eu_string", "15/06/2024"), - ("date_just_time", time(13, 30, 0)), - ("date_date_only", date(2024, 6, 15)), - ("date_with_timedelta_format", datetime(2024, 6, 15)), - ("date_mixed_formats_in_column", None), - ("date_fractional_days", 44500.5), # excel serial - ("date_negative_serial", -1), # invalid - ("date_text_like_date", "2024-06-15 but not really"), - ] - for slug, val in entries: - wb = Workbook() - ws = wb.active - ws.title = "Dates" - if slug == "date_mixed_formats_in_column": - ws["A1"] = datetime(2024, 1, 1) - ws["A2"] = "2024-02-01" - ws["A3"] = 44593 - ws["A4"] = date(2024, 4, 1) - ws["A5"] = datetime(2024, 5, 1, 12, 30) - else: - ws["A1"] = val - ws["A1"].number_format = "yyyy-mm-dd hh:mm:ss" - out = _matrix_path(slug) - _finalize(wb, out) - files.append( - GeneratedFile(path=out, group="matrix/date", features=["date", slug], expected_cells=1), - ) - return files - - -# --- errors --------------------------------------------------------------- - - -def build_error_files() -> list[GeneratedFile]: - files: list[GeneratedFile] = [] - errors = [ - ("error_div_zero", "=1/0"), - ("error_name", "=UNKNOWN_FN()"), - ("error_ref", "=#REF!"), - ("error_value", "=\"a\"+1"), - ("error_num", "=SQRT(-1)"), - ("error_null", "=A1 A2"), # intersection of disjoint ranges - ("error_na", "=NA()"), - ("error_getting_data", "=VLOOKUP(999,A1:B2,2,FALSE)"), - ("error_mixed_with_text", "=IF(TRUE,1/0,\"ok\")"), - ("error_chained", "=1/0+2"), - ("error_deliberate_bad_ref", "=BadSheet!A1"), - ("error_unclosed_paren", "=SUM(A1"), # may get rewritten by openpyxl - ("error_bad_range", "=SUM(A1:)"), - ("error_too_many_args", "=IF(1,2,3,4,5)"), - ("error_circular_simple", "=A1"), # A1 refers to itself - ] - for slug, formula in errors: - wb = Workbook() - ws = wb.active - ws.title = "Err" - try: - 
if slug == "error_circular_simple": - ws["A1"] = formula - else: - ws["A2"] = 1 - ws["A1"] = formula - except Exception: - pass # a few are too malformed even for openpyxl to accept - out = _matrix_path(slug) - try: - _finalize(wb, out) - except Exception: - continue - files.append( - GeneratedFile(path=out, group="matrix/error", features=["error", slug], expected_cells=2, expected_formulas=1), - ) - return files - - -# --- hidden rows/cols/sheets --------------------------------------------- - - -def build_hidden_files() -> list[GeneratedFile]: - files: list[GeneratedFile] = [] - specs = [ - ("hidden_single_row", "row", [3]), - ("hidden_single_col", "col", ["B"]), - ("hidden_many_rows", "row", list(range(2, 20, 2))), - ("hidden_many_cols", "col", ["B", "D", "F", "H"]), - ("hidden_first_row", "row", [1]), - ("hidden_last_row", "row", [100]), - ("hidden_row_at_boundary", "row", [50, 51, 52]), - ("hidden_entire_block", "row", list(range(5, 15))), - ("hidden_sheet_tab", "sheet", None), - ("hidden_very_hidden_sheet", "veryhidden", None), - ("hidden_with_outline_group", "outline", None), - ("hidden_mixed_rows_cols", "mixed", None), - ] - for slug, kind, items in specs: - wb = Workbook() - ws = wb.active - ws.title = "Main" - for r in range(1, 30): - for c in range(1, 10): - ws.cell(row=r, column=c, value=(r + c) % 100) - if kind == "row": - for r in items: - ws.row_dimensions[r].hidden = True - elif kind == "col": - for col in items: - ws.column_dimensions[col].hidden = True - elif kind == "sheet": - hs = wb.create_sheet("HiddenSheet") - hs["A1"] = "hidden content" - hs.sheet_state = "hidden" - elif kind == "veryhidden": - hs = wb.create_sheet("VeryHidden") - hs["A1"] = "very hidden" - hs.sheet_state = "veryHidden" - elif kind == "outline": - for r in range(5, 15): - ws.row_dimensions[r].outline_level = 1 - ws.row_dimensions[r].hidden = True - elif kind == "mixed": - ws.row_dimensions[3].hidden = True - ws.row_dimensions[5].hidden = True - 
ws.column_dimensions["C"].hidden = True - ws.column_dimensions["E"].hidden = True - hs = wb.create_sheet("MixedHidden") - hs.sheet_state = "hidden" - out = _matrix_path(slug) - _finalize(wb, out) - files.append( - GeneratedFile(path=out, group="matrix/hidden", features=["hidden", slug], expected_cells=29 * 9), - ) - return files - - -# --- edge addresses ------------------------------------------------------- - - -def build_edge_address_files() -> list[GeneratedFile]: - files: list[GeneratedFile] = [] - specs = [ - ("addr_xfd1", "XFD1", "lastcol_row1"), - ("addr_a1048576", "A1048576", "col_a_lastrow"), - ("addr_xfd1048576", "XFD1048576", "last_cell"), - ("addr_zz1000", "ZZ1000", "mid_extreme"), - ("addr_aaa1", "AAA1", "col_aaa"), - ("addr_aa500", "AA500", "col_aa_500"), - ("addr_very_sparse", None, "sparse"), - ("addr_column_1000", f"{get_column_letter(1000)}1", "col_1000"), - ("addr_row_100000", "A100000", "row_100k"), - ("addr_gaps", None, "gaps"), - ] - for slug, addr, kind in specs: - wb = Workbook() - ws = wb.active - ws.title = "Edge" - ws["A1"] = "anchor" - if kind == "sparse": - ws["A1"] = "tl" - ws["XFD1048576"] = "br" - elif kind == "gaps": - for offset in [0, 100, 1000, 10000]: - ws.cell(row=1 + offset, column=1 + min(offset // 100, 50), value=f"v{offset}") - elif addr: - ws[addr] = f"marker_{slug}" - out = _matrix_path(slug) - _finalize(wb, out) - files.append( - GeneratedFile(path=out, group="matrix/edge_address", features=["edge_address", slug], expected_cells=2), - ) - return files - - -# --- sheet name variations ------------------------------------------------ - - -SHEET_NAME_VARIANTS = [ - ("sheetname_ascii", "Simple"), - ("sheetname_spaces", "Has Spaces"), - ("sheetname_quote", "Has'Quote"), - ("sheetname_unicode_jp", "日本語シート"), - ("sheetname_unicode_emoji", "📊 Sheet"), - ("sheetname_leading_digits", "1stSheet"), - ("sheetname_long_30chars", "X" * 30), - ("sheetname_dash_underscore", "my-sheet_name"), - ("sheetname_hash_unicode", "Résumé-2025"), 
- ("sheetname_parens", "Sheet (copy)"), -] - - -def build_sheet_name_files() -> list[GeneratedFile]: - files: list[GeneratedFile] = [] - for slug, name in SHEET_NAME_VARIANTS: - wb = Workbook() - ws = wb.active - try: - ws.title = name[:31] # Excel limit - except Exception: - ws.title = "Fallback" - ws["A1"] = f"in {name!r}" - out = _matrix_path(slug) - _finalize(wb, out) - files.append( - GeneratedFile(path=out, group="matrix/sheet_name", features=["sheet_name", slug], expected_cells=1), - ) - return files - - -# --- hyperlinks / comments / misc ---------------------------------------- - - -def build_misc_files() -> list[GeneratedFile]: - files: list[GeneratedFile] = [] - - # hyperlinks - hl_specs = [ - ("hyperlink_external_http", "https://example.com"), - ("hyperlink_external_https", "https://www.anthropic.com"), - ("hyperlink_mailto", "mailto:test@example.com"), - ("hyperlink_file", "file:///tmp/x.txt"), - ("hyperlink_internal_cell", "#Sheet1!B5"), - ("hyperlink_internal_named", "#NamedRng"), - ("hyperlink_many_links", None), - ] - for slug, url in hl_specs: - wb = Workbook() - ws = wb.active - ws.title = "Sheet1" - if slug == "hyperlink_many_links": - for i in range(1, 21): - ws.cell(row=i, column=1, value=f"link{i}").hyperlink = f"https://example.com/page/{i}" - else: - ws["A1"].hyperlink = url - ws["A1"].value = f"click ({slug})" - if slug == "hyperlink_internal_named": - wb.defined_names.add(DefinedName("NamedRng", attr_text="Sheet1!$A$1")) - out = _matrix_path(slug) - _finalize(wb, out) - files.append(GeneratedFile(path=out, group="matrix/hyperlink", features=["hyperlink", slug], expected_cells=20 if url is None else 1)) - - # comments - comment_specs = [ - ("comment_short", "Quick note"), - ("comment_multiline", "line1\nline2\nline3"), - ("comment_unicode", "注释 🔍 ملاحظة"), - ("comment_long", "Note " * 500), - ("comment_many_cells", None), - ] - for slug, text in comment_specs: - wb = Workbook() - ws = wb.active - ws.title = "Comments" - if slug == 
"comment_many_cells": - for i in range(1, 21): - ws.cell(row=i, column=1, value=f"c{i}").comment = Comment(f"comment on row {i}", "Builder") - else: - ws["A1"] = "Cell with comment" - ws["A1"].comment = Comment(text, "Builder") - out = _matrix_path(slug) - _finalize(wb, out) - files.append(GeneratedFile(path=out, group="matrix/comment", features=["comment", slug], expected_cells=20 if text is None else 1)) - - # freeze panes - for slug, freeze in [ - ("freeze_row_1", "A2"), - ("freeze_col_a", "B1"), - ("freeze_both_a1", "B2"), - ("freeze_mid_sheet", "C5"), - ("freeze_deep", "E10"), - ]: - wb = Workbook() - ws = wb.active - ws.title = "Freeze" - for r in range(1, 21): - for c in range(1, 10): - ws.cell(row=r, column=c, value=f"{r},{c}") - ws.freeze_panes = freeze - out = _matrix_path(slug) - _finalize(wb, out) - files.append(GeneratedFile(path=out, group="matrix/freeze_panes", features=["freeze_panes", slug], expected_cells=20 * 9)) - - # rich text (mixed fonts within a cell) — openpyxl exposes this via CellRichText - try: - from openpyxl.cell.rich_text import CellRichText, TextBlock - from openpyxl.cell.text import InlineFont - for slug, blocks in [ - ("rich_text_bold_plain", [TextBlock(InlineFont(b=True), "Bold "), TextBlock(InlineFont(), "plain")]), - ("rich_text_colors", [TextBlock(InlineFont(color="FF0000"), "Red "), TextBlock(InlineFont(color="0000FF"), "Blue")]), - ("rich_text_sizes", [TextBlock(InlineFont(sz="8"), "small "), TextBlock(InlineFont(sz="18"), "BIG")]), - ]: - wb = Workbook() - ws = wb.active - ws.title = "Rich" - ws["A1"] = CellRichText(blocks) - out = _matrix_path(slug) - _finalize(wb, out) - files.append(GeneratedFile(path=out, group="matrix/rich_text", features=["rich_text", slug], expected_cells=1)) - except Exception: - pass - - # 3D refs / cross-sheet - for slug in ["threed_sum_across_sheets"]: - wb = Workbook() - ws = wb.active - ws.title = "A" - for r in range(1, 6): - ws.cell(row=r, column=1, value=r) - wb.create_sheet("B") - for r in 
range(1, 6): - wb["B"].cell(row=r, column=1, value=r * 10) - summary = wb.create_sheet("Summary") - summary["A1"] = "=SUM(A:B!A1:A5)" # Excel 3D ref syntax - out = _matrix_path(slug) - _finalize(wb, out) - files.append(GeneratedFile(path=out, group="matrix/3d_ref", features=["3d_ref", slug], expected_cells=11, expected_formulas=1)) - - return files - - -MATRIX_BUILDERS: list[Callable[[], list[GeneratedFile]]] = [ - build_formula_files, - build_merge_files, - build_named_range_files, - build_data_validation_files, - build_conditional_formatting_files, - build_table_files, - build_chart_files, - build_style_files, - build_date_files, - build_error_files, - build_hidden_files, - build_edge_address_files, - build_sheet_name_files, - build_misc_files, -] - - -# ---------------------------------------------------------------------------- -# Combinatoric group — randomised feature cocktails -# ---------------------------------------------------------------------------- - - -COMBO_DIR = OUT_ROOT / "combo" -DENSITIES = [5, 10, 25, 50, 100] -SEEDS_PER_DENSITY = 80 # → 400 combo files - - -def _rand_cell_value(rng: random.Random): - kind = rng.choice(["int", "float", "str", "bool", "date", "blank"]) - if kind == "int": - return rng.randint(-10_000, 10_000) - if kind == "float": - return rng.uniform(-1000.0, 1000.0) - if kind == "str": - return "".join(rng.choices(string.ascii_letters + string.digits + " ", k=rng.randint(1, 30))) - if kind == "bool": - return rng.choice([True, False]) - if kind == "date": - return date(rng.randint(2000, 2030), rng.randint(1, 12), rng.randint(1, 28)) - return None - - -def _safe_set(ws, row: int, col: int, value) -> bool: - """Try to set ws cell; return True on success, False if cell is part of a merge.""" - try: - ws.cell(row=row, column=col, value=value) - return True - except (AttributeError, TypeError): - return False - - -def build_combo_file(seed: int, density: int) -> GeneratedFile | None: - rng = random.Random(seed * 10_000 + density) - 
wb = Workbook() - ws = wb.active - ws.title = f"Main_{seed}_{density}" - cells_written = 0 - formulas = 0 - features: set[str] = set() - - for _ in range(density): - op = rng.choices( - population=["cell", "formula", "merge", "style", "comment", "hyperlink", "validation", "table", "named"], - weights=[45, 20, 8, 12, 3, 3, 3, 3, 3], - k=1, - )[0] - r = rng.randint(1, 100) - c = rng.randint(1, 30) - if op == "cell": - if _safe_set(ws, r, c, _rand_cell_value(rng)): - cells_written += 1 - features.add("cells") - elif op == "formula": - if _safe_set(ws, r, c, f"=SUM({get_column_letter(c)}1:{get_column_letter(c)}{max(1, r-1)})"): - formulas += 1 - features.add("formulas") - elif op == "merge": - try: - r2 = min(r + rng.randint(0, 3), 100) - c2 = min(c + rng.randint(0, 3), 30) - if (r, c) != (r2, c2): - _safe_set(ws, r, c, f"m{seed}") # write before merge - ws.merge_cells(start_row=r, start_column=c, end_row=r2, end_column=c2) - features.add("merge") - except Exception: - pass - elif op == "style": - try: - cell = ws.cell(row=r, column=c) - if cell.value is None: - if _safe_set(ws, r, c, rng.randint(0, 99)): - cells_written += 1 - cell = ws.cell(row=r, column=c) - cell.font = Font(bold=rng.choice([True, False]), italic=rng.choice([True, False]), color=f"{rng.randint(0, 0xFFFFFF):06X}") - cell.fill = PatternFill("solid", start_color=f"{rng.randint(0xAAAAAA, 0xFFFFFF):06X}") - features.add("style") - except AttributeError: - pass - elif op == "comment": - try: - if _safe_set(ws, r, c, "c"): - ws.cell(row=r, column=c).comment = Comment(f"seed{seed}", "combo") - cells_written += 1 - features.add("comment") - except Exception: - pass - elif op == "hyperlink": - try: - if _safe_set(ws, r, c, "lnk"): - ws.cell(row=r, column=c).hyperlink = f"https://example.com/{seed}/{r}-{c}" - cells_written += 1 - features.add("hyperlink") - except Exception: - pass - elif op == "validation": - try: - dv = DataValidation(type="list", formula1='"A,B,C"') - ws.add_data_validation(dv) - 
dv.add(f"{get_column_letter(c)}{r}") - features.add("validation") - except Exception: - pass - elif op == "table": - try: - r2 = min(r + 3, 100) - c2 = min(c + 2, 30) - if r2 > r and c2 > c: - for rr in range(r, r2 + 1): - for cc in range(c, c2 + 1): - try: - if ws.cell(row=rr, column=cc).value is None: - _safe_set(ws, rr, cc, rr * cc) - except AttributeError: - pass - for cc in range(c, c2 + 1): - _safe_set(ws, r, cc, f"H{cc}") - tab_name = f"T{seed}_{density}_{rng.randint(0, 99)}" - ws.add_table(Table(displayName=tab_name, ref=f"{get_column_letter(c)}{r}:{get_column_letter(c2)}{r2}")) - features.add("table") - except Exception: - pass - elif op == "named": - try: - nm = f"N_{seed}_{density}_{rng.randint(0, 99)}" - wb.defined_names.add(DefinedName(nm, attr_text=f"{ws.title}!${get_column_letter(c)}${r}")) - features.add("named_range") - except Exception: - pass - - out = COMBO_DIR / f"combo_d{density:03d}_s{seed:03d}.xlsx" - try: - _finalize(wb, out) - except Exception: - return None - return GeneratedFile( - path=out, - group="combo", - features=sorted(features), - expected_cells=cells_written, - expected_formulas=formulas, - notes=f"seed={seed} density={density}", - ) - - -def build_combo_files(limit: int | None) -> list[GeneratedFile]: - files: list[GeneratedFile] = [] - count = 0 - for density in DENSITIES: - for seed in range(SEEDS_PER_DENSITY): - if limit is not None and count >= limit: - return files - gf = build_combo_file(seed, density) - if gf: - files.append(gf) - count += 1 - return files - - -# ---------------------------------------------------------------------------- -# Adversarial group — try to break the parser -# ---------------------------------------------------------------------------- - - -ADVERSARIAL_DIR = OUT_ROOT / "adversarial" - - -def _adv_path(slug: str) -> Path: - return ADVERSARIAL_DIR / f"{slug}.xlsx" - - -def build_adversarial_files(limit: int | None) -> list[GeneratedFile]: - files: list[GeneratedFile] = [] - specs: 
list[tuple[str, Callable[[Workbook], tuple[int, int, str]]]] = [] - - def _mk(slug: str): - def deco(fn: Callable[[Workbook], tuple[int, int, str]]): - specs.append((slug, fn)) - return fn - return deco - - @_mk("adv_empty_workbook") - def _(wb): - # openpyxl always has one sheet; clear it - ws = wb.active - ws.title = "Empty" - return 0, 0, "no cells" - - @_mk("adv_one_cell_1e300") - def _(wb): - wb.active["A1"] = 1e300 - return 1, 0, "huge float" - - @_mk("adv_one_cell_neg_1e300") - def _(wb): - wb.active["A1"] = -1e300 - return 1, 0, "huge negative" - - @_mk("adv_one_cell_tiny") - def _(wb): - wb.active["A1"] = 1e-300 - return 1, 0, "tiny float" - - @_mk("adv_unicode_bomb") - def _(wb): - ws = wb.active - emojis = "🚀🔥💀🎯🌀⚡️🌈🎨🧪💡" * 20 - rtl = "مرحبا بكم في اختبار التحليل" * 5 - cjk = "こんにちは世界 你好世界 안녕하세요" * 5 - ws["A1"] = emojis + " " + rtl + " " + cjk - ws["A2"] = "\u200B\u200C\u200D\ufeff" # zero-width chars - ws["A3"] = "a" * 32_000 # long string - return 3, 0, "unicode stress" - - @_mk("adv_circular_chain_10") - def _(wb): - ws = wb.active - for i in range(1, 10): - ws.cell(row=i, column=1, value=f"=A{i+1}") - ws["A10"] = "=A1" - return 10, 10, "10-step cycle" - - @_mk("adv_formula_chain_deep_500") - def _(wb): - ws = wb.active - ws["A1"] = 1 - for i in range(2, 501): - ws.cell(row=i, column=1, value=f"=A{i-1}+1") - return 500, 499, "500-deep chain" - - @_mk("adv_huge_merge_1000x100") - def _(wb): - ws = wb.active - ws.merge_cells("A1:CV1000") # 100 cols × 1000 rows - ws["A1"] = "one giant merge" - return 1, 0, "100k-cell merge" - - @_mk("adv_many_merges_5000") - def _(wb): - ws = wb.active - for i in range(5000): - r = i // 50 + 1 - c = (i % 50) * 2 + 1 - try: - ws.merge_cells(start_row=r, start_column=c, end_row=r, end_column=c + 1) - ws.cell(row=r, column=c, value="m") - except Exception: - pass - return 2500, 0, "5000 merges" - - @_mk("adv_100_sheets") - def _(wb): - wb.active.title = "S0" - for i in range(1, 100): - ws = wb.create_sheet(f"S{i}") - ws["A1"] 
= i - return 100, 0, "100 sheets" - - @_mk("adv_very_wide_2000_cols") - def _(wb): - ws = wb.active - for c in range(1, 2001): - ws.cell(row=1, column=c, value=c) - return 2000, 0, "2000 cols in one row" - - @_mk("adv_very_tall_20k_rows") - def _(wb): - ws = wb.active - for r in range(1, 20_001): - ws.cell(row=r, column=1, value=r) - return 20_000, 0, "20k rows" - - @_mk("adv_sparse_million") - def _(wb): - ws = wb.active - for r in [1, 10, 100, 1000, 10_000, 100_000, 500_000, 1_000_000]: - ws.cell(row=r, column=1, value=f"r{r}") - ws["A1"].value = "start" - return 8, 0, "sparse across 1M rows" - - @_mk("adv_all_error_types") - def _(wb): - ws = wb.active - for i, formula in enumerate([ - "=1/0", "=SQRT(-1)", "=NA()", "=BAD_FN()", "=#REF!", '="a"+1', - ], start=1): - ws.cell(row=i, column=1, value=formula) - return 6, 6, "errors galore" - - @_mk("adv_broken_refs") - def _(wb): - ws = wb.active - ws["A1"] = "=MissingSheet!B5" - ws["A2"] = "=OtherBook.xlsx!Sheet1!A1" - ws["A3"] = "=#REF!+1" - return 3, 3, "dangling references" - - @_mk("adv_long_formula") - def _(wb): - ws = wb.active - ws["A1"] = 1 - long_expr = "=" + "+".join("A1" for _ in range(2000)) - ws["B1"] = long_expr - return 2, 1, "very long formula" - - @_mk("adv_long_cell_string") - def _(wb): - ws = wb.active - ws["A1"] = "X" * 32_767 # Excel limit - return 1, 0, "32k char cell" - - @_mk("adv_all_formulas_sheet") - def _(wb): - ws = wb.active - for r in range(1, 101): - for c in range(1, 6): - ws.cell(row=r, column=c, value=f"={get_column_letter(c)}{((r - 1) % 5) + 1}+1") - return 500, 500, "500 formulas" - - @_mk("adv_massive_table") - def _(wb): - ws = wb.active - for c in range(1, 51): - ws.cell(row=1, column=c, value=f"C{c}") - for r in range(2, 202): - for c in range(1, 51): - ws.cell(row=r, column=c, value=(r * c) % 997) - ws.add_table(Table(displayName="Huge", ref=f"A1:{get_column_letter(50)}201")) - return 10_050, 0, "50x200 table" - - @_mk("adv_cyclic_cross_sheet") - def _(wb): - a = wb.active 
- a.title = "A" - a["A1"] = "=B!A1" - b = wb.create_sheet("B") - b["A1"] = "=A!A1" - return 2, 2, "cross-sheet cycle" - - @_mk("adv_many_named_ranges") - def _(wb): - ws = wb.active - for i in range(1, 301): - wb.defined_names.add(DefinedName(f"N{i}", attr_text=f"Sheet!${get_column_letter((i % 30) + 1)}${(i % 100) + 1}")) - ws["A1"] = "seed" - return 1, 0, "300 named ranges" - - @_mk("adv_duplicate_sheet_names_almost") - def _(wb): - wb.active.title = "Data" - wb.create_sheet("data") - wb.create_sheet("DATA") - return 0, 0, "case-sensitive sheet names" - - @_mk("adv_rtl_sheet") - def _(wb): - ws = wb.active - ws.sheet_view.rightToLeft = True - ws["A1"] = "النص يقرأ من اليمين" - return 1, 0, "RTL view" - - @_mk("adv_extreme_column_width") - def _(wb): - ws = wb.active - ws.column_dimensions["A"].width = 255 - ws.row_dimensions[1].height = 409 # excel max - ws["A1"] = "wide+tall" - return 1, 0, "max col/row size" - - @_mk("adv_autofilter_large") - def _(wb): - ws = wb.active - for c in range(1, 11): - ws.cell(row=1, column=c, value=f"H{c}") - for r in range(2, 301): - for c in range(1, 11): - ws.cell(row=r, column=c, value=r * c) - ws.auto_filter.ref = "A1:J300" - return 3000, 0, "autofilter 3k cells" - - @_mk("adv_mixed_types_same_column") - def _(wb): - ws = wb.active - for r in range(1, 51): - if r % 5 == 0: - ws.cell(row=r, column=1, value=f"text_{r}") - elif r % 5 == 1: - ws.cell(row=r, column=1, value=r) - elif r % 5 == 2: - ws.cell(row=r, column=1, value=float(r) / 7.0) - elif r % 5 == 3: - ws.cell(row=r, column=1, value=date(2024, (r % 12) + 1, 1)) - else: - ws.cell(row=r, column=1, value=(r % 2 == 0)) - return 50, 0, "mixed types in one column" - - _SAFE_STR_CHARS = string.ascii_letters + string.digits + " -_.,:;!?@#$%^&*()[]{}<>+=/|~" - - # adversarial via parametrised generator to pad counts to ~1000 total - for i in range(1, 278): # 277 parametric adversarial files → 1000 total generated - rng = random.Random(10_000 + i) - - @_mk(f"adv_param_{i:03d}") - 
def _(wb, rng=rng, i=i): - ws = wb.active - # Keep sizes modest so the full bench runs under 10 min wall-clock. - n_cells = rng.randint(100, 800) - cells = 0 - formulas = 0 - for _ in range(n_cells): - r = rng.randint(1, 300) - c = rng.randint(1, 50) - kind = rng.choice(["int", "str", "formula", "date", "bool"]) - try: - if kind == "int": - val = rng.randint(-1_000_000, 1_000_000) - elif kind == "str": - val = "".join(rng.choices(_SAFE_STR_CHARS, k=rng.randint(1, 50))) - elif kind == "formula": - val = f"={get_column_letter(max(1, c - 1))}{max(1, r - 1)}+1" - elif kind == "date": - val = date(rng.randint(1900, 2099), rng.randint(1, 12), rng.randint(1, 28)) - else: - val = rng.choice([True, False]) - if _safe_set(ws, r, c, val): - cells += 1 - if kind == "formula": - formulas += 1 - except Exception: - pass - for _ in range(rng.randint(0, 20)): - try: - r0 = rng.randint(1, 100) - c0 = rng.randint(1, 50) - ws.merge_cells(start_row=r0, start_column=c0, end_row=r0 + rng.randint(0, 5), end_column=c0 + rng.randint(0, 5)) - except Exception: - pass - return cells, formulas, f"param seed {i}" - - files: list[GeneratedFile] = [] - count = 0 - for slug, fn in specs: - if limit is not None and count >= limit: - break - wb = Workbook() - try: - cells, formulas, notes = fn(wb) - except Exception as exc: - # skip uncooperative generators - print(f" ⚠ adversarial {slug} failed to build: {exc}", file=sys.stderr) - continue - out = _adv_path(slug) - try: - _finalize(wb, out) - except Exception as exc: - print(f" ⚠ adversarial {slug} failed to save: {exc}", file=sys.stderr) - continue - files.append( - GeneratedFile( - path=out, - group="adversarial", - features=["adversarial", slug], - expected_cells=cells, - expected_formulas=formulas, - notes=notes, - ) - ) - count += 1 - return files - - -# ---------------------------------------------------------------------------- -# Entry point -# ---------------------------------------------------------------------------- - - -def 
build_all(groups: set[str], force: bool, limit: int | None) -> list[GeneratedFile]: - all_files: list[GeneratedFile] = [] - if "matrix" in groups: - MATRIX_DIR.mkdir(parents=True, exist_ok=True) - for builder in MATRIX_BUILDERS: - for gf in builder(): - all_files.append(gf) - if limit is not None and len(all_files) >= limit: - return all_files - if "combo" in groups: - COMBO_DIR.mkdir(parents=True, exist_ok=True) - remaining = None if limit is None else max(0, limit - len(all_files)) - all_files.extend(build_combo_files(remaining)) - if limit is not None and len(all_files) >= limit: - return all_files - if "adversarial" in groups: - ADVERSARIAL_DIR.mkdir(parents=True, exist_ok=True) - remaining = None if limit is None else max(0, limit - len(all_files)) - all_files.extend(build_adversarial_files(remaining)) - return all_files - - -def write_manifest(files: list[GeneratedFile]) -> None: - by_group: dict[str, int] = {} - rows = [] - for gf in files: - rows.append(gf.to_manifest_row()) - by_group[gf.group] = by_group.get(gf.group, 0) + 1 - manifest = { - "version": 1, - "generated_at": "deterministic", - "total_files": len(files), - "by_group": by_group, - "files": rows, - } - MANIFEST_PATH.write_text(json.dumps(manifest, indent=2, sort_keys=False)) - print(f"✓ manifest written → {MANIFEST_PATH.relative_to(ROOT)}") - - -def main() -> int: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--group", choices=["matrix", "combo", "adversarial", "all"], default="all") - parser.add_argument("--force", action="store_true", help="regenerate even if outputs exist") - parser.add_argument("--limit", type=int, help="stop after N files (smoke mode)") - parser.add_argument("--clean", action="store_true", help="wipe testBench/generated/ first") - args = parser.parse_args() - - if args.clean and OUT_ROOT.exists(): - import shutil - shutil.rmtree(OUT_ROOT) - print(f"✓ cleaned {OUT_ROOT.relative_to(ROOT)}") - - groups = {"matrix", "combo", "adversarial"} if 
args.group == "all" else {args.group} - OUT_ROOT.mkdir(parents=True, exist_ok=True) - - print(f"building testBench into {OUT_ROOT.relative_to(ROOT)} groups={sorted(groups)} limit={args.limit}") - files = build_all(groups, args.force, args.limit) - write_manifest(files) - - print(f"\n{'═' * 60}") - print(f" Generated {len(files)} workbooks") - by_group: dict[str, int] = {} - for gf in files: - by_group[gf.group] = by_group.get(gf.group, 0) + 1 - for g in sorted(by_group): - print(f" {g:32s} {by_group[g]:4d}") - print(f"{'═' * 60}") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/scripts/generate_enterprise_fixtures.py b/scripts/generate_enterprise_fixtures.py deleted file mode 100644 index 189bb78..0000000 --- a/scripts/generate_enterprise_fixtures.py +++ /dev/null @@ -1,150 +0,0 @@ -"""Generate small, deterministic enterprise-style Excel fixtures. - -These fixtures are used by enterprise scoring tests and corpus metrics. -They are intentionally lightweight so they can be generated at test time -without network access or large disk usage. 
-""" - - - -from pathlib import Path -from typing import Callable - -from openpyxl import Workbook -from openpyxl.styles import Font -from openpyxl.workbook.defined_name import DefinedName - - -ROOT = Path(__file__).resolve().parent.parent -TARGET_DIR = ROOT / "testBench" / "enterprise" - - -def _prepare_target() -> None: - TARGET_DIR.mkdir(parents=True, exist_ok=True) - - -def create_financial_model() -> Workbook: - wb = Workbook() - ws = wb.active - ws.title = "Model" - - ws.merge_cells("A1:D1") - ws["A1"] = "Financial Model Q1 2026" - ws["A1"].font = Font(bold=True, size=14) - - ws["A3"] = "ASSUMPTIONS" - ws["A4"] = "Rent per unit" - ws["B4"] = 2500 - ws["A5"] = "Units occupied" - ws["B5"] = 42 - - ws["A7"] = "RESULTS" - ws["A8"] = "Total Revenue" - ws["B8"] = "=B4*B5" - - wb.defined_names.add(DefinedName("UnitCount", attr_text="Model!$B$5")) - wb.defined_names.add(DefinedName("RentPerUnit", attr_text="Model!$B$4")) - - return wb - - -def create_inventory_tracker() -> Workbook: - wb = Workbook() - ws = wb.active - ws.title = "Master" - - ws["A1"] = "SKU" - ws["B1"] = "Description" - ws["C1"] = "Qty" - ws["D1"] = "Unit Cost" - - for i in range(2, 52): - ws[f"A{i}"] = f"SKU-{i:04d}" - ws[f"B{i}"] = f"Product {i}" - ws[f"C{i}"] = i * 100 - ws[f"D{i}"] = i * 1.5 - - tx = wb.create_sheet("Transactions") - tx["A1"] = "SKU" - tx["B1"] = "Qty" - tx["C1"] = "Total" - - for i in range(2, 102): - tx[f"A{i}"] = f"=Master!A{(i % 50) + 2}" - tx[f"B{i}"] = (i % 10) + 1 - tx[f"C{i}"] = f"=VLOOKUP(A{i},Master!A:D,4,0)*B{i}" - - return wb - - -def create_forecast_model() -> Workbook: - wb = Workbook() - base = wb.active - base.title = "Base" - - for month in range(1, 13): - base[f"A{month}"] = f"Month {month}" - base[f"B{month}"] = 10000 * (1 + month * 0.05) - - pess = wb.create_sheet("Pessimistic") - opt = wb.create_sheet("Optimistic") - for month in range(1, 13): - pess[f"B{month}"] = f"=Base!B{month}*0.8" - opt[f"B{month}"] = f"=Base!B{month}*1.2" - - return wb - - -def 
create_operations_tracker() -> Workbook: - wb = Workbook() - ws = wb.active - ws.title = "Ops" - - ws["A1"] = "Project" - ws["B1"] = "Status" - ws["C1"] = "Budget" - ws["D1"] = "Actual" - ws["E1"] = "Variance %" - - statuses = ["Active", "Complete", "On Hold"] - for i in range(2, 22): - ws[f"A{i}"] = f"Project {i-1}" - ws[f"B{i}"] = statuses[i % 3] - ws[f"C{i}"] = i * 50000 - ws[f"D{i}"] = i * 50000 * (1 + (i % 5) * 0.1) - ws[f"E{i}"] = f"=(D{i}-C{i})/C{i}" - - ref = wb.create_sheet("Reference", 1) - ref.sheet_state = "hidden" - ref["A1"] = "Rate" - ref["A2"] = 1.05 - - return wb - - -def _write_workbook(name: str, builder: Callable[[], Workbook]) -> Path: - _prepare_target() - path = TARGET_DIR / name - if path.exists(): - return path - wb = builder() - wb.save(path) - return path - - -def generate_all() -> list[Path]: - """Generate all enterprise fixtures and return their paths.""" - fixtures = [ - ("financial_model.xlsx", create_financial_model), - ("inventory_tracker.xlsx", create_inventory_tracker), - ("forecast_model.xlsx", create_forecast_model), - ("operations_tracker.xlsx", create_operations_tracker), - ] - - return [_write_workbook(name, builder) for name, builder in fixtures] - - -if __name__ == "__main__": - paths = generate_all() - for p in paths: - print(f"✓ Generated {p.relative_to(ROOT)}") diff --git a/site/index.html b/site/index.html index 585f9af..37fca39 100644 --- a/site/index.html +++ b/site/index.html @@ -160,7 +160,7 @@ "name": "What file formats does ks-xlsx-parser support?", "acceptedAnswer": { "@type": "Answer", - "text": "ks-xlsx-parser supports .xlsx and .xlsm (OOXML). Legacy .xls (BIFF) is not supported — convert those externally first. The parser handles unicode content, very wide sheets, very tall sheets, sparse workbooks, 250-sheet workbooks, circular formula chains, and files with 32k-character cells, all covered in the 1054-workbook testBench that runs in CI." + "text": "ks-xlsx-parser supports .xlsx and .xlsm (OOXML). 
Legacy .xls (BIFF) is not supported — convert those externally first. The parser handles unicode content, very wide sheets, very tall sheets, sparse workbooks, 250-sheet workbooks, circular formula chains, and files with 32k-character cells, all benchmarked on the 5,458-workbook SpreadsheetBench corpus." } }, { @@ -168,7 +168,7 @@ "name": "How fast is ks-xlsx-parser?", "acceptedAnswer": { "@type": "Answer", - "text": "The full 1054-workbook testBench round-trips in approximately 70 seconds on a single machine. A real-world 21k-cell, 13-sheet financial model parses in about 4.6 seconds (previously 307 seconds before a circular-ref caching fix). Sparse workbooks with extreme addresses parse in under 200 ms." + "text": "SpreadsheetBench's full 5,458-workbook corpus parses end-to-end in roughly 20 minutes on a single machine (low double-digit ms P50 parse time). A real-world 21k-cell, 13-sheet financial model parses in about 4.6 seconds (previously 307 seconds before a circular-ref caching fix). Sparse workbooks with extreme addresses parse in under 200 ms." } } ] @@ -432,7 +432,7 @@ Features Demo Compare - testBench + Benchmarks Docs ⭐ Star on GitHub @@ -551,14 +551,14 @@

What you get back

TESTED & FAST
-

1054-workbook stress corpus. Every commit.

-

testBench ships with the repo and runs in CI. One-feature-per-file matrix, randomised density cocktails, and engineered adversarial files — unicode bombs, circular refs, sparse 1M-row sheets, 250-sheet workbooks.

+

SpreadsheetBench: 5,458 real-world workbooks.

+

We benchmark against the public SpreadsheetBench v0.1 corpus — 912 instruction tasks, 5,458 unique xlsx files spanning financial models, project trackers, HR records, and a long tail of small-business spreadsheets.

-
1054/1054tests passing on every CI run
-
~70send-to-end bench wall time
-
66×Walbridge financial model speedup (0.1.1)
-
17 MBdataset zip attached to each release
+
5,455 / 5,458parsed cleanly (99.945%)
+
912instruction × retrieval tasks measured
+
66×21k-cell financial model speedup (0.1.1)
+
vs Doclingtied @1, +2.7pp @3, +1.8pp @5
@@ -684,7 +684,7 @@

Frequently asked questions

How fast is it? -

The full 1054-workbook testBench round-trips in about 70 seconds. A real 21k-cell, 13-sheet financial model parses in ~4.6 s. Sparse workbooks with extreme addresses parse in under 200 ms. Details in the CHANGELOG.

+

SpreadsheetBench's full 5,458-workbook corpus parses end-to-end in roughly 20 minutes on a single machine. A real 21k-cell, 13-sheet financial model parses in ~4.6 s. Sparse workbooks with extreme addresses parse in under 200 ms. Details in the CHANGELOG.

diff --git a/src/models/common.py b/src/models/common.py index d199da8..0d4af5b 100644 --- a/src/models/common.py +++ b/src/models/common.py @@ -64,7 +64,7 @@ class CellCoord: """A single cell coordinate (1-indexed row and column). **Not a Pydantic model** — frozen slotted dataclass. Profiling showed - 339k Pydantic inits on Walbridge contributed ~0.65 s of parse time; + 339k Pydantic inits on a real-world workbook contributed ~0.65 s of parse time; dataclass construction is ~2.2× faster with the same immutability and equality semantics. Validation of ``row >= 1`` / ``col >= 1`` is dropped: all producers in this codebase build coords from parsed diff --git a/src/parsers/workbook_parser.py b/src/parsers/workbook_parser.py index 899402f..bee0452 100644 --- a/src/parsers/workbook_parser.py +++ b/src/parsers/workbook_parser.py @@ -81,9 +81,9 @@ def __init__( max_workers: Number of parallel workers. build_dep_graph: Build the formula dependency graph + run cycle detection. Fast mode sets this False — on formula-heavy - workbooks (Walbridge: 17.6k formulas → 48k edges) the dep - graph is one of the largest remaining costs and nothing in - fast mode consumes it. + workbooks (17k formulas → 48k edges is typical for a real + financial model) the dep graph is one of the largest + remaining costs and nothing in fast mode consumes it. """ if path is None and content is None: raise ValueError("Either path or content must be provided") @@ -249,8 +249,8 @@ def parse(self) -> WorkbookDTO: # Build dependency graph (skippable in fast mode — this stage scans # every formula, runs the parser, creates thousands of edges, and - # then runs cycle detection; on Walbridge alone it accounts for - # ~25% of the full-mode wall clock). + # then runs cycle detection; on a 17k-formula real-world workbook it + # accounts for ~25% of the full-mode wall clock). 
if self._build_dep_graph: try: from formula.dependency_builder import DependencyBuilder diff --git a/testBench/README.md b/testBench/README.md deleted file mode 100644 index 68cc9eb..0000000 --- a/testBench/README.md +++ /dev/null @@ -1,62 +0,0 @@ -# testBench — the ks-xlsx-parser stress corpus - -A single, self-contained dataset of **1053 `.xlsx` workbooks** used to -regression-test and stress-test [ks-xlsx-parser](https://github.com/knowledgestack/ks-xlsx-parser). - -It is MIT-licensed, free to reuse for any Excel parser research (commercial or -otherwise). If it saves you time, please [star the repo](https://github.com/knowledgestack/ks-xlsx-parser) — -that's the only signal we have that open-sourcing this was worth doing. - -## Layout - -| Directory | Files | What's in it | -|-----------|------:|--------------| -| `real_world/` | 8 | Real anonymised workbooks shipped as demos (financial models, project trackers, engineering calcs). | -| `enterprise/` | 4 | Deterministic enterprise templates (financial / forecast / inventory / operations). | -| `github_datasets/` | 10 | Public CSV→XLSX conversions (iris, titanic, superstore, apple stock, …). | -| `stress/curated/` | 26 | 26 hand-authored progressive stress levels (`stress_level_0`…`stress_level_25`). | -| `stress/merges/` | 5 | Pathological merge patterns that historically broke parsers. | -| `generated/matrix/` | ~297 | **One feature per file** across 18 categories (formulas, merges, named ranges, data validation, conditional formatting, tables, charts, styles, dates, errors, hidden rows/cols, hyperlinks, comments, rich text, freeze panes, edge addresses, sheet names, 3D refs). | -| `generated/combo/` | 400 | Deterministically randomised cocktails at 5 densities × 80 seeds. | -| `generated/adversarial/`| 300 | Files engineered to break parsers: deep formula chains, 1M-row sparse sheets, 250-sheet workbooks, unicode bombs, huge merges, broken refs, 32 k-char cells, circular refs, long formulas. 
| -| **Total** | **1053** | | - -The `generated/` tree is produced by [`scripts/build_testbench.py`](../scripts/build_testbench.py) -and is deterministic — identical commits produce byte-identical files. The other -directories are checked in as-is. - -## Manifest - -`generated/MANIFEST.json` lists every generated file with: - -* `group` — matrix category, combo, or adversarial -* `features` — tags describing what the file exercises -* `expected_cells` — sanity check count -* `expected_formulas` — sanity check count -* `sha256` / `size_bytes` — integrity + packaging info -* `notes` — e.g. seed/density for combo files - -## How we use it - -```bash -# regenerate the 1000-file generated tree (idempotent) -make testbench-build - -# parse every file and record failures to metrics/testbench/failures.json -make testbench - -# package for a GitHub release -make testbench-zip -``` - -The round-trip test (`tests/test_testbench_roundtrip.py`) asserts every -workbook parses without raising and produces a non-empty JSON result. The -failure log is a first-class artifact — every parser regression shows up as a -new entry. - -## Licensing - -All files generated by `build_testbench.py` are synthetic and released under -MIT alongside the parser. The `real_world/`, `enterprise/`, and -`github_datasets/` contents are either authored for this project or sourced -from public-domain datasets; attribution is in the parent repo. 
diff --git a/testBench/enterprise/financial_model.xlsx b/testBench/enterprise/financial_model.xlsx deleted file mode 100644 index f84c12d..0000000 Binary files a/testBench/enterprise/financial_model.xlsx and /dev/null differ diff --git a/testBench/enterprise/forecast_model.xlsx b/testBench/enterprise/forecast_model.xlsx deleted file mode 100644 index 7f08d91..0000000 Binary files a/testBench/enterprise/forecast_model.xlsx and /dev/null differ diff --git a/testBench/enterprise/inventory_tracker.xlsx b/testBench/enterprise/inventory_tracker.xlsx deleted file mode 100644 index a13fcd1..0000000 Binary files a/testBench/enterprise/inventory_tracker.xlsx and /dev/null differ diff --git a/testBench/enterprise/operations_tracker.xlsx b/testBench/enterprise/operations_tracker.xlsx deleted file mode 100644 index a3997b3..0000000 Binary files a/testBench/enterprise/operations_tracker.xlsx and /dev/null differ diff --git a/testBench/github_datasets/apple_stock.xlsx b/testBench/github_datasets/apple_stock.xlsx deleted file mode 100644 index 62edeb6..0000000 Binary files a/testBench/github_datasets/apple_stock.xlsx and /dev/null differ diff --git a/testBench/github_datasets/bestsellers.xlsx b/testBench/github_datasets/bestsellers.xlsx deleted file mode 100644 index 665b312..0000000 Binary files a/testBench/github_datasets/bestsellers.xlsx and /dev/null differ diff --git a/testBench/github_datasets/boston.xlsx b/testBench/github_datasets/boston.xlsx deleted file mode 100644 index ab85439..0000000 Binary files a/testBench/github_datasets/boston.xlsx and /dev/null differ diff --git a/testBench/github_datasets/breast_cancer.xlsx b/testBench/github_datasets/breast_cancer.xlsx deleted file mode 100644 index adca3b2..0000000 Binary files a/testBench/github_datasets/breast_cancer.xlsx and /dev/null differ diff --git a/testBench/github_datasets/iris.xlsx b/testBench/github_datasets/iris.xlsx deleted file mode 100644 index fede151..0000000 Binary files 
a/testBench/github_datasets/iris.xlsx and /dev/null differ diff --git a/testBench/github_datasets/superstore.xlsx b/testBench/github_datasets/superstore.xlsx deleted file mode 100644 index c51783b..0000000 Binary files a/testBench/github_datasets/superstore.xlsx and /dev/null differ diff --git a/testBench/github_datasets/titanic.xlsx b/testBench/github_datasets/titanic.xlsx deleted file mode 100644 index 5cba13b..0000000 Binary files a/testBench/github_datasets/titanic.xlsx and /dev/null differ diff --git a/testBench/github_datasets/winequality_red.xlsx b/testBench/github_datasets/winequality_red.xlsx deleted file mode 100644 index 58ddf1e..0000000 Binary files a/testBench/github_datasets/winequality_red.xlsx and /dev/null differ diff --git a/testBench/github_datasets/world_happiness_2019.xlsx b/testBench/github_datasets/world_happiness_2019.xlsx deleted file mode 100644 index 6de5ad7..0000000 Binary files a/testBench/github_datasets/world_happiness_2019.xlsx and /dev/null differ diff --git a/testBench/github_datasets/worldcups.xlsx b/testBench/github_datasets/worldcups.xlsx deleted file mode 100644 index 4b122f5..0000000 Binary files a/testBench/github_datasets/worldcups.xlsx and /dev/null differ diff --git a/testBench/real_world/Employee Sample Data.xlsx b/testBench/real_world/Employee Sample Data.xlsx deleted file mode 100644 index 4cc5a38..0000000 Binary files a/testBench/real_world/Employee Sample Data.xlsx and /dev/null differ diff --git a/testBench/real_world/Financials Sample Data.xlsx b/testBench/real_world/Financials Sample Data.xlsx deleted file mode 100644 index 76bc6dd..0000000 Binary files a/testBench/real_world/Financials Sample Data.xlsx and /dev/null differ diff --git a/testBench/real_world/data_inventory.xlsx b/testBench/real_world/data_inventory.xlsx deleted file mode 100644 index 3371e0c..0000000 Binary files a/testBench/real_world/data_inventory.xlsx and /dev/null differ diff --git a/testBench/real_world/engineering_calcs.xlsx 
b/testBench/real_world/engineering_calcs.xlsx deleted file mode 100644 index 49e1fb0..0000000 Binary files a/testBench/real_world/engineering_calcs.xlsx and /dev/null differ diff --git a/testBench/real_world/financial_model.xlsx b/testBench/real_world/financial_model.xlsx deleted file mode 100644 index 276ea8a..0000000 Binary files a/testBench/real_world/financial_model.xlsx and /dev/null differ diff --git a/testBench/real_world/project_tracker.xlsx b/testBench/real_world/project_tracker.xlsx deleted file mode 100644 index bca638a..0000000 Binary files a/testBench/real_world/project_tracker.xlsx and /dev/null differ diff --git a/testBench/real_world/sales_dashboard.xlsx b/testBench/real_world/sales_dashboard.xlsx deleted file mode 100644 index fb05bb8..0000000 Binary files a/testBench/real_world/sales_dashboard.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_0.xlsx b/testBench/stress/curated/stress_level_0.xlsx deleted file mode 100644 index 4a620f0..0000000 Binary files a/testBench/stress/curated/stress_level_0.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_1.xlsx b/testBench/stress/curated/stress_level_1.xlsx deleted file mode 100644 index 76a7e01..0000000 Binary files a/testBench/stress/curated/stress_level_1.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_10.xlsx b/testBench/stress/curated/stress_level_10.xlsx deleted file mode 100644 index 7578615..0000000 Binary files a/testBench/stress/curated/stress_level_10.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_11.xlsx b/testBench/stress/curated/stress_level_11.xlsx deleted file mode 100644 index 72d5c8d..0000000 Binary files a/testBench/stress/curated/stress_level_11.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_12.xlsx b/testBench/stress/curated/stress_level_12.xlsx deleted file mode 100644 index 56e10e4..0000000 Binary files 
a/testBench/stress/curated/stress_level_12.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_13.xlsx b/testBench/stress/curated/stress_level_13.xlsx deleted file mode 100644 index 274c560..0000000 Binary files a/testBench/stress/curated/stress_level_13.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_14.xlsx b/testBench/stress/curated/stress_level_14.xlsx deleted file mode 100644 index 7d69a4c..0000000 Binary files a/testBench/stress/curated/stress_level_14.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_15.xlsx b/testBench/stress/curated/stress_level_15.xlsx deleted file mode 100644 index 50aa2a4..0000000 Binary files a/testBench/stress/curated/stress_level_15.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_16.xlsx b/testBench/stress/curated/stress_level_16.xlsx deleted file mode 100644 index a22617a..0000000 Binary files a/testBench/stress/curated/stress_level_16.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_17.xlsx b/testBench/stress/curated/stress_level_17.xlsx deleted file mode 100644 index 3e8fc4c..0000000 Binary files a/testBench/stress/curated/stress_level_17.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_18.xlsx b/testBench/stress/curated/stress_level_18.xlsx deleted file mode 100644 index 56ae03b..0000000 Binary files a/testBench/stress/curated/stress_level_18.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_19.xlsx b/testBench/stress/curated/stress_level_19.xlsx deleted file mode 100644 index 98c9f4a..0000000 Binary files a/testBench/stress/curated/stress_level_19.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_2.xlsx b/testBench/stress/curated/stress_level_2.xlsx deleted file mode 100644 index 97fb325..0000000 Binary files a/testBench/stress/curated/stress_level_2.xlsx and /dev/null differ diff --git 
a/testBench/stress/curated/stress_level_20.xlsx b/testBench/stress/curated/stress_level_20.xlsx deleted file mode 100644 index 72154d7..0000000 Binary files a/testBench/stress/curated/stress_level_20.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_21.xlsx b/testBench/stress/curated/stress_level_21.xlsx deleted file mode 100644 index 7df3bc8..0000000 Binary files a/testBench/stress/curated/stress_level_21.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_22.xlsx b/testBench/stress/curated/stress_level_22.xlsx deleted file mode 100644 index 1dca4d7..0000000 Binary files a/testBench/stress/curated/stress_level_22.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_23.xlsx b/testBench/stress/curated/stress_level_23.xlsx deleted file mode 100644 index 489bae5..0000000 Binary files a/testBench/stress/curated/stress_level_23.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_24.xlsx b/testBench/stress/curated/stress_level_24.xlsx deleted file mode 100644 index 82f946e..0000000 Binary files a/testBench/stress/curated/stress_level_24.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_25.xlsx b/testBench/stress/curated/stress_level_25.xlsx deleted file mode 100644 index 6ba2f67..0000000 Binary files a/testBench/stress/curated/stress_level_25.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_3.xlsx b/testBench/stress/curated/stress_level_3.xlsx deleted file mode 100644 index e43c5d2..0000000 Binary files a/testBench/stress/curated/stress_level_3.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_4.xlsx b/testBench/stress/curated/stress_level_4.xlsx deleted file mode 100644 index 0464f9d..0000000 Binary files a/testBench/stress/curated/stress_level_4.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_5.xlsx b/testBench/stress/curated/stress_level_5.xlsx deleted file 
mode 100644 index f279818..0000000 Binary files a/testBench/stress/curated/stress_level_5.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_6.xlsx b/testBench/stress/curated/stress_level_6.xlsx deleted file mode 100644 index e5b3f85..0000000 Binary files a/testBench/stress/curated/stress_level_6.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_7.xlsx b/testBench/stress/curated/stress_level_7.xlsx deleted file mode 100644 index dff80f4..0000000 Binary files a/testBench/stress/curated/stress_level_7.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_8.xlsx b/testBench/stress/curated/stress_level_8.xlsx deleted file mode 100644 index 780d0a3..0000000 Binary files a/testBench/stress/curated/stress_level_8.xlsx and /dev/null differ diff --git a/testBench/stress/curated/stress_level_9.xlsx b/testBench/stress/curated/stress_level_9.xlsx deleted file mode 100644 index a3a6650..0000000 Binary files a/testBench/stress/curated/stress_level_9.xlsx and /dev/null differ diff --git a/testBench/stress/merges/merge_stress_across.xlsx b/testBench/stress/merges/merge_stress_across.xlsx deleted file mode 100644 index 52db4d7..0000000 Binary files a/testBench/stress/merges/merge_stress_across.xlsx and /dev/null differ diff --git a/testBench/stress/merges/merge_stress_dense_grid.xlsx b/testBench/stress/merges/merge_stress_dense_grid.xlsx deleted file mode 100644 index 7c938bf..0000000 Binary files a/testBench/stress/merges/merge_stress_dense_grid.xlsx and /dev/null differ diff --git a/testBench/stress/merges/merge_stress_empty_master.xlsx b/testBench/stress/merges/merge_stress_empty_master.xlsx deleted file mode 100644 index 06713b0..0000000 Binary files a/testBench/stress/merges/merge_stress_empty_master.xlsx and /dev/null differ diff --git a/testBench/stress/merges/merge_stress_table_header.xlsx b/testBench/stress/merges/merge_stress_table_header.xlsx deleted file mode 100644 index 13d1092..0000000 Binary 
files a/testBench/stress/merges/merge_stress_table_header.xlsx and /dev/null differ diff --git a/testBench/stress/merges/merge_stress_vertical.xlsx b/testBench/stress/merges/merge_stress_vertical.xlsx deleted file mode 100644 index 1a44d8e..0000000 Binary files a/testBench/stress/merges/merge_stress_vertical.xlsx and /dev/null differ diff --git a/tests/benchmarks/README.md b/tests/benchmarks/README.md index 1cb056f..412102e 100644 --- a/tests/benchmarks/README.md +++ b/tests/benchmarks/README.md @@ -4,7 +4,7 @@ Two benchmarks, both reproducible: | Benchmark | What it measures | Corpus | Cost | |---|---|---|---| -| `vs_hucre.py` (structural) | Parse-success rate + structural counts (cells, formulas, tables, merges, etc.) across many files | `testBench/` (53 curated) or `data/corpora/spreadsheetbench/` (5,458 real-world) | Cheap — 1–20 min | +| `vs_hucre.py` (structural) | Parse-success rate + structural counts (cells, formulas, tables, merges, etc.) across many files | `data/corpora/spreadsheetbench/` (5,458 real-world) | Cheap — 1–20 min | | `scripts/eval_retrieval.py` (chunk quality) | Recall@k for retrieving the relevant chunk given a natural-language instruction, + table-integrity fragmentation rate | SpreadsheetBench `dataset.json` (912 instruction + position pairs) | Medium — 10 min on 100 instances | ## 1. Structural benchmark — `vs_hucre.py` @@ -18,9 +18,9 @@ Long-running NDJSON-protocol workers, per-file timeout, batch respawn, randomize Supported parsers today: `ks` (ks-xlsx-parser), `hucre` (TypeScript, requires `pnpm install` under `hucre_node/`), `docling` (IBM Docling — `uv pip install docling`). 
```bash -# Quick smoke (50 random files from testBench) +# Quick smoke (50 random files from SpreadsheetBench) PYTHONPATH=src uv run python -m tests.benchmarks.vs_hucre \ - --corpus testBench --sample 50 --parsers ks + --corpus data/corpora/spreadsheetbench --sample 50 --parsers ks # Robustness on full SpreadsheetBench (5,458 files, ~20 min) PYTHONPATH=src uv run python -m tests.benchmarks.vs_hucre \ diff --git a/tests/benchmarks/__init__.py b/tests/benchmarks/__init__.py index 4e77399..4558721 100644 --- a/tests/benchmarks/__init__.py +++ b/tests/benchmarks/__init__.py @@ -2,11 +2,11 @@ Local-only benchmark harness. Not part of the public test suite. Runs `ks-xlsx-parser` head-to-head against external parsers (currently `hucre`, -a TypeScript zero-dependency spreadsheet I/O library) across the `testBench/` -corpus and produces per-file perf + feature-coverage records. +a TypeScript zero-dependency spreadsheet I/O library) across the +SpreadsheetBench corpus and produces per-file perf + feature-coverage records. Not committed by default — reports and node_modules are git-ignored. Invoke -via `python -m tests.benchmarks.vs_hucre --corpus testBench`. +via `python -m tests.benchmarks.vs_hucre --corpus data/corpora/spreadsheetbench`. Pitfalls this harness is designed to avoid (read before editing): diff --git a/tests/benchmarks/_driver.py b/tests/benchmarks/_driver.py index dad231d..a66acd5 100644 --- a/tests/benchmarks/_driver.py +++ b/tests/benchmarks/_driver.py @@ -257,13 +257,7 @@ def generate_summary(out_dir: Path) -> None: continue try: rel = Path(r["file"]).resolve() - # Find segment after 'testBench/' or use file's parent name. - parts = rel.parts - if "testBench" in parts: - idx = parts.index("testBench") - sub = "/".join(parts[idx + 1: idx + 3]) if idx + 2 < len(parts) else parts[idx + 1] - else: - sub = rel.parent.name + sub = rel.parent.name except Exception: # noqa: BLE001 sub = "?" 
by_sub[(r["parser"], sub)].append(r["parse_time_ms"]) diff --git a/tests/benchmarks/vs_hucre.py b/tests/benchmarks/vs_hucre.py index 24e0e50..44ca561 100644 --- a/tests/benchmarks/vs_hucre.py +++ b/tests/benchmarks/vs_hucre.py @@ -4,7 +4,7 @@ Usage (from repo root, with venv active): python -m tests.benchmarks.vs_hucre \\ - --corpus testBench \\ + --corpus data/corpora/spreadsheetbench \\ --out tests/benchmarks/reports \\ [--subset real_world,enterprise] \\ [--sample 50] \\ @@ -33,7 +33,7 @@ def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser(description=__doc__.splitlines()[1] if __doc__ else "") - parser.add_argument("--corpus", type=Path, default=Path("testBench"), + parser.add_argument("--corpus", type=Path, default=Path("data/corpora/spreadsheetbench"), help="Corpus directory containing .xlsx/.xlsm files.") parser.add_argument("--out", type=Path, default=Path("tests/benchmarks/reports"), help="Root directory for reports; a timestamped subdir is created.") diff --git a/tests/conftest.py b/tests/conftest.py index 85d21a0..d422b9b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,14 +22,9 @@ from openpyxl.worksheet.table import Table, TableStyleInfo # --------------------------------------------------------------------------- -# All-xlsx-files collection for cross-validation and invariant tests +# Programmatic fixture collection for cross-validation and invariant tests # --------------------------------------------------------------------------- -_PROJECT_ROOT = Path(__file__).parent.parent -_TESTBENCH_DIR = _PROJECT_ROOT / "testBench" -_EXAMPLES_DIR = _TESTBENCH_DIR / "real_world" -_DATASETS_DIR = _TESTBENCH_DIR / "github_datasets" - # Names of conftest fixtures that produce .xlsx files PROGRAMMATIC_FIXTURE_NAMES = [ "simple_workbook", @@ -69,33 +64,12 @@ ] -def collect_static_xlsx_files() -> list[Path]: - """Collect all static .xlsx files from examples and github_datasets.""" - files = [] - for d in [_EXAMPLES_DIR, 
_DATASETS_DIR]: - if d.exists(): - files.extend(sorted(d.glob("*.xlsx"))) - return files - - -STATIC_XLSX_FILES = collect_static_xlsx_files() - - @pytest.fixture(params=PROGRAMMATIC_FIXTURE_NAMES) def programmatic_xlsx(request, tmp_dir) -> Path: """Yields each programmatic fixture as a Path (re-uses other fixtures).""" return request.getfixturevalue(request.param) -@pytest.fixture( - params=STATIC_XLSX_FILES, - ids=[f.stem for f in STATIC_XLSX_FILES], -) -def static_xlsx(request) -> Path: - """Yields each static .xlsx file path.""" - return request.param - - @pytest.fixture def tmp_dir(): """Provide a temporary directory for test workbooks.""" diff --git a/tests/test_cross_validation.py b/tests/test_cross_validation.py deleted file mode 100644 index 94f73f1..0000000 --- a/tests/test_cross_validation.py +++ /dev/null @@ -1,334 +0,0 @@ -""" -Cross-validation tests comparing parser output against python-calamine. - -Calamine is a Rust-based Excel reader, completely independent from openpyxl. -These tests verify that our parser reads the same data that calamine does. 
-""" - - - -import datetime - -import pytest - -from pipeline import parse_workbook - -from tests.helpers.calamine_reader import CalamineResult -from tests.helpers.value_comparator import Mismatch, compare_cell_value, values_match - - -# --------------------------------------------------------------------------- -# Cross-validation on programmatic fixtures -# --------------------------------------------------------------------------- - - -@pytest.mark.crossval -class TestSheetNamesCrossVal: - """Verify sheet names match between parser and calamine.""" - - def test_sheet_names_match(self, programmatic_xlsx): - parser_result = parse_workbook(path=programmatic_xlsx) - calamine = CalamineResult.from_path(programmatic_xlsx) - - parser_names = [s.sheet_name for s in parser_result.workbook.sheets] - assert parser_names == calamine.sheet_names, ( - f"Sheet names differ:\n parser: {parser_names}\n" - f" calamine: {calamine.sheet_names}" - ) - - def test_sheet_count_match(self, programmatic_xlsx): - parser_result = parse_workbook(path=programmatic_xlsx) - calamine = CalamineResult.from_path(programmatic_xlsx) - assert len(parser_result.workbook.sheets) == len(calamine.sheet_names) - - -@pytest.mark.crossval -class TestCellValuesCrossVal: - """Verify cell values match between parser and calamine.""" - - def test_non_formula_values_match(self, programmatic_xlsx): - parser_result = parse_workbook(path=programmatic_xlsx) - calamine = CalamineResult.from_path(programmatic_xlsx) - mismatches = _collect_mismatches(parser_result, calamine, formula_cells=False) - assert len(mismatches) == 0, ( - f"{len(mismatches)} non-formula value mismatches:\n" - + _format_mismatches(mismatches[:10]) - ) - - def test_formula_computed_values_match(self, programmatic_xlsx): - """For formula cells with cached values, parser's formula_value should - match calamine's computed value. 
Programmatic fixtures often have no - cached values, so we use a lenient threshold.""" - parser_result = parse_workbook(path=programmatic_xlsx) - calamine = CalamineResult.from_path(programmatic_xlsx) - mismatches = _collect_mismatches(parser_result, calamine, formula_cells=True) - - total_formulas = sum( - 1 for s in parser_result.workbook.sheets - for c in s.cells.values() - if c.formula - ) - # Allow up to 100% mismatch for programmatic fixtures (no cached values) - # This test is more meaningful for real-world files - if total_formulas > 0 and len(mismatches) > 0: - rate = len(mismatches) / total_formulas - # Only fail if we have actual cached values but they don't match - hard_mismatches = [ - m for m in mismatches - if m.parser_value is not None and m.calamine_value is not None - ] - assert len(hard_mismatches) == 0, ( - f"{len(hard_mismatches)} formula value mismatches " - f"(with cached values):\n" - + _format_mismatches(hard_mismatches[:10]) - ) - - -@pytest.mark.crossval -class TestDimensionsCrossVal: - """Verify dimensions roughly match between parser and calamine.""" - - def test_row_count_similar(self, programmatic_xlsx): - parser_result = parse_workbook(path=programmatic_xlsx) - calamine = CalamineResult.from_path(programmatic_xlsx) - - for sheet in parser_result.workbook.sheets: - cal_sheet = calamine.sheets.get(sheet.sheet_name) - if not cal_sheet or not sheet.used_range: - continue - parser_rows = sheet.used_range.row_count() - # calamine total_height is the total row count of the sheet - # For comparison, use the data area (start/end) - if cal_sheet.start is not None and cal_sheet.end is not None: - cal_rows = cal_sheet.end[0] - cal_sheet.start[0] + 1 - # Allow ±2 row difference (calamine may include trailing empty rows) - assert abs(parser_rows - cal_rows) <= 2, ( - f"Sheet '{sheet.sheet_name}' row count: " - f"parser={parser_rows}, calamine={cal_rows}" - ) - - def test_column_count_similar(self, programmatic_xlsx): - parser_result = 
parse_workbook(path=programmatic_xlsx) - calamine = CalamineResult.from_path(programmatic_xlsx) - - for sheet in parser_result.workbook.sheets: - cal_sheet = calamine.sheets.get(sheet.sheet_name) - if not cal_sheet or not sheet.used_range: - continue - parser_cols = sheet.used_range.col_count() - if cal_sheet.start is not None and cal_sheet.end is not None: - cal_cols = cal_sheet.end[1] - cal_sheet.start[1] + 1 - assert abs(parser_cols - cal_cols) <= 2, ( - f"Sheet '{sheet.sheet_name}' col count: " - f"parser={parser_cols}, calamine={cal_cols}" - ) - - -@pytest.mark.crossval -class TestMergedRegionsCrossVal: - """Verify merged regions match between parser and calamine.""" - - def test_merged_region_count(self, programmatic_xlsx): - parser_result = parse_workbook(path=programmatic_xlsx) - calamine = CalamineResult.from_path(programmatic_xlsx) - - for sheet in parser_result.workbook.sheets: - cal_sheet = calamine.sheets.get(sheet.sheet_name) - if not cal_sheet or cal_sheet.merged_ranges is None: - continue - parser_count = len(sheet.merged_regions) - cal_count = len(cal_sheet.merged_ranges) - assert parser_count == cal_count, ( - f"Sheet '{sheet.sheet_name}' merge count: " - f"parser={parser_count}, calamine={cal_count}" - ) - - def test_merged_region_ranges(self, programmatic_xlsx): - parser_result = parse_workbook(path=programmatic_xlsx) - calamine = CalamineResult.from_path(programmatic_xlsx) - - for sheet in parser_result.workbook.sheets: - cal_sheet = calamine.sheets.get(sheet.sheet_name) - if not cal_sheet or cal_sheet.merged_ranges is None: - continue - # Convert calamine ranges to comparable format - # calamine: ((start_row, start_col), (end_row, end_col)) 0-indexed - cal_ranges = set() - for (sr, sc), (er, ec) in cal_sheet.merged_ranges: - cal_ranges.add((sr + 1, sc + 1, er + 1, ec + 1)) - - parser_ranges = set() - for region in sheet.merged_regions: - parser_ranges.add(( - region.range.top_left.row, - region.range.top_left.col, - 
region.range.bottom_right.row, - region.range.bottom_right.col, - )) - - assert parser_ranges == cal_ranges, ( - f"Sheet '{sheet.sheet_name}' merge ranges differ:\n" - f" parser: {sorted(parser_ranges)}\n" - f" calamine: {sorted(cal_ranges)}" - ) - - -@pytest.mark.crossval -class TestMismatchRateCrossVal: - """Overall mismatch rate must be below threshold.""" - - def test_overall_mismatch_rate(self, programmatic_xlsx): - parser_result = parse_workbook(path=programmatic_xlsx) - calamine = CalamineResult.from_path(programmatic_xlsx) - mismatches = _collect_mismatches( - parser_result, calamine, formula_cells=False - ) - total_cells = sum( - s.cell_count() for s in parser_result.workbook.sheets - ) - if total_cells > 0: - rate = len(mismatches) / total_cells - assert rate < 0.01, ( - f"Mismatch rate {rate:.1%} ({len(mismatches)}/{total_cells}) " - f"exceeds 1% threshold" - ) - - -# --------------------------------------------------------------------------- -# Cross-validation on static files (examples + github datasets) -# --------------------------------------------------------------------------- - - -@pytest.mark.crossval -class TestSheetNamesStatic: - def test_sheet_names_match(self, static_xlsx): - parser_result = parse_workbook(path=static_xlsx) - calamine = CalamineResult.from_path(static_xlsx) - parser_names = [s.sheet_name for s in parser_result.workbook.sheets] - assert parser_names == calamine.sheet_names - - -@pytest.mark.crossval -class TestCellValuesStatic: - def test_non_formula_values_match(self, static_xlsx): - parser_result = parse_workbook(path=static_xlsx) - calamine = CalamineResult.from_path(static_xlsx) - mismatches = _collect_mismatches(parser_result, calamine, formula_cells=False) - total_cells = sum(s.cell_count() for s in parser_result.workbook.sheets) - if total_cells > 0: - rate = len(mismatches) / total_cells - assert rate < 0.01, ( - f"{static_xlsx.name}: {len(mismatches)}/{total_cells} " - f"({rate:.1%}) mismatches:\n" - + 
_format_mismatches(mismatches[:10]) - ) - - def test_formula_cached_values_match(self, static_xlsx): - """For real-world files, formula cached values should match calamine. - - Threshold: <5% mismatch overall. A handful of files with highly nested - dynamic-array or volatile formulas are known to exceed this because - openpyxl doesn't always surface the latest cached value Excel wrote — - we allow up to 15% for those, tracked in docs/PARSER_KNOWN_ISSUES.md. - """ - known_loose_files = { - "Walbridge Coatings 8.9.23.xlsx", # openpyxl cached-value gap - } - threshold = 0.15 if static_xlsx.name in known_loose_files else 0.05 - - parser_result = parse_workbook(path=static_xlsx) - calamine = CalamineResult.from_path(static_xlsx) - mismatches = _collect_mismatches(parser_result, calamine, formula_cells=True) - hard_mismatches = [ - m for m in mismatches - if m.parser_value is not None and m.calamine_value is not None - ] - total_formulas = sum( - 1 for s in parser_result.workbook.sheets - for c in s.cells.values() - if c.formula - ) - if total_formulas > 0 and len(hard_mismatches) > 0: - rate = len(hard_mismatches) / total_formulas - assert rate < threshold, ( - f"{static_xlsx.name}: {len(hard_mismatches)}/{total_formulas} " - f"formula mismatches ({rate:.1%}, threshold {threshold:.0%}):\n" - + _format_mismatches(hard_mismatches[:10]) - ) - - -@pytest.mark.crossval -class TestDimensionsStatic: - def test_dimensions_similar(self, static_xlsx): - parser_result = parse_workbook(path=static_xlsx) - calamine = CalamineResult.from_path(static_xlsx) - for sheet in parser_result.workbook.sheets: - cal_sheet = calamine.sheets.get(sheet.sheet_name) - if not cal_sheet or not sheet.used_range: - continue - if cal_sheet.start is not None and cal_sheet.end is not None: - parser_rows = sheet.used_range.row_count() - cal_rows = cal_sheet.end[0] - cal_sheet.start[0] + 1 - # Allow ±5 for real-world files (empty trailing rows) - assert abs(parser_rows - cal_rows) <= 5, ( - 
f"{static_xlsx.name} sheet '{sheet.sheet_name}' rows: " - f"parser={parser_rows}, calamine={cal_rows}" - ) - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -def _collect_mismatches( - parser_result, - calamine: CalamineResult, - formula_cells: bool, -) -> list[Mismatch]: - """Collect all mismatches between parser and calamine.""" - mismatches = [] - for sheet in parser_result.workbook.sheets: - cal_sheet = calamine.sheets.get(sheet.sheet_name) - if not cal_sheet: - continue - - for cell in sheet.cells.values(): - # Filter by formula/non-formula - if formula_cells and not cell.formula: - continue - if not formula_cells and cell.formula: - continue - - # Skip merged slaves - if cell.is_merged_slave: - continue - - cal_val = cal_sheet.get_value(cell.coord.row, cell.coord.col) - - if not compare_cell_value(cell, cal_val): - parser_val = ( - cell.formula_value if cell.formula else cell.raw_value - ) - mismatches.append(Mismatch( - sheet=sheet.sheet_name, - row=cell.coord.row, - col=cell.coord.col, - a1_ref=cell.a1_ref, - parser_value=parser_val, - calamine_value=cal_val, - category="formula" if cell.formula else "value", - )) - - return mismatches - - -def _format_mismatches(mismatches: list[Mismatch]) -> str: - """Format mismatch list for error messages.""" - lines = [] - for m in mismatches: - lines.append( - f" {m.a1_ref}: parser={m.parser_value!r} ({type(m.parser_value).__name__}) " - f"vs calamine={m.calamine_value!r} ({type(m.calamine_value).__name__})" - ) - return "\n".join(lines) diff --git a/tests/test_enterprise_scoring.py b/tests/test_enterprise_scoring.py deleted file mode 100644 index 55b5cb4..0000000 --- a/tests/test_enterprise_scoring.py +++ /dev/null @@ -1,149 +0,0 @@ -"""Enterprise-focused scoring of parser output on synthetic fixtures. 
- -These tests provide lightweight, deterministic benchmarks that run without -network access. They exercise formulas, tables, cross-sheet references, -named ranges, hidden sheets, and simple calculation lineage. -""" - - - -import json -from pathlib import Path - -import pytest - -from ks_xlsx_parser import parse_workbook - -from scripts.generate_enterprise_fixtures import generate_all - - -ROOT = Path(__file__).resolve().parents[1] -FIXTURE_DIR = ROOT / "testBench" / "enterprise" - - -@pytest.fixture(scope="session") -def enterprise_workbooks() -> list[Path]: - """Generate (or reuse) enterprise fixtures and return their paths.""" - return generate_all() - - -class EnterpriseScorecard: - def __init__(self, parse_result, expected_metadata=None): - self.result = parse_result - self.expected = expected_metadata or {} - - def formula_fidelity(self) -> float: - workbook = self.result.workbook - extracted = 0 - total = 0 - for sheet in workbook.sheets: - for cell in sheet.cells.values(): - if cell.formula: - total += 1 - if cell.formula_value is not None or cell.raw_value is not None: - extracted += 1 - return extracted / total if total else 0.0 - - def table_detection_f1(self) -> float: - detected = len(self.result.workbook.tables) - expected = self.expected.get("expected_tables", detected) - if expected == 0 and detected == 0: - return 1.0 - precision = detected / max(detected, 1) - recall = detected / max(expected, 1) - return 2 * (precision * recall) / (precision + recall + 1e-10) - - def lineage_accuracy(self) -> float: - graph = self.result.workbook.dependency_graph - edges = len(graph.edges) - cycles = 0 # DependencyGraph does not expose cycles directly - accuracy = 1.0 - (cycles / (edges + 1)) * 0.1 - return max(accuracy, 0.0) - - def chunk_quality(self) -> float: - chunks = self.result.chunks - tokens = [c.token_count for c in chunks] - if not tokens: - return 0.0 - mean_tokens = sum(tokens) / len(tokens) - variance = sum((t - mean_tokens) ** 2 for t in tokens) 
/ len(tokens) - std_dev = variance ** 0.5 - cv = std_dev / (mean_tokens + 1e-10) - return max(1.0 - cv, 0.0) - - def layout_recovery(self) -> float: - blocks_by_type = {} - for chunk in self.result.chunks: - blocks_by_type[chunk.block_type] = blocks_by_type.get(chunk.block_type, 0) + 1 - type_count = len(blocks_by_type) - return min(type_count / 3.0, 1.0) - - def composite_score(self): - weights = { - "formula_fidelity": 0.25, - "table_detection": 0.20, - "lineage_accuracy": 0.20, - "chunk_quality": 0.20, - "layout_recovery": 0.15, - } - scores = { - "formula_fidelity": self.formula_fidelity(), - "table_detection": self.table_detection_f1(), - "lineage_accuracy": self.lineage_accuracy(), - "chunk_quality": self.chunk_quality(), - "layout_recovery": self.layout_recovery(), - } - composite = sum(scores[k] * weights[k] for k in weights) - return scores, composite - - def metrics(self): - scores, composite = self.composite_score() - scores["composite"] = composite - return scores - - -@pytest.mark.enterprise -@pytest.mark.parametrize( - "filename,expected", - [ - ("financial_model.xlsx", {"expected_tables": 0, "expected_formulas": 2}), - ("inventory_tracker.xlsx", {"expected_tables": 0, "expected_formulas": 100}), - ("forecast_model.xlsx", {"expected_tables": 0, "expected_formulas": 24}), - ("operations_tracker.xlsx", {"expected_tables": 0, "expected_formulas": 20}), - ], -) -def test_enterprise_scorecard(enterprise_workbooks, filename, expected): - path = FIXTURE_DIR / filename - assert path.exists(), f"Fixture missing: {path}" - - result = parse_workbook(path=path) - scorecard = EnterpriseScorecard(result, expected_metadata=expected) - scores, composite = scorecard.composite_score() - - metrics_dir = ROOT / "metrics" / "corpus" - metrics_dir.mkdir(parents=True, exist_ok=True) - with open(metrics_dir / f"{path.stem}_scorecard.json", "w") as f: - json.dump(scorecard.metrics(), f, indent=2) - - print(scorecard.metrics()) - assert composite >= 0.45, f"Composite 
{composite:.2%} too low for {filename}" - - -@pytest.mark.enterprise -def test_enterprise_summary(enterprise_workbooks): - paths = enterprise_workbooks - results = [] - for p in paths: - result = parse_workbook(path=p) - scorecard = EnterpriseScorecard(result) - scores = scorecard.metrics() - scores["file"] = p.name - results.append(scores) - - metrics_dir = ROOT / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - summary_path = metrics_dir / "corpus_summary.json" - with open(summary_path, "w") as f: - json.dump({"files": results}, f, indent=2) - - assert len(results) == len(paths) diff --git a/tests/test_real_world_datasets.py b/tests/test_real_world_datasets.py deleted file mode 100644 index d10905a..0000000 --- a/tests/test_real_world_datasets.py +++ /dev/null @@ -1,433 +0,0 @@ -""" -Tests against real-world Excel datasets from GitHub. - -Source: https://github.com/rohanmistry231/Practice-Datasets-for-Excel - -Validates that the parser produces correct, complete JSON output for -a variety of public datasets covering different shapes, sizes, and -content types (numeric, text, dates, mixed). 
-""" - - - -import json -from pathlib import Path - -import pytest - -from chunking.segmenter import LayoutSegmenter -from models import BlockType -from parsers import WorkbookParser -from pipeline import parse_workbook -from storage.serializer import WorkbookSerializer - - -FIXTURES_DIR = Path(__file__).parent.parent / "testBench" / "github_datasets" - -# Each entry: (filename, expected_sheets, expected_min_rows, expected_header_sample) -DATASET_CATALOG = [ - ("iris.xlsx", 1, 150, ["sepal_length", "sepal_width", "petal_length"]), - ("titanic.xlsx", 1, 891, ["PassengerId", "Survived", "Pclass"]), - ("boston.xlsx", 1, 506, ["CRIM", "ZN", "INDUS"]), - ("world_happiness_2019.xlsx", 1, 156, ["Overall rank", "Country or region", "Score"]), - ("bestsellers.xlsx", 1, 550, ["Name", "Author", "User Rating"]), - ("superstore.xlsx", 3, 1952, ["Row ID", "Order Priority", "Discount"]), - ("worldcups.xlsx", 1, 20, ["Year", "Country", "Winner"]), - ("breast_cancer.xlsx", 1, 569, ["id", "diagnosis", "radius_mean"]), - ("apple_stock.xlsx", 1, 10016, ["Date", "Open", "High"]), - ("winequality_red.xlsx", 1, 1599, None), # semicolon-separated header, skip header check -] - - -def _fixture_path(name: str) -> Path: - return FIXTURES_DIR / name - - -# --------------------------------------------------------------------------- -# Parametrized: every dataset parses without error -# --------------------------------------------------------------------------- - - -@pytest.mark.parametrize( - "filename,expected_sheets,expected_min_rows,expected_headers", - DATASET_CATALOG, - ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG], -) -class TestDatasetParsing: - """Core parsing validation across all datasets.""" - - def test_parses_without_error(self, filename, expected_sheets, expected_min_rows, expected_headers): - """Parser completes without raising an exception.""" - result = parse_workbook(path=_fixture_path(filename)) - assert result.workbook is not None - - def 
test_correct_sheet_count(self, filename, expected_sheets, expected_min_rows, expected_headers): - """Workbook has the expected number of sheets.""" - result = parse_workbook(path=_fixture_path(filename)) - assert len(result.workbook.sheets) == expected_sheets - - def test_minimum_data_rows(self, filename, expected_sheets, expected_min_rows, expected_headers): - """First sheet has at least the expected number of data rows.""" - result = parse_workbook(path=_fixture_path(filename)) - sheet = result.workbook.sheets[0] - if sheet.used_range: - data_rows = sheet.used_range.row_count() - 1 # minus header row - assert data_rows >= expected_min_rows - - def test_headers_detected(self, filename, expected_sheets, expected_min_rows, expected_headers): - """First row contains the expected column headers.""" - if expected_headers is None: - pytest.skip("Header check skipped for this dataset") - result = parse_workbook(path=_fixture_path(filename)) - sheet = result.workbook.sheets[0] - first_row = sheet.used_range.top_left.row - actual_headers = [] - for col in range(sheet.used_range.top_left.col, sheet.used_range.bottom_right.col + 1): - cell = sheet.get_cell(first_row, col) - if cell and cell.raw_value is not None: - actual_headers.append(str(cell.raw_value)) - for expected in expected_headers: - assert expected in actual_headers, ( - f"Expected header '{expected}' not found in {actual_headers[:10]}" - ) - - def test_produces_chunks(self, filename, expected_sheets, expected_min_rows, expected_headers): - """Pipeline produces at least one chunk per sheet.""" - result = parse_workbook(path=_fixture_path(filename)) - assert result.total_chunks >= expected_sheets - - -# --------------------------------------------------------------------------- -# JSON serialization -# --------------------------------------------------------------------------- - - -class TestJsonSerialization: - """Verify JSON output is valid, complete, and contains expected fields.""" - - 
@pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG], - ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG]) - def test_to_json_valid(self, filename): - """to_json() returns a dict that round-trips through json.dumps/loads.""" - result = parse_workbook(path=_fixture_path(filename)) - data = result.to_json() - json_str = json.dumps(data) - roundtripped = json.loads(json_str) - assert roundtripped["total_chunks"] == result.total_chunks - - @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG], - ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG]) - def test_to_json_has_required_keys(self, filename): - """JSON output contains all required top-level keys.""" - result = parse_workbook(path=_fixture_path(filename)) - data = result.to_json() - assert "workbook" in data - assert "chunks" in data - assert "total_chunks" in data - assert "total_tokens" in data - - @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG], - ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG]) - def test_workbook_metadata_in_json(self, filename): - """Workbook section has all required metadata fields.""" - result = parse_workbook(path=_fixture_path(filename)) - wb_json = result.to_json()["workbook"] - assert wb_json["workbook_id"] - assert wb_json["filename"] - assert wb_json["workbook_hash"] - assert isinstance(wb_json["total_sheets"], int) - assert isinstance(wb_json["total_cells"], int) - assert isinstance(wb_json["errors"], list) - - @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG], - ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG]) - def test_chunk_json_has_required_keys(self, filename): - """Each chunk in JSON has all required fields.""" - result = parse_workbook(path=_fixture_path(filename)) - for chunk in result.to_json()["chunks"]: - assert "chunk_id" in chunk - assert "source_uri" in chunk - assert "sheet_name" in chunk - assert "block_type" in chunk - assert "top_left" in chunk - assert 
"bottom_right" in chunk - assert "render_text" in chunk - assert chunk["render_text"] # not empty - - @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG], - ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG]) - def test_chunk_render_text_contains_data(self, filename): - """Rendered text in chunks contains actual cell data, not just structure.""" - result = parse_workbook(path=_fixture_path(filename)) - sheet = result.workbook.sheets[0] - # Get a data value from the sheet (short values to avoid semicolon-delimited lines) - if sheet.used_range: - first_data_row = sheet.used_range.top_left.row + 1 - for col in range(sheet.used_range.top_left.col, sheet.used_range.bottom_right.col + 1): - cell = sheet.get_cell(first_data_row, col) - if cell and cell.display_value and 2 < len(str(cell.display_value)) <= 30: - # At least one chunk should contain this value - found = any( - str(cell.display_value) in c.render_text - for c in result.chunks - ) - assert found, f"Value '{cell.display_value}' not found in any chunk render_text" - return - pytest.skip("No suitable data value found to check") - - -# --------------------------------------------------------------------------- -# Serializer records (Postgres-ready) -# --------------------------------------------------------------------------- - - -class TestSerializerRecords: - """Verify WorkbookSerializer produces valid storage records.""" - - @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG], - ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG]) - def test_workbook_record(self, filename): - """Workbook record has all required fields for Postgres.""" - result = parse_workbook(path=_fixture_path(filename)) - serializer = WorkbookSerializer(result.workbook, result.chunks) - rec = serializer.to_workbook_record() - assert rec["id"] - assert rec["file_hash"] - assert rec["filename"] - assert isinstance(rec["total_sheets"], int) - assert isinstance(rec["total_cells"], int) - # Ensure 
JSON-serializable - json.dumps(rec) - - @pytest.mark.parametrize( - "filename,expected_sheets", - [(d[0], d[1]) for d in DATASET_CATALOG], - ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG], - ) - def test_sheet_records_count(self, filename, expected_sheets): - """Correct number of sheet records produced.""" - result = parse_workbook(path=_fixture_path(filename)) - serializer = WorkbookSerializer(result.workbook, result.chunks) - sheets = serializer.to_sheet_records() - assert len(sheets) == expected_sheets - for s in sheets: - assert s["sheet_name"] - assert s["workbook_id"] - json.dumps(s) - - @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG], - ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG]) - def test_chunk_records(self, filename): - """Chunk records are valid and JSON-serializable.""" - result = parse_workbook(path=_fixture_path(filename)) - serializer = WorkbookSerializer(result.workbook, result.chunks) - chunks = serializer.to_chunk_records() - assert len(chunks) >= 1 - for c in chunks: - assert c["id"] - assert c["sheet_name"] - assert c["block_type"] - assert c["render_text"] - json.dumps(c) - - @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG], - ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG]) - def test_vector_store_entries(self, filename): - """Vector store entries have text and metadata for embedding.""" - result = parse_workbook(path=_fixture_path(filename)) - serializer = WorkbookSerializer(result.workbook, result.chunks) - entries = serializer.to_vector_store_entries() - assert len(entries) >= 1 - for e in entries: - assert e["id"] - assert e["text"] - assert e["metadata"]["workbook_hash"] - assert e["metadata"]["sheet_name"] - assert e["metadata"]["source_uri"] - json.dumps(e) - - -# --------------------------------------------------------------------------- -# Layout detection on real data -# --------------------------------------------------------------------------- - - -class 
TestRealWorldLayout: - """Verify layout segmentation works correctly on real datasets.""" - - @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG], - ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG]) - def test_blocks_have_valid_ranges(self, filename): - """All detected blocks have non-degenerate cell ranges.""" - result = WorkbookParser(path=_fixture_path(filename)).parse() - for sheet in result.sheets: - tables = [t for t in result.tables if t.sheet_name == sheet.sheet_name] - segmenter = LayoutSegmenter(sheet, tables=tables) - blocks = segmenter.segment() - for block in blocks: - assert block.cell_range is not None - assert block.cell_range.row_count() >= 1 - assert block.cell_range.col_count() >= 1 - assert block.cell_count > 0 - - @pytest.mark.parametrize("filename", [d[0] for d in DATASET_CATALOG], - ids=[d[0].replace(".xlsx", "") for d in DATASET_CATALOG]) - def test_blocks_have_valid_types(self, filename): - """All block types are valid BlockType enum values.""" - result = WorkbookParser(path=_fixture_path(filename)).parse() - for sheet in result.sheets: - tables = [t for t in result.tables if t.sheet_name == sheet.sheet_name] - segmenter = LayoutSegmenter(sheet, tables=tables) - blocks = segmenter.segment() - valid_types = set(BlockType) - for block in blocks: - assert block.block_type in valid_types - - def test_superstore_multi_sheet_layout(self): - """SuperStore has 3 sheets, each producing at least one block.""" - result = WorkbookParser(path=_fixture_path("superstore.xlsx")).parse() - assert len(result.sheets) == 3 - for sheet in result.sheets: - tables = [t for t in result.tables if t.sheet_name == sheet.sheet_name] - segmenter = LayoutSegmenter(sheet, tables=tables) - blocks = segmenter.segment() - assert len(blocks) >= 1, f"Sheet '{sheet.sheet_name}' has no blocks" - - def test_world_happiness_has_table(self): - """World Happiness dataset has an Excel ListObject table.""" - result = 
WorkbookParser(path=_fixture_path("world_happiness_2019.xlsx")).parse() - assert len(result.tables) >= 1 - table = result.tables[0] - assert table.table_name - assert table.ref_range is not None - - -# --------------------------------------------------------------------------- -# Determinism on real data -# --------------------------------------------------------------------------- - - -class TestRealWorldDeterminism: - """Parsing the same file twice produces identical output.""" - - @pytest.mark.parametrize("filename", ["iris.xlsx", "worldcups.xlsx", "bestsellers.xlsx"], - ids=["iris", "worldcups", "bestsellers"]) - def test_deterministic_json(self, filename): - """Two parses of the same file produce identical JSON (excluding timing).""" - r1 = parse_workbook(path=_fixture_path(filename)) - r2 = parse_workbook(path=_fixture_path(filename)) - j1 = r1.to_json() - j2 = r2.to_json() - # parse_duration_ms varies between runs; exclude from comparison - j1["workbook"]["parse_duration_ms"] = 0 - j2["workbook"]["parse_duration_ms"] = 0 - assert json.dumps(j1, sort_keys=True) == json.dumps(j2, sort_keys=True) - - @pytest.mark.parametrize("filename", ["iris.xlsx", "worldcups.xlsx", "bestsellers.xlsx"], - ids=["iris", "worldcups", "bestsellers"]) - def test_deterministic_hashes(self, filename): - """Chunk IDs and content hashes are stable across runs.""" - r1 = parse_workbook(path=_fixture_path(filename)) - r2 = parse_workbook(path=_fixture_path(filename)) - assert r1.total_chunks == r2.total_chunks - for c1, c2 in zip(r1.chunks, r2.chunks): - assert c1.chunk_id == c2.chunk_id - assert c1.content_hash == c2.content_hash - - -# --------------------------------------------------------------------------- -# Specific dataset content validation -# --------------------------------------------------------------------------- - - -class TestDatasetContent: - """Spot-check specific known values in well-known datasets.""" - - def test_iris_species_values(self): - """Iris dataset 
contains known species names.""" - result = parse_workbook(path=_fixture_path("iris.xlsx")) - sheet = result.workbook.sheets[0] - species_col = None - # Find the species column - for col in range(1, 20): - cell = sheet.get_cell(1, col) - if cell and cell.raw_value == "species": - species_col = col - break - assert species_col is not None, "species column not found" - # Check known species - species_values = set() - for row in range(2, 152): - cell = sheet.get_cell(row, species_col) - if cell and cell.raw_value: - species_values.add(cell.raw_value) - assert "setosa" in species_values - assert "versicolor" in species_values - assert "virginica" in species_values - - def test_worldcups_has_known_winners(self): - """WorldCups dataset contains known World Cup winners.""" - result = parse_workbook(path=_fixture_path("worldcups.xlsx")) - sheet = result.workbook.sheets[0] - winner_col = None - for col in range(1, 20): - cell = sheet.get_cell(1, col) - if cell and cell.raw_value == "Winner": - winner_col = col - break - assert winner_col is not None, "Winner column not found" - winners = set() - for row in range(2, 25): - cell = sheet.get_cell(row, winner_col) - if cell and cell.raw_value: - winners.add(cell.raw_value) - assert "Brazil" in winners - assert "Germany" in winners - - def test_titanic_numeric_columns(self): - """Titanic dataset has numeric columns (Survived, Pclass, Age).""" - result = parse_workbook(path=_fixture_path("titanic.xlsx")) - sheet = result.workbook.sheets[0] - # Check Survived column has 0/1 values - survived_col = None - for col in range(1, 30): - cell = sheet.get_cell(1, col) - if cell and cell.raw_value == "Survived": - survived_col = col - break - assert survived_col is not None - cell_val = sheet.get_cell(2, survived_col) - assert cell_val is not None - assert cell_val.raw_value in (0, 1, 0.0, 1.0) - - def test_apple_stock_date_column(self): - """Apple stock dataset has a Date column with date values.""" - result = 
parse_workbook(path=_fixture_path("apple_stock.xlsx")) - sheet = result.workbook.sheets[0] - date_col = None - for col in range(1, 10): - cell = sheet.get_cell(1, col) - if cell and cell.raw_value == "Date": - date_col = col - break - assert date_col is not None - # Check that at least one date cell has a date-like display value - date_cell = sheet.get_cell(2, date_col) - assert date_cell is not None - assert date_cell.display_value is not None - - def test_superstore_multiple_sheets_content(self): - """SuperStore has Orders, Returns, and Users sheets with distinct content.""" - result = parse_workbook(path=_fixture_path("superstore.xlsx")) - sheet_names = {s.sheet_name for s in result.workbook.sheets} - assert "Orders" in sheet_names - assert "Returns" in sheet_names - assert "Users" in sheet_names - - # Orders sheet should be large - orders = next(s for s in result.workbook.sheets if s.sheet_name == "Orders") - assert orders.cell_count() > 40000 - - # Users sheet should be small - users = next(s for s in result.workbook.sheets if s.sheet_name == "Users") - assert users.cell_count() <= 20 diff --git a/tests/test_structural_invariants.py b/tests/test_structural_invariants.py index f11a467..612d11a 100644 --- a/tests/test_structural_invariants.py +++ b/tests/test_structural_invariants.py @@ -278,29 +278,3 @@ def test_sheet_ids_populated(self, programmatic_xlsx): ) -# --------------------------------------------------------------------------- -# Same invariants on static files (examples + github datasets) -# --------------------------------------------------------------------------- - - -@pytest.mark.invariant -class TestAllInvariantsStatic: - """Run full invariant checker against each static xlsx file.""" - - def test_all_invariants_pass(self, static_xlsx): - result = parse_workbook(path=static_xlsx) - violations = check_invariants(result.workbook) - assert len(violations) == 0, ( - f"{len(violations)} violations in {static_xlsx.name}:\n" - + 
"\n".join(violations[:10]) - ) - - def test_deterministic_hashes(self, static_xlsx): - r1 = parse_workbook(path=static_xlsx) - r2 = parse_workbook(path=static_xlsx) - assert r1.workbook.workbook_hash == r2.workbook.workbook_hash - - def test_json_serializable(self, static_xlsx): - result = parse_workbook(path=static_xlsx) - data = result.to_json() - json.dumps(data) # must not raise diff --git a/tests/test_testbench_roundtrip.py b/tests/test_testbench_roundtrip.py deleted file mode 100644 index bfa4fd7..0000000 --- a/tests/test_testbench_roundtrip.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -testBench round-trip tests. - -Parses every .xlsx under ``testBench/`` and asserts: - -* ``parse_workbook()`` returns without raising. -* ``result.to_json()`` produces non-empty JSON (> 100 bytes). -* ``result.workbook`` has at least one sheet. - -Failures are collected into ``metrics/testbench/failures.json`` so parser -regressions across the whole bench are easy to diff. - -Runs under the ``testbench`` marker only (skipped by default). Invoke with: - - pytest tests/test_testbench_roundtrip.py -m testbench -q - make testbench # convenience wrapper -""" - - -import json -import os -import traceback -from pathlib import Path - -import pytest - -from ks_xlsx_parser import parse_workbook - -ROOT = Path(__file__).resolve().parent.parent -TESTBENCH_DIR = ROOT / "testBench" -METRICS_DIR = ROOT / "metrics" / "testbench" -FAILURES_PATH = METRICS_DIR / "failures.json" -FAILURES_JSONL = METRICS_DIR / "failures.jsonl" # append-only, xdist-safe - - -def _collect_files() -> list[Path]: - if not TESTBENCH_DIR.exists(): - return [] - return sorted(TESTBENCH_DIR.rglob("*.xlsx")) - - -ALL_FILES = _collect_files() - -pytestmark = [pytest.mark.testbench, pytest.mark.timeout(60)] - - -def _record_failure(entry: dict) -> None: - """Append one failure row to the JSONL log. 
Safe under xdist parallelism.""" - METRICS_DIR.mkdir(parents=True, exist_ok=True) - entry["worker"] = os.environ.get("PYTEST_XDIST_WORKER", "main") - with FAILURES_JSONL.open("a", encoding="utf-8") as f: - f.write(json.dumps(entry) + "\n") - - -@pytest.fixture(scope="session", autouse=True) -def _reset_log(): - """Reset the append log at the start of the session (master worker only).""" - # Under xdist, PYTEST_XDIST_WORKER is set for workers but not the master. - # The master is responsible for cleanup before workers start writing. - if os.environ.get("PYTEST_XDIST_WORKER") is None: - METRICS_DIR.mkdir(parents=True, exist_ok=True) - if FAILURES_JSONL.exists(): - FAILURES_JSONL.unlink() - yield - # After session, aggregate JSONL → JSON summary (master only) - if os.environ.get("PYTEST_XDIST_WORKER") is None: - failures: list[dict] = [] - if FAILURES_JSONL.exists(): - for line in FAILURES_JSONL.read_text().splitlines(): - if line.strip(): - failures.append(json.loads(line)) - FAILURES_PATH.write_text( - json.dumps( - {"total": len(ALL_FILES), "failure_count": len(failures), "failures": failures}, - indent=2, - ) - ) - - -def _relpath(p: Path) -> str: - return str(p.relative_to(ROOT)) - - -@pytest.mark.parametrize("path", ALL_FILES, ids=lambda p: _relpath(p)) -def test_parse_roundtrip(path: Path): - """Each workbook must parse, serialize to JSON, and report ≥1 sheet.""" - try: - result = parse_workbook(path=path) - except Exception as exc: - _record_failure({ - "file": _relpath(path), - "stage": "parse", - "error": f"{type(exc).__name__}: {exc}", - "traceback": traceback.format_exc(limit=5), - }) - raise - - assert result.workbook is not None, f"no workbook DTO for {path}" - assert result.workbook.total_sheets >= 1, f"{path} reports zero sheets" - - try: - js = result.to_json() - except Exception as exc: - _record_failure({ - "file": _relpath(path), - "stage": "to_json", - "error": f"{type(exc).__name__}: {exc}", - "traceback": traceback.format_exc(limit=5), - }) - 
raise - - assert isinstance(js, dict), f"to_json returned non-dict for {path}" - assert "workbook" in js, f"to_json result missing 'workbook' key for {path}" - try: - encoded = json.dumps(js, default=str) - except Exception as exc: - _record_failure({ - "file": _relpath(path), - "stage": "json_encode", - "error": f"{type(exc).__name__}: {exc}", - "traceback": traceback.format_exc(limit=5), - }) - raise - assert len(encoded) > 100, f"encoded JSON suspiciously short ({len(encoded)} chars) for {path}" - - -def test_testbench_has_files(): - """Guard against an empty testBench (e.g. missing dataset zip).""" - assert ALL_FILES, ( - f"No .xlsx files found under {TESTBENCH_DIR}. " - "Run `make testbench-build` or download the dataset zip from the GitHub release." - )