diff --git a/CHANGELOG.md b/CHANGELOG.md index 81c9420..a733dd0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,35 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.1.3] - 2026-05-10 + +### Changed + +- **Validate stage no longer hard-fails on row/schema_hash drift by + default.** A mismatch now emits a `[WARN]` line to stderr and the build + continues. Users invoking `python -m scripts.pipeline.build <slug>` have + already opted into "fetch whatever is upstream now"; an upstream + Arrow-conversion bump or a slightly-grown row count shouldn't turn that into a + failed build. Pass `--strict` (new flag on `scripts.pipeline.build`) to + upgrade warnings to errors — recommended for CI / pre-release gates. +- The previous `--loose` flag has been removed; its behaviour (warn, don't + raise) is now the default. Migrate `--loose` invocations by dropping the + flag entirely; replace any "default-strict" CI invocations with + `--strict`. + +### Fixed + +- **`validate.py` now compares `expect.schema_hash` as a prefix when the + manifest value is shorter than the full 64-char SHA-256.** All 37 slugs + with `schema_hash` set in `sources.json` use a 12-char short hash + (matching the `[validate] schema_hash=` print convention, akin to git + short SHAs); the previous full-string equality made every one of them + fail validation on rebuild. Equal-length values still use strict + equality, so full hashes remain enforceable for callers that prefer + them. +- `sources.schema.md` updated to document the prefix-match rule and the + new warn-vs-`--strict` semantics for the `expect` block. + ## [0.1.2] - 2026-05-10 ### Fixed @@ -106,6 +135,7 @@ This release bundles: this repository" button in the repo sidebar with BibTeX / APA / Chicago exports. 
+[0.1.3]: https://github.com/spiraldb/raincloud/releases/tag/v0.1.3 [0.1.2]: https://github.com/spiraldb/raincloud/releases/tag/v0.1.2 [0.1.1]: https://github.com/spiraldb/raincloud/releases/tag/v0.1.1 [0.1.0]: https://github.com/spiraldb/raincloud/releases/tag/v0.1.0 diff --git a/pyproject.toml b/pyproject.toml index 7487b5c..f280c3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "raincloud" -version = "0.1.2" +version = "0.1.3" description = "Client-reproducible pipeline for building a curated catalog of public datasets as Parquet + Vortex files." readme = "README.md" requires-python = ">=3.11" diff --git a/scripts/pipeline/build.py b/scripts/pipeline/build.py index e91c650..8f4c6b5 100644 --- a/scripts/pipeline/build.py +++ b/scripts/pipeline/build.py @@ -3,10 +3,17 @@ """End-to-end orchestrator. Runs fetch → extract → parse → transform → write → validate for one or more datasets selected from sources.json. +By default the validate stage treats row/schema_hash drift as a warning +(`[WARN]` to stderr) and continues — users invoking a build already opted +into "download whatever's upstream now," so an upstream Arrow-conversion +bump shouldn't brick their build. Pass `--strict` to upgrade those +warnings to errors; that's the recommended setting for CI / pre-release +gates where drift should block. 
+ Examples: python -m scripts.pipeline.build clickbench-hits python -m scripts.pipeline.build --family uci - python -m scripts.pipeline.build --all --loose + python -m scripts.pipeline.build --all --strict # CI mode """ from __future__ import annotations @@ -57,7 +64,9 @@ def main() -> int: ap.add_argument("slugs", nargs="*", help="specific slugs to build") ap.add_argument("--family", help="build all datasets in this family") ap.add_argument("--all", action="store_true", help="build every dataset") - ap.add_argument("--loose", action="store_true", help="warn on validation failures instead of erroring") + ap.add_argument("--strict", action="store_true", + help="upgrade validate-stage drift warnings to hard errors " + "(off by default; recommended for CI / pre-release gates)") ap.add_argument("--clean-workdir", action="store_true", help="after each successful build, remove _workdir// " "so large decompressed intermediates (e.g. Public BI bz2→csv) " @@ -80,7 +89,7 @@ def main() -> int: ok = failed = 0 for spec in selected: - if run_one(spec, strict=not args.loose, clean_workdir=args.clean_workdir): + if run_one(spec, strict=args.strict, clean_workdir=args.clean_workdir): ok += 1 else: failed += 1 diff --git a/scripts/pipeline/validate.py b/scripts/pipeline/validate.py index 71747ed..cc1fa63 100644 --- a/scripts/pipeline/validate.py +++ b/scripts/pipeline/validate.py @@ -1,9 +1,25 @@ # SPDX-FileCopyrightText: 2026 Raincloud Maintainers # SPDX-License-Identifier: Apache-2.0 -"""Stage 6 — compare actual parquet to the `expect` block.""" +"""Stage 6 — compare actual parquet to the `expect` block. + +Drift is treated as a signal, not an error: a row count or schema_hash +mismatch emits a `[WARN]` line but does NOT abort the build. The intent +is that users who ran `build <slug>` already opted into "download +whatever's currently upstream"; brittle equality checks against a +manifest captured weeks ago shouldn't turn an HF Arrow-conversion bump +into a failed build. 
Pass `strict=True` (CLI `--strict`) to opt back +into hard failures — useful for CI / pre-release gates. + +Schema-hash comparison is prefix-aware: `expect.schema_hash` may be the +full 64-char SHA-256 or a short prefix (the manifest convention is 12 +chars, matching the `schema_hash=` line printed by this stage). Equal- +length values use strict equality; a shorter expected acts as a prefix +match on the computed hash. +""" from __future__ import annotations import hashlib +import sys from pathlib import Path import pyarrow.parquet as pq @@ -11,7 +27,19 @@ from .spec import spec_field -def validate(spec: dict, written: list[Path], *, strict: bool = True) -> list[dict]: +def _schema_hash_matches(actual: str, expected: str | None) -> bool: + """Strict equality when lengths match; prefix match when expected is + shorter. Manifest entries are typically the 12-char short form.""" + if expected is None: + return True + if len(expected) == len(actual): + return actual == expected + if len(expected) < len(actual): + return actual.startswith(expected) + return False + + +def validate(spec: dict, written: list[Path], *, strict: bool = False) -> list[dict]: results = [] for p in written: md = pq.ParquetFile(p).metadata @@ -20,7 +48,7 @@ def validate(spec: dict, written: list[Path], *, strict: bool = True) -> list[di ok_rows = expected_rows is None or actual_rows == expected_rows schema_hash = _schema_hash(pq.ParquetFile(p).schema_arrow) expected_hash = spec_field(spec, "expect.schema_hash") - ok_schema = expected_hash is None or schema_hash == expected_hash + ok_schema = _schema_hash_matches(schema_hash, expected_hash) result = { "path": str(p), "rows_ok": ok_rows, @@ -29,8 +57,18 @@ def validate(spec: dict, written: list[Path], *, strict: bool = True) -> list[di "schema_hash": schema_hash, } results.append(result) - if strict and not (ok_rows and ok_schema): - raise AssertionError(f"validation failed for {p}: {result}") + if not (ok_rows and ok_schema): + if not ok_rows: + 
print(f"[WARN] {p.name}: rows drift " + f"(expected={expected_rows:,} actual={actual_rows:,})", + file=sys.stderr) + if not ok_schema: + exp_disp = expected_hash if expected_hash else "—" + print(f"[WARN] {p.name}: schema_hash drift " + f"(expected={exp_disp} actual={schema_hash[:12]})", + file=sys.stderr) + if strict: + raise AssertionError(f"validation failed for {p}: {result}") print(f"[validate] {p.name} rows={actual_rows:,} " f"(expected={expected_rows}) schema_hash={schema_hash[:12]}") return results diff --git a/sources.schema.md b/sources.schema.md index 820ec93..cf3c516 100644 --- a/sources.schema.md +++ b/sources.schema.md @@ -83,8 +83,12 @@ This document defines the shape of `sources.json`, the manifest that drives the /* Stage 6 — validate (scripts/pipeline/validate.py) */ "expect": { - "rows": 99997497, // exact; pipeline errors on mismatch unless --loose - "schema_hash": null, // optional; SHA-256 of canonicalised Arrow schema + "rows": 99997497, // exact; mismatch emits [WARN], does not fail unless --strict + "schema_hash": null, // optional; SHA-256 of canonicalised Arrow schema. + // May be the full 64-char hex or a leading prefix + // (manifest convention is 12 chars, matching the + // schema_hash= line printed by the validate stage). + // Mismatch emits [WARN] only; pass --strict to fail. "notes": null },