Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,35 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.3] - 2026-05-10

### Changed

- **Validate stage no longer hard-fails on row/schema_hash drift by
default.** A mismatch now emits a `[WARN]` line to stderr and the build
continues. Users invoking `python -m scripts.pipeline.build <slug>` have
already opted into "fetch whatever is upstream now"; an upstream Arrow-
conversion bump or a slightly-grown row count shouldn't turn that into a
failed build. Pass `--strict` (new flag on `scripts.pipeline.build`) to
upgrade warnings to errors — recommended for CI / pre-release gates.
- The previous `--loose` flag has been removed; its behaviour (warn, don't
raise) is now the default. To migrate, drop `--loose` from existing
invocations, and add `--strict` to any CI invocations that relied on the
old strict-by-default behaviour.

### Fixed

- **`validate.py` now compares `expect.schema_hash` as a prefix when the
manifest value is shorter than the full 64-char SHA-256.** All 37 slugs
with `schema_hash` set in `sources.json` use a 12-char short hash
(matching the `[validate] schema_hash=` print convention, akin to git
short SHAs); the previous full-string equality made every one of them
fail validation on rebuild. Equal-length values still use strict
equality, so full hashes remain enforceable for callers that prefer
them.
- `sources.schema.md` updated to document the prefix-match rule and the
new warn-vs-`--strict` semantics for the `expect` block.

## [0.1.2] - 2026-05-10

### Fixed
Expand Down Expand Up @@ -106,6 +135,7 @@ This release bundles:
this repository" button in the repo sidebar with BibTeX / APA / Chicago
exports.

[0.1.3]: https://github.com/spiraldb/raincloud/releases/tag/v0.1.3
[0.1.2]: https://github.com/spiraldb/raincloud/releases/tag/v0.1.2
[0.1.1]: https://github.com/spiraldb/raincloud/releases/tag/v0.1.1
[0.1.0]: https://github.com/spiraldb/raincloud/releases/tag/v0.1.0
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "raincloud"
version = "0.1.2"
version = "0.1.3"
description = "Client-reproducible pipeline for building a curated catalog of public datasets as Parquet + Vortex files."
readme = "README.md"
requires-python = ">=3.11"
Expand Down
15 changes: 12 additions & 3 deletions scripts/pipeline/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,17 @@
"""End-to-end orchestrator. Runs fetch → extract → parse → transform →
write → validate for one or more datasets selected from sources.json.

By default the validate stage treats row/schema_hash drift as a warning
(`[WARN]` to stderr) and continues — users invoking a build already opted
into "download whatever's upstream now," so an upstream Arrow-conversion
bump shouldn't brick their build. Pass `--strict` to upgrade those
warnings to errors; that's the recommended setting for CI / pre-release
gates where drift should block.

Examples:
python -m scripts.pipeline.build clickbench-hits
python -m scripts.pipeline.build --family uci
python -m scripts.pipeline.build --all --loose
python -m scripts.pipeline.build --all --strict # CI mode
"""
from __future__ import annotations

Expand Down Expand Up @@ -57,7 +64,9 @@ def main() -> int:
ap.add_argument("slugs", nargs="*", help="specific slugs to build")
ap.add_argument("--family", help="build all datasets in this family")
ap.add_argument("--all", action="store_true", help="build every dataset")
ap.add_argument("--loose", action="store_true", help="warn on validation failures instead of erroring")
ap.add_argument("--strict", action="store_true",
help="upgrade validate-stage drift warnings to hard errors "
"(off by default; recommended for CI / pre-release gates)")
ap.add_argument("--clean-workdir", action="store_true",
help="after each successful build, remove _workdir/<slug>/ "
"so large decompressed intermediates (e.g. Public BI bz2→csv) "
Expand All @@ -80,7 +89,7 @@ def main() -> int:

ok = failed = 0
for spec in selected:
if run_one(spec, strict=not args.loose, clean_workdir=args.clean_workdir):
if run_one(spec, strict=args.strict, clean_workdir=args.clean_workdir):
ok += 1
else:
failed += 1
Expand Down
48 changes: 43 additions & 5 deletions scripts/pipeline/validate.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,45 @@
# SPDX-FileCopyrightText: 2026 Raincloud Maintainers
# SPDX-License-Identifier: Apache-2.0
"""Stage 6 — compare actual parquet to the `expect` block."""
"""Stage 6 — compare actual parquet to the `expect` block.

Drift is treated as a signal, not an error: a row count or schema_hash
mismatch emits a `[WARN]` line but does NOT abort the build. The intent
is that users who ran `build <slug>` already opted into "download
whatever's currently upstream"; brittle equality checks against a
manifest captured weeks ago shouldn't turn an HF Arrow-conversion bump
into a failed build. Pass `strict=True` (CLI `--strict`) to opt back
into hard failures — useful for CI / pre-release gates.

Schema-hash comparison is prefix-aware: `expect.schema_hash` may be the
full 64-char SHA-256 or a short prefix (the manifest convention is 12
chars, matching the `schema_hash=` line printed by this stage). Equal-
length values use strict equality; a shorter expected acts as a prefix
match on the computed hash.
"""
from __future__ import annotations

import hashlib
import sys
from pathlib import Path

import pyarrow.parquet as pq

from .spec import spec_field


def validate(spec: dict, written: list[Path], *, strict: bool = True) -> list[dict]:
def _schema_hash_matches(actual: str, expected: str | None) -> bool:
"""Strict equality when lengths match; prefix match when expected is
shorter. Manifest entries are typically the 12-char short form."""
if expected is None:
return True
if len(expected) == len(actual):
return actual == expected
if len(expected) < len(actual):
return actual.startswith(expected)
return False


def validate(spec: dict, written: list[Path], *, strict: bool = False) -> list[dict]:
results = []
for p in written:
md = pq.ParquetFile(p).metadata
Expand All @@ -20,7 +48,7 @@ def validate(spec: dict, written: list[Path], *, strict: bool = True) -> list[di
ok_rows = expected_rows is None or actual_rows == expected_rows
schema_hash = _schema_hash(pq.ParquetFile(p).schema_arrow)
expected_hash = spec_field(spec, "expect.schema_hash")
ok_schema = expected_hash is None or schema_hash == expected_hash
ok_schema = _schema_hash_matches(schema_hash, expected_hash)
result = {
"path": str(p),
"rows_ok": ok_rows,
Expand All @@ -29,8 +57,18 @@ def validate(spec: dict, written: list[Path], *, strict: bool = True) -> list[di
"schema_hash": schema_hash,
}
results.append(result)
if strict and not (ok_rows and ok_schema):
raise AssertionError(f"validation failed for {p}: {result}")
if not (ok_rows and ok_schema):
if not ok_rows:
print(f"[WARN] {p.name}: rows drift "
f"(expected={expected_rows:,} actual={actual_rows:,})",
file=sys.stderr)
if not ok_schema:
exp_disp = expected_hash if expected_hash else "—"
print(f"[WARN] {p.name}: schema_hash drift "
f"(expected={exp_disp} actual={schema_hash[:12]})",
file=sys.stderr)
if strict:
raise AssertionError(f"validation failed for {p}: {result}")
print(f"[validate] {p.name} rows={actual_rows:,} "
f"(expected={expected_rows}) schema_hash={schema_hash[:12]}")
return results
Expand Down
8 changes: 6 additions & 2 deletions sources.schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,12 @@ This document defines the shape of `sources.json`, the manifest that drives the

/* Stage 6 — validate (scripts/pipeline/validate.py) */
"expect": {
"rows": 99997497, // exact; pipeline errors on mismatch unless --loose
"schema_hash": null, // optional; SHA-256 of canonicalised Arrow schema
"rows": 99997497, // exact; mismatch emits [WARN], does not fail unless --strict
"schema_hash": null, // optional; SHA-256 of canonicalised Arrow schema.
// May be the full 64-char hex or a leading prefix
// (manifest convention is 12 chars, matching the
// schema_hash= line printed by the validate stage).
// Mismatch emits [WARN] only; pass --strict to fail.
"notes": null
},

Expand Down
Loading