From c1a7bcf6157c6ff8cf3c01961993e1da8e1b83d7 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Tue, 9 Jun 2026 15:36:34 -0500 Subject: [PATCH 1/4] Harden pre-v0.4 release and manifest gates Signed-off-by: Nelson Spence --- .github/workflows/release.yml | 40 +++++- RELEASING.md | 51 ++++--- THREAT_MODEL.md | 12 +- docs/INDEX_PROVENANCE.md | 4 +- ordvec-manifest/README.md | 3 +- ordvec-manifest/src/lib.rs | 22 ++- ordvec-manifest/src/sqlite.rs | 52 +++++-- ordvec-manifest/tests/manifest.rs | 157 ++++++++++++++++++++- src/bitmap.rs | 1 + src/quant.rs | 15 +- src/sign_bitmap.rs | 3 + tests/index/finite.rs | 26 ++++ tests/redteam_delta.rs | 16 +++ tests/release_publish_invariants.py | 83 +++++++++++ tests/release_signed_release_invariants.sh | 4 + 15 files changed, 440 insertions(+), 49 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ea5baee..28a182c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -169,6 +169,12 @@ jobs: SHA: ${{ github.sha }} run: | set -euo pipefail + MAIN_SHA="$(gh api "repos/${REPO}/commits/main" --jq '.sha')" + echo "current main sha: ${MAIN_SHA}" + if [ "$SHA" != "$MAIN_SHA" ]; then + echo "::error::release tag points at ${SHA}, but current main is ${MAIN_SHA}. Move the tag to the current protected main HEAD and re-run." + exit 1 + fi # Require a SUCCESSFUL push run for this SHA *on main* for each workflow. # Filtering on branch as well as head_sha stops a green run for the same # commit on an unrelated branch from satisfying the gate. 
@@ -1058,12 +1064,42 @@ jobs: exit 1 fi echo "OK: byte-identity verified ($A_SHA)" + - name: Check for existing ordvec .crate recovery + id: crate_recovery + env: + VERSION: ${{ needs.guard.outputs.version }} + run: | + set -euo pipefail + ATTESTED="${RUNNER_TEMP}/attested/ordvec-${VERSION}.crate" + [ -f "$ATTESTED" ] || { echo "::error::attested .crate missing at $ATTESTED"; exit 1; } + A_SHA=$(sha256sum "$ATTESTED" | cut -d' ' -f1) + API_URL="https://crates.io/api/v1/crates/ordvec/${VERSION}/download" + STATIC_URL="https://static.crates.io/crates/ordvec/ordvec-${VERSION}.crate" + CRATES_IO_USER_AGENT="ordvec-release-verify/${VERSION} (https://github.com/Fieldnote-Echo/ordvec)" + EXISTING="${RUNNER_TEMP}/existing-ordvec.crate" + if curl -fsSL --user-agent "$CRATES_IO_USER_AGENT" "$API_URL" -o "$EXISTING" \ + || curl -fsSL --user-agent "$CRATES_IO_USER_AGENT" "$STATIC_URL" -o "$EXISTING"; then + E_SHA=$(sha256sum "$EXISTING" | cut -d' ' -f1) + echo "attested: $A_SHA" + echo "crates.io-served: $E_SHA" + if [ "$A_SHA" != "$E_SHA" ]; then + echo "::error::crates.io already serves ordvec ${VERSION}, but the served .crate is not byte-identical to the SLSA-attested artifact ($E_SHA != $A_SHA). Refusing recovery." + exit 1 + fi + echo "already_published=true" >> "$GITHUB_OUTPUT" + echo "::notice::crates.io already serves byte-identical ordvec ${VERSION}; skipping upload and verifying served bytes." + else + echo "already_published=false" >> "$GITHUB_OUTPUT" + echo "No existing ordvec ${VERSION} .crate found on crates.io; proceeding with publish." + fi # Mint the short-lived crates.io credential immediately before publish so # the ephemeral token's exposure window is minimal. No stored secret. 
- name: Mint a short-lived crates.io credential (OIDC) + if: steps.crate_recovery.outputs.already_published != 'true' id: auth uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4 - name: cargo publish + if: steps.crate_recovery.outputs.already_published != 'true' env: CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }} run: cargo publish -p ordvec --locked @@ -1398,7 +1434,7 @@ jobs: publish-manifest-pypi: name: publish ordvec-manifest to PyPI - needs: [guard, pypi-manifest-canonical-dist, release-manifest-assets-draft] + needs: [guard, pypi-manifest-canonical-dist, release-manifest-assets-draft, publish-manifest-crate] if: needs.guard.outputs.ok == 'true' runs-on: ubuntu-latest environment: @@ -1441,7 +1477,7 @@ jobs: publish-pypi: name: publish to PyPI - needs: [guard, pypi-canonical-dist, release-assets-draft] + needs: [guard, pypi-canonical-dist, release-assets-draft, publish-crate] if: needs.guard.outputs.ok == 'true' runs-on: ubuntu-latest environment: diff --git a/RELEASING.md b/RELEASING.md index 9982628..fc9a763 100644 --- a/RELEASING.md +++ b/RELEASING.md @@ -8,9 +8,10 @@ > push. `ordvec` (the Rust crate), `ordvec-manifest` (the lockstep manifest verifier -crate), and `ordvec` on PyPI (the PyO3 wheel built from `ordvec-python/`) are -released by **pushing a `vMAJOR.MINOR.PATCH` tag** to a commit on `main`. The -release workflow handles build, canonical Python artifact selection, +crate), `ordvec` on PyPI (the PyO3 wheel built from `ordvec-python/`), and +`ordvec-manifest` on PyPI (the PyO3 wheel built from +`ordvec-manifest-python/`) are released by **pushing a `vMAJOR.MINOR.PATCH` tag** +to current `main` HEAD. The release workflow handles build, canonical Python artifact selection, attestation, SLSA provenance, Release-asset attach, and un-draft automatically; only the registry environment approvals are manual. 
@@ -21,13 +22,13 @@ The unified `release.yml`: - triggers on **tag push** (`v[0-9]*.[0-9]*.[0-9]*`); a strict-SemVer guard step rejects pre-release / leading-zero / non-SemVer tags so they wake the workflow but skip every job below the gate; -- runs a **`require-ci-green`** gate confirming the per-commit CI is green on - `main` for the tagged SHA — `ci.yml`, `python.yml`, `fuzz.yml`, `codeql.yml`, - `actionlint.yml`, `zizmor.yml` - (a *successful* run for that exact SHA on `main`); +- runs a **`require-ci-green`** gate confirming the tag points at current `main` + HEAD and that per-commit CI is green on `main` for that SHA — `ci.yml`, + `python.yml`, `fuzz.yml`, `codeql.yml`, `actionlint.yml`, `zizmor.yml` (a + *successful* run for that exact SHA on `main`); - publishes via **OIDC trusted publishing** (no long-lived crates.io / PyPI - tokens in the repo) for both Rust crates and the Python distribution; -- canonicalizes the Python dist before attestation and release upload: for a + tokens in the repo) for both Rust crates and both Python distributions; +- canonicalizes each Python dist before attestation and release upload: for a new PyPI version it uses the current run's wheels/sdist; if PyPI already owns that immutable version during recovery, it downloads the exact PyPI-served files, verifies their SHA-256 digests from PyPI JSON, and uses those bytes as @@ -69,7 +70,7 @@ The unified `release.yml`: `cargo package -p ordvec-manifest --locked` and byte-compares that output to the attested artifact before minting its own OIDC token; - **un-drafts the GitHub Release ONLY after `publish-crate`, - `publish-manifest-crate`, AND `publish-pypi` succeed** + `publish-manifest-crate`, `publish-pypi`, AND `publish-manifest-pypi` succeed** (`publish-github-release` is the sole un-draft point). If any publish fails or is skipped, the Release stays DRAFT — no public Release ever exists for a version the registries refused; @@ -116,7 +117,10 @@ filename. 
Until a record is updated, the corresponding gated publish fails requires an initial owner bootstrap before a new crate's Trusted Publisher can be configured, do that explicit maintainer-approved bootstrap before tagging. - **PyPI** → `ordvec` → Publishing → GitHub publisher: `workflow = release.yml`, - `environment = pypi`. + `environment = pypi`, project URL `https://pypi.org/p/ordvec`. +- **PyPI** → `ordvec-manifest` → Publishing → GitHub publisher: + `workflow = release.yml`, `environment = pypi`, project URL + `https://pypi.org/p/ordvec-manifest`. ### Tag and branch protection @@ -188,7 +192,7 @@ filename. Until a record is updated, the corresponding gated publish fails and accept only the stable release tag pattern. Separately verify the registry Trusted Publisher records by hand: crates.io must point both `ordvec` and `ordvec-manifest` to `release.yml` / `crates-io`, and PyPI must - point `ordvec` to `release.yml` / `pypi`. + point both `ordvec` and `ordvec-manifest` to `release.yml` / `pypi`. 6. Get the maintainer's explicit go to publish. 7. Push the version tag from `main` (signed): @@ -198,18 +202,20 @@ filename. Until a record is updated, the corresponding gated publish fails ``` `release.yml` triggers automatically. It builds the core `.crate`, wheels, - and sdist; selects the canonical Python dist (current build for a new PyPI - version, verified PyPI bytes for an existing immutable version); attests the - files this run can honestly attest (GitHub attestation store + + and sdist for both Python packages; selects the canonical Python dists + (current build for a new PyPI version, verified PyPI bytes for an existing + immutable version); attests the files this run can honestly attest (GitHub + attestation store + `*.sigstore.json`); generates SLSA `*.intoto.jsonl`; and stages the core and Python assets on the GitHub Release — **as a DRAFT**. 
After `publish-crate` succeeds, it builds, attests, generates SLSA provenance for, and stages the lockstep `ordvec-manifest` `.crate`, then pauses at the manifest registry environment gate. 8. **Approve each publish environment pause** in the Actions UI. There are - three registry publish jobs: `publish-crate`, `publish-manifest-crate`, and - `publish-pypi`. The two crates.io jobs use the same `crates-io` environment - and may require separate approvals; PyPI uses the `pypi` environment. + four registry publish jobs: `publish-crate`, `publish-manifest-crate`, + `publish-pypi`, and `publish-manifest-pypi`. The two crates.io jobs use the + same `crates-io` environment and may require separate approvals; the two PyPI + jobs use the `pypi` environment and may also require separate approvals. Required-reviewer approval is what authorises each registry push. - `publish-crate` and `publish-manifest-crate` first sha256-compare their repackaged `.crate` to the SLSA-attested artifact — if either diverges @@ -219,10 +225,11 @@ filename. Until a record is updated, the corresponding gated publish fails un-drafts the GitHub Release automatically. If one gate fails, the Release stays DRAFT — investigate and re-run from a fixed workflow rather than approving another registry into a partial state. - - `publish-pypi` either uploads the fresh canonical dist or, if PyPI already - serves that version, skips upload and verifies the existing files. In both - modes it compares every PyPI-served wheel/sdist SHA-256 digest against the - canonical `dist/` files before the GitHub Release can un-draft. + - `publish-pypi` and `publish-manifest-pypi` either upload their fresh + canonical dist or, if PyPI already serves that version, skip upload and + verify the existing files. In both modes they compare every PyPI-served + wheel/sdist SHA-256 digest against the canonical `dist/` files before the + GitHub Release can un-draft. 9. 
Verify each published artifact and its provenance: - crates.io / docs.rs for `ordvec` and `ordvec-manifest`; - PyPI (confirm the post-publish hash-verification log, optionally diff --git a/THREAT_MODEL.md b/THREAT_MODEL.md index f6574a7..6dcbaf3 100644 --- a/THREAT_MODEL.md +++ b/THREAT_MODEL.md @@ -307,11 +307,13 @@ trust model requires be pinned by version *tag*); `persist-credentials: false` on every checkout; `permissions: contents: read` default. The **release workflow** (`release.yml`) is tag-triggered with a strict-SemVer guard; build, GitHub attestation, SLSA provenance, Release-asset attach, and un-draft all -run automatically, while the two **`crates.io`** publish jobs (`ordvec` first, -then lockstep `ordvec-manifest`) and the **`pypi`** publish job are gated -behind GitHub Environments with **Required reviewers** (the only manual step). -It runs a `require-ci-green` gate against `main`, publishes via **OIDC trusted -publishing** (no long-lived registry tokens), and emits **SLSA build +run automatically, while the two **`crates.io`** publish jobs (`publish-crate` +for `ordvec` first, then `publish-manifest-crate` for lockstep +`ordvec-manifest`) and the two **`pypi`** publish jobs (`publish-pypi` and +`publish-manifest-pypi`) are gated behind GitHub Environments with **Required +reviewers** (the only manual step). It runs a `require-ci-green` gate against +current `main` HEAD, publishes via **OIDC trusted publishing** (no long-lived +registry tokens), and emits **SLSA build provenance** (`actions/attest-build-provenance` + a `slsa-github-generator` `*.intoto.jsonl` attached to the GitHub Release) **before** publish — a failed attestation fails the release closed. 
Each Rust publish job proves pre- and diff --git a/docs/INDEX_PROVENANCE.md b/docs/INDEX_PROVENANCE.md index aee9bbe..aaf82c6 100644 --- a/docs/INDEX_PROVENANCE.md +++ b/docs/INDEX_PROVENANCE.md @@ -108,7 +108,9 @@ OrdinalDB v0.1 should use `row_id_identity` for the ordvec vector row count and declare `ids.bin` as required auxiliary artifact name `ordinaldb.ids`. The OrdinalDB `u64` IDs remain caller-owned sidecar bytes. Do not model `ids.bin` as JSONL row identity: v1 JSONL row identity is UUID-only, and generic row-map -ID formats are deferred until there is a separate schema contract for them. +ID formats are deferred until there is a separate schema contract for them. The +reserved `row_identity.db` block is rejected in v1 because it is not byte-bound +or path-checked. When present, `encoder_distortion` records a scoped encoder geometry profile: source metric, embedding metric, lower/upper distortion-style bounds when diff --git a/ordvec-manifest/README.md b/ordvec-manifest/README.md index d294653..170c426 100644 --- a/ordvec-manifest/README.md +++ b/ordvec-manifest/README.md @@ -150,7 +150,8 @@ file as required auxiliary artifact name `ordinaldb.ids`. That makes the vector row count an ordvec invariant while leaving OrdinalDB's `u64` document IDs as a caller-owned sidecar. Do not encode `ids.bin` as `RowIdentity::Jsonl`: v1 JSONL row identity is UUID-oriented (`id_kind = "uuid"`), and generic row-map ID -formats are intentionally deferred. +formats are intentionally deferred. The reserved `row_identity.db` metadata +block is rejected in v1 because it is not byte-bound or path-checked. The unified JSON report carries per-sidecar audit fields. 
A successful auxiliary artifact verification includes the manifest path, resolved/canonical diff --git a/ordvec-manifest/src/lib.rs b/ordvec-manifest/src/lib.rs index f542748..901deed 100644 --- a/ordvec-manifest/src/lib.rs +++ b/ordvec-manifest/src/lib.rs @@ -379,6 +379,7 @@ fn validate_manifest_shape( path, sha256, id_kind, + db, .. } = &manifest.row_identity { @@ -400,6 +401,12 @@ fn validate_manifest_shape( "row_identity.id_kind must be uuid in v1", ); } + if db.is_some() { + report.error( + "row_identity_db_unsupported", + "row_identity.db is reserved for a future schema and is not verified in v1", + ); + } } validate_auxiliary_artifact_shape(manifest, limits, report); @@ -1839,7 +1846,9 @@ fn expected_profile_shape( ProfileParameterization::MarginalTopKFrequency => Some(vec![ordinalization.dim()]), ProfileParameterization::SignFrequency => Some(vec![ordinalization.dim()]), ProfileParameterization::BucketFrequency => match ordinalization { - CalibrationOrdinalization::Bucket { dim, bits } => Some(vec![*dim, 1usize << *bits]), + CalibrationOrdinalization::Bucket { dim, bits } if matches!(*bits, 1 | 2 | 4) => { + Some(vec![*dim, 1usize << *bits]) + } _ => None, }, ProfileParameterization::RankPositionFrequency => { @@ -3790,7 +3799,9 @@ fn validate_row_id_string( limits: &ResourceLimits, errors: &mut Vec, ) { + let mut structurally_invalid = false; if value.is_empty() { + structurally_invalid = true; push_report_issue_bounded( errors, limits, @@ -3799,6 +3810,7 @@ fn validate_row_id_string( ); } if value.contains('\0') { + structurally_invalid = true; push_report_issue_bounded( errors, limits, @@ -3806,6 +3818,14 @@ fn validate_row_id_string( format!("line {line_idx} {field} contains NUL"), ); } + if !structurally_invalid && Uuid::parse_str(value).is_err() { + push_report_issue_bounded( + errors, + limits, + format!("row_identity_{field}_invalid_uuid"), + format!("line {line_idx} {field} must be a UUID because row_identity.id_kind is uuid"), + ); + } } fn 
is_limit_issue_code(code: &str) -> bool { diff --git a/ordvec-manifest/src/sqlite.rs b/ordvec-manifest/src/sqlite.rs index 414cd30..a84f156 100644 --- a/ordvec-manifest/src/sqlite.rs +++ b/ordvec-manifest/src/sqlite.rs @@ -685,20 +685,44 @@ fn verification_reports_needs_migration(conn: &Connection) -> Result, _>>() .map_err(sqlite_err)?; - Ok(!columns.iter().any(|column| column == "report_id") - || !columns.iter().any(|column| column == "manifest_sha256") - || !columns - .iter() - .any(|column| column == "manifest_location_sha256") - || !columns - .iter() - .any(|column| column == "calibration_profile_sha256") - || !columns - .iter() - .any(|column| column == "auxiliary_artifacts_sha256") - || !columns - .iter() - .any(|column| column == "encoder_distortion_profile_sha256")) + let current_required = [ + "report_id", + "manifest_id", + "manifest_path", + "checked_at", + "ok", + "manifest_location_sha256", + "manifest_sha256", + "options_sha256", + "artifact_sha256", + "row_identity_sha256", + "calibration_profile_sha256", + "auxiliary_artifacts_sha256", + "encoder_distortion_profile_sha256", + "report_json", + ]; + if current_required + .iter() + .all(|required| columns.iter().any(|column| column == required)) + { + return Ok(false); + } + + let legacy_schema = [ + "manifest_id", + "manifest_path", + "checked_at", + "ok", + "report_json", + ]; + if columns.iter().map(String::as_str).eq(legacy_schema) { + return Ok(true); + } + + Err(ManifestError::invalid(format!( + "unsupported verification_reports schema {:?}; refusing destructive migration", + columns + ))) } fn sqlite_err(err: rusqlite::Error) -> ManifestError { diff --git a/ordvec-manifest/tests/manifest.rs b/ordvec-manifest/tests/manifest.rs index a3414d8..1762e47 100644 --- a/ordvec-manifest/tests/manifest.rs +++ b/ordvec-manifest/tests/manifest.rs @@ -1495,6 +1495,42 @@ fn calibration_encoder_identity_must_match_embedding() { } } +#[test] +fn calibration_invalid_bucket_bits_reports_without_panic() { + let 
temp = tempfile::tempdir().unwrap(); + let case = tempfile::tempdir_in(temp.path()).unwrap(); + let index = write_index_kind(case.path(), FixtureKind::RankQuant); + let manifest_path = case.path().join("manifest.json"); + let mut manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + let profile_hash = write_profile( + &case.path().join("bucket.f64"), + manifest.artifact.dim * std::mem::size_of::(), + ); + manifest.calibration = Some(weighted_calibration( + &manifest, + "bucket.f64", + profile_hash, + CalibrationOrdinalization::Bucket { + dim: manifest.artifact.dim, + bits: 255, + }, + ProfileParameterization::BucketFrequency, + vec![manifest.artifact.dim, 1], + )); + + let report = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + verify_manifest_with_base(manifest, case.path(), VerifyOptions::default()) + })) + .expect("invalid bucket bits must report errors instead of panicking"); + assert!(error_codes(&report).contains(&"calibration_ordinalization_artifact_mismatch")); +} + #[test] fn calibration_ordinalization_matches_artifact_formats() { let temp = tempfile::tempdir().unwrap(); @@ -2359,6 +2395,66 @@ fn jsonl_row_identity_is_strict_and_duplicate_ids_need_opt_in() { .any(|issue| issue.code == "row_identity_row_id_mismatch")); } +#[test] +fn jsonl_row_identity_rejects_non_uuid_ids() { + let temp = tempfile::tempdir().unwrap(); + let index = write_rankquant_index(temp.path(), 2); + let rows = temp.path().join("rows.jsonl"); + write_row_map(&rows, &[("doc-a", None), ("doc-b", Some("doc-a"))]); + let row_hash = sha256_file(&rows).unwrap(); + let manifest_path = temp.path().join("manifest.json"); + let mut manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + manifest.row_identity = RowIdentity::Jsonl { + path: "rows.jsonl".to_string(), + sha256: row_hash.sha256, + row_count: 2, + 
id_kind: "uuid".to_string(), + db: None, + }; + + let report = verify_manifest_with_base(manifest, temp.path(), VerifyOptions::default()); + let codes = error_codes(&report); + assert!(codes.contains(&"row_identity_db_id_invalid_uuid")); + assert!(codes.contains(&"row_identity_parent_id_invalid_uuid")); +} + +#[test] +fn jsonl_row_identity_rejects_reserved_db_metadata() { + let temp = tempfile::tempdir().unwrap(); + let index = write_rankquant_index(temp.path(), 1); + let rows = temp.path().join("rows.jsonl"); + write_row_map(&rows, &[("00000000-0000-0000-0000-000000000001", None)]); + let row_hash = sha256_file(&rows).unwrap(); + let manifest_path = temp.path().join("manifest.json"); + let mut manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + manifest.row_identity = RowIdentity::Jsonl { + path: "rows.jsonl".to_string(), + sha256: row_hash.sha256, + row_count: 1, + id_kind: "uuid".to_string(), + db: Some(ordvec_manifest::RowIdentityDb { + path: Some("/etc/passwd".to_string()), + table: Some("documents".to_string()), + id_column: Some("id".to_string()), + }), + }; + + let report = verify_manifest_with_base(manifest, temp.path(), VerifyOptions::default()); + assert!(error_codes(&report).contains(&"row_identity_db_unsupported")); +} + #[test] fn auxiliary_artifacts_verify_and_report_deterministically() { let root = tempfile::tempdir().unwrap(); @@ -2927,6 +3023,56 @@ fn verify_index_manifest_uses_explicit_index_override() { assert!(report.ok, "{:?}", report.errors); } +#[cfg(feature = "sqlite")] +#[test] +fn sqlite_refuses_to_migrate_unknown_verification_reports_table() { + use rusqlite::Connection; + + let temp = tempfile::tempdir().unwrap(); + let index = write_index(temp.path()); + let manifest_path = temp.path().join("manifest.json"); + let manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + 
.unwrap(); + fs::write( + &manifest_path, + serde_json::to_string_pretty(&manifest).unwrap(), + ) + .unwrap(); + let document = load_manifest_file(&manifest_path).unwrap(); + let db = temp.path().join("foreign.sqlite"); + let conn = Connection::open(&db).unwrap(); + conn.execute("CREATE TABLE verification_reports(id INTEGER)", []) + .unwrap(); + drop(conn); + + let err = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest_path, + VerifyOptions::default(), + true, + ) + .unwrap_err(); + assert!(err + .to_string() + .contains("unsupported verification_reports schema")); + + let conn = Connection::open(&db).unwrap(); + let columns = conn + .prepare("PRAGMA table_info(verification_reports)") + .unwrap() + .query_map([], |row| row.get::<_, String>(1)) + .unwrap() + .collect::, _>>() + .unwrap(); + assert_eq!(columns, vec!["id"]); +} + #[cfg(feature = "sqlite")] #[test] fn sqlite_cache_is_explicit_and_activation_reverifies_by_default() { @@ -3076,7 +3222,16 @@ fn sqlite_combined_verified_load_matrix_respects_limits_paths_and_cache() { let index = write_index_kind(&assets, FixtureKind::RankQuant); let row_map_path = assets.join("rows.jsonl"); - write_row_map(&row_map_path, &[("doc-a", None), ("doc-b", Some("doc-a"))]); + write_row_map( + &row_map_path, + &[ + ("00000000-0000-0000-0000-000000000001", None), + ( + "00000000-0000-0000-0000-000000000002", + Some("00000000-0000-0000-0000-000000000001"), + ), + ], + ); let required_path = assets.join("required-sidecar.json"); fs::write(&required_path, b"{\"required\":true}\n").unwrap(); let required_hash = sha256_file(&required_path).unwrap(); diff --git a/src/bitmap.rs b/src/bitmap.rs index 3e7c4f4..35ca962 100644 --- a/src/bitmap.rs +++ b/src/bitmap.rs @@ -234,6 +234,7 @@ impl Bitmap { /// cheap relative to the cost it saves at M ≥ 1000. 
#[must_use = "this scans the corpus to generate candidates; dropping the result discards that work"] pub fn top_m_candidates(&self, q: &[f32], m: usize) -> Vec { + assert_eq!(q.len(), self.dim); assert_all_finite(q); let m_eff = m.min(self.n_vectors); if m_eff == 0 { diff --git a/src/quant.rs b/src/quant.rs index ccd65bd..2c66403 100644 --- a/src/quant.rs +++ b/src/quant.rs @@ -596,7 +596,9 @@ impl RankQuant { /// `candidates` may contain duplicate global row IDs. Each candidate entry /// is scored independently, so duplicate IDs may produce duplicate returned /// global IDs. Callers that require unique hits should deduplicate the - /// candidate list before calling this method. + /// candidate list before calling this method. The candidate list length is + /// still bounded by `n_vectors`; this keeps duplicate-heavy inputs from + /// allocating more scratch space than a full-corpus scan. /// /// Uses the same AVX-512 → AVX2 → scalar dispatch as /// [`Self::search_asymmetric`] and the same centre-drop math, just @@ -621,6 +623,12 @@ impl RankQuant { ) -> (Vec, Vec) { assert_eq!(query.len(), self.dim); assert_all_finite(query); + assert!( + candidates.len() <= self.n_vectors, + "search_asymmetric_subset: candidate list length {} exceeds n_vectors {}; deduplicate repeated ids before calling", + candidates.len(), + self.n_vectors, + ); // Bounds-check candidate ids before the gather below indexes // `self.packed[src..src + bpv]` with `src = di * bpv`. An OOB id // otherwise surfaces as a cryptic slice-range panic; fail fast @@ -657,7 +665,10 @@ impl RankQuant { // Pack the candidate docs' bytes into a contiguous buffer so // the SIMD kernels can scan them as if they were a small dense // sub-index. Cost: m * bpv copy (small for typical m). 
- let mut sub_packed = vec![0u8; m * bpv]; + let sub_packed_len = m + .checked_mul(bpv) + .expect("search_asymmetric_subset: candidate scratch length overflows usize"); + let mut sub_packed = vec![0u8; sub_packed_len]; for (i, &di) in candidates.iter().enumerate() { let src = (di as usize) * bpv; sub_packed[i * bpv..(i + 1) * bpv].copy_from_slice(&self.packed[src..src + bpv]); diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs index 8cae9a1..2e3473c 100644 --- a/src/sign_bitmap.rs +++ b/src/sign_bitmap.rs @@ -158,6 +158,8 @@ impl SignBitmap { /// [`crate::Bitmap::top_m_candidates`]. #[must_use = "this scans the corpus to generate candidates; dropping the result discards that work"] pub fn top_m_candidates(&self, q: &[f32], m: usize) -> Vec { + assert_eq!(q.len(), self.dim); + crate::util::assert_all_finite(q); let m_eff = m.min(self.n_vectors); if m_eff == 0 { return Vec::new(); @@ -194,6 +196,7 @@ impl SignBitmap { let dim = self.dim; let batch = queries.len() / dim; assert_eq!(queries.len(), batch * dim); + crate::util::assert_all_finite(queries); let m_eff = m.min(self.n_vectors); if batch == 0 || m_eff == 0 { return vec![Vec::new(); batch]; diff --git a/tests/index/finite.rs b/tests/index/finite.rs index f2e3710..a2dfbef 100644 --- a/tests/index/finite.rs +++ b/tests/index/finite.rs @@ -36,6 +36,14 @@ fn bitmap_top_m_candidates_rejects_nan() { let _ = idx.top_m_candidates(&q, 16); } +#[test] +#[should_panic] +fn bitmap_top_m_candidates_zero_m_validates_query_len() { + let idx = Bitmap::new(D, D / 4); + let q = vec![0.1f32; D - 1]; + let _ = idx.top_m_candidates(&q, 0); +} + #[test] #[should_panic(expected = "non-finite")] fn sign_bitmap_build_query_rejects_neg_inf() { @@ -74,3 +82,21 @@ fn bitmap_build_query_bitmap_fp32_rejects_nan() { q[0] = f32::NAN; let _ = idx.build_query_bitmap_fp32(&q); } + +#[test] +#[should_panic(expected = "non-finite")] +fn sign_bitmap_top_m_candidates_zero_m_rejects_nan() { + let idx = SignBitmap::new(D); + let mut q = vec![0.1f32; 
D]; + q[0] = f32::NAN; + let _ = idx.top_m_candidates(&q, 0); +} + +#[test] +#[should_panic(expected = "non-finite")] +fn sign_bitmap_batched_zero_m_rejects_nan() { + let idx = SignBitmap::new(D); + let mut queries = vec![0.1f32; D * 2]; + queries[D] = f32::NAN; + let _ = idx.top_m_candidates_batched(&queries, 0); +} diff --git a/tests/redteam_delta.rs b/tests/redteam_delta.rs index 95b6da2..f891ad6 100644 --- a/tests/redteam_delta.rs +++ b/tests/redteam_delta.rs @@ -539,6 +539,22 @@ fn delta_c4_subset_dup_plus_oob_still_rejected() { let _ = idx.search_asymmetric_subset(&query, &[5, 999, 5], 3); } +/// DELTA-C5: duplicate ids are accepted, but a candidate list longer than the +/// corpus is not. This caps scratch-gather size for adversarial duplicate-heavy +/// lists while still allowing repeated ids within a bounded candidate budget. +#[test] +#[should_panic(expected = "candidate list length")] +fn delta_c5_subset_duplicate_overrun_list_rejected() { + let dim = 64; + let n = 2; + let corpus = make_corpus(8451, n, dim); + let mut idx = RankQuant::new(dim, 2); + idx.add(&corpus); + let query = make_corpus(8452, 1, dim); + + let _ = idx.search_asymmetric_subset(&query, &[0, 0, 0], 3); +} + // ===================================================================== // DELTA-D — empty-index / empty-input search paths. 
// ===================================================================== diff --git a/tests/release_publish_invariants.py b/tests/release_publish_invariants.py index d91d181..33bc4cd 100644 --- a/tests/release_publish_invariants.py +++ b/tests/release_publish_invariants.py @@ -547,6 +547,26 @@ def check_python_package_metadata() -> None: fail(".github/dependabot.yml must keep the Python NumPy floor comment at >=2.2") +def check_release_docs_include_manifest_pypi_lane() -> None: + releasing = read_text("RELEASING.md") + normalized_releasing = " ".join(releasing.split()) + for required in ( + "`ordvec-manifest` on PyPI", + "`publish-manifest-pypi`", + "four registry publish jobs", + "PyPI must point both `ordvec` and `ordvec-manifest`", + "https://pypi.org/p/ordvec-manifest", + ): + if " ".join(required.split()) not in normalized_releasing: + fail(f"RELEASING.md must mention {required!r}") + + threat_model = read_text("THREAT_MODEL.md") + normalized_threat_model = " ".join(threat_model.split()) + for required in ("`publish-manifest-pypi`", "two **`pypi`** publish jobs"): + if " ".join(required.split()) not in normalized_threat_model: + fail(f"THREAT_MODEL.md must mention {required!r}") + + def check_strict_release_tag_patterns(workflow: dict[str, Any], path: str) -> None: try: tag_pattern = read_toml_string_in_section("cliff.toml", "git", "tag_pattern") @@ -925,6 +945,12 @@ def check_release_security_gates(workflow: dict[str, Any], path: str) -> None: ) if found_gate_run is None or "event=push" not in found_gate_run or '.event == "push"' not in found_gate_run: fail(f"{path}: require-ci-green must require successful push workflow runs") + if ( + found_gate_run is None + or "repos/${REPO}/commits/main" not in found_gate_run + or "MAIN_SHA" not in found_gate_run + ): + fail(f"{path}: require-ci-green must verify the release tag points at current main") allowed_id_token_jobs = { "attest", @@ -1090,6 +1116,7 @@ def check_publish_pypi( canonical_job: str = 
"pypi-canonical-dist", canonical_artifact_name: str = "pypi-canonical-dist", project: str | None = None, + crate_publish_job: str = "publish-crate", ) -> None: jobs = mapping(workflow.get("jobs"), f"{path}: jobs") job = mapping(jobs.get(job_name), f"{path}: jobs.{job_name}") @@ -1097,6 +1124,8 @@ def check_publish_pypi( if not has_need(job, canonical_job): fail(f"{path}: {job_name} must need {canonical_job}") + if not has_need(job, crate_publish_job): + fail(f"{path}: {job_name} must need {crate_publish_job} to avoid a partial PyPI-first release") publish_steps: list[tuple[int, dict[str, Any]]] = [] canonical_downloads: list[tuple[int, dict[str, Any], dict[str, Any]]] = [] @@ -1174,6 +1203,7 @@ def check_publish_crate_job( publish_runs: list[tuple[int, str]] = [] publish_dry_runs: list[tuple[int, str]] = [] auth_steps: list[int] = [] + recovery_steps: list[tuple[int, dict[str, Any]]] = [] for index, raw_step in enumerate(steps): step = mapping(raw_step, f"{path}: jobs.{job_name}.steps[{index}]") @@ -1188,6 +1218,8 @@ def check_publish_crate_job( publish_runs.append((index, run)) if action_name(step) == "rust-lang/crates-io-auth-action": auth_steps.append(index) + if step.get("name") == f"Check for existing {package} .crate recovery": + recovery_steps.append((index, step)) if action_name(step) == "actions/download-artifact": with_block = step.get("with", {}) with_map = mapping(with_block, f"{path}: {step_label(index, step)} with") @@ -1232,6 +1264,55 @@ def check_publish_crate_job( if found_names != verify_step_names: fail(f"{path}: {job_name} must have both attested .crate verification steps") + recovery_id = "crate_recovery" if package == "ordvec" else "manifest_crate_recovery" + if len(recovery_steps) != 1: + fail(f"{path}: {job_name} must have exactly one first-publish recovery check") + recovery_index, recovery_step = recovery_steps[0] + if recovery_step.get("id") != recovery_id: + fail(f"{path}: {job_name} recovery step must have id {recovery_id}") + 
recovery_run = recovery_step.get("run") + if not isinstance(recovery_run, str): + fail(f"{path}: {job_name} recovery step must be a run step") + for required in ( + "already_published=true", + "already_published=false", + "Refusing recovery", + f"crates.io already serves byte-identical {package}", + ): + if required not in recovery_run: + fail(f"{path}: {job_name} recovery step must contain {required!r}") + for url_var in ("API_URL", "STATIC_URL"): + if not any( + has_shell_arg(words, shell_vars(url_var)) + and has_shell_option_value( + words, {"--user-agent", "-A"}, shell_vars("CRATES_IO_USER_AGENT") + ) + and has_shell_option_value(words, {"--output", "-o"}, shell_vars("EXISTING")) + for words in shell_curl_commands(recovery_run) + ): + fail( + f"{path}: {job_name} recovery step must curl ${url_var} " + "with CRATES_IO_USER_AGENT into $EXISTING" + ) + + protected_step_names = { + "Mint a short-lived crates.io credential (OIDC)", + "cargo publish", + } + if require_publish_dry_run: + protected_step_names.add("Validate manifest publish dry-run") + for index, raw_step in enumerate(steps): + step = mapping(raw_step, f"{path}: jobs.{job_name}.steps[{index}]") + name = step.get("name") + if name in protected_step_names: + if index < recovery_index: + fail(f"{path}: {name} must run after the {package} crate recovery check") + if step.get("if") != f"steps.{recovery_id}.outputs.already_published != 'true'": + fail( + f"{path}: {name} must be skipped when {package} crate recovery found " + "byte-identical existing bytes" + ) + if require_publish_dry_run: dry_run_index = publish_dry_runs[0][0] byte_identity_index = verify_step_indices["Verify byte-identity vs the attested .crate"] @@ -1557,6 +1638,7 @@ def main() -> None: check_release_compatibility_sync() check_publication_model() check_python_package_metadata() + check_release_docs_include_manifest_pypi_lane() check_strict_release_tag_patterns(workflow, WORKFLOW_PATH) check_package_contents() 
check_ci_package_guards(ci_workflow, CI_WORKFLOW_PATH) @@ -1587,6 +1669,7 @@ def main() -> None: canonical_job="pypi-manifest-canonical-dist", canonical_artifact_name="pypi-manifest-canonical-dist", project="ordvec-manifest", + crate_publish_job="publish-manifest-crate", ) check_sde_cache_invariants() diff --git a/tests/release_signed_release_invariants.sh b/tests/release_signed_release_invariants.sh index e709f1a..7d3df87 100755 --- a/tests/release_signed_release_invariants.sh +++ b/tests/release_signed_release_invariants.sh @@ -257,6 +257,10 @@ printf '%s\n' "$body" | grep -qE '^[[:space:]]+id-token:[[:space:]]*write' \ || fail "publish-manifest-pypi must grant \`id-token: write\` (Trusted Publishing OIDC)" job_needs publish-manifest-pypi release-manifest-assets-draft \ || fail "publish-manifest-pypi must \`needs: release-manifest-assets-draft\`" +job_needs publish-manifest-pypi publish-manifest-crate \ + || fail "publish-manifest-pypi must \`needs: publish-manifest-crate\` (manifest crate publishes before manifest PyPI)" +job_needs publish-pypi publish-crate \ + || fail "publish-pypi must \`needs: publish-crate\` (core crate publishes before core PyPI)" # ---------------------------------------------------------------------- # (9) Rust crate publish jobs prove byte-identity vs the attested .crate on BOTH From cdd6163cd55547c634456c51f5f2a9369341b293 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Tue, 9 Jun 2026 15:44:02 -0500 Subject: [PATCH 2/4] Harden PyPI canonical dist coverage Signed-off-by: Nelson Spence --- .github/workflows/release.yml | 32 +++++- tests/release_publish_invariants.py | 14 +++ tests/release_pypi_canonical_dist.py | 123 +++++++++++++++++++-- tests/release_pypi_canonical_dist_tests.py | 114 +++++++++++++++++++ 4 files changed, 270 insertions(+), 13 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 28a182c..73a4bca 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ 
-704,7 +704,13 @@ jobs: python3 tests/release_pypi_canonical_dist.py canonicalize \ --version "$VERSION" \ --built-dir built-dist \ - --out-dir canonical-dist + --out-dir canonical-dist \ + --expected-wheels 4 \ + --expected-sdists 1 \ + --required-wheel-tag x86_64 \ + --required-wheel-tag aarch64 \ + --required-wheel-tag macosx \ + --required-wheel-tag win_amd64 - name: Upload the canonical Python dist uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: @@ -751,7 +757,13 @@ jobs: --project ordvec-manifest \ --version "$VERSION" \ --built-dir built-dist \ - --out-dir canonical-dist + --out-dir canonical-dist \ + --expected-wheels 4 \ + --expected-sdists 1 \ + --required-wheel-tag x86_64 \ + --required-wheel-tag aarch64 \ + --required-wheel-tag macosx \ + --required-wheel-tag win_amd64 - name: Upload the canonical ordvec-manifest Python dist uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: @@ -1473,7 +1485,13 @@ jobs: python3 tests/release_pypi_canonical_dist.py verify \ --project ordvec-manifest \ --version "$VERSION" \ - --dist-dir dist + --dist-dir dist \ + --expected-wheels 4 \ + --expected-sdists 1 \ + --required-wheel-tag x86_64 \ + --required-wheel-tag aarch64 \ + --required-wheel-tag macosx \ + --required-wheel-tag win_amd64 publish-pypi: name: publish to PyPI @@ -1515,7 +1533,13 @@ jobs: set -euo pipefail python3 tests/release_pypi_canonical_dist.py verify \ --version "$VERSION" \ - --dist-dir dist + --dist-dir dist \ + --expected-wheels 4 \ + --expected-sdists 1 \ + --required-wheel-tag x86_64 \ + --required-wheel-tag aarch64 \ + --required-wheel-tag macosx \ + --required-wheel-tag win_amd64 publish-github-release: name: un-draft the GitHub Release (only after all registry publishes succeed) diff --git a/tests/release_publish_invariants.py b/tests/release_publish_invariants.py index 33bc4cd..bf98c38 100644 --- a/tests/release_publish_invariants.py +++ 
b/tests/release_publish_invariants.py @@ -28,6 +28,14 @@ SDE_ACTION_PATH = os.environ.get( "SDE_ACTION_PATH", ".github/actions/setup-intel-sde/action.yml" ) +PYPI_CANONICAL_EXPECTED_ARGS = ( + "--expected-wheels 4", + "--expected-sdists 1", + "--required-wheel-tag x86_64", + "--required-wheel-tag aarch64", + "--required-wheel-tag macosx", + "--required-wheel-tag win_amd64", +) def fail(message: str) -> None: @@ -1089,6 +1097,9 @@ def check_pypi_canonical_dist( fail(f"{path}: {job_name} canonicalize step must read built-dist and write canonical-dist") if project is not None and f"--project {project}" not in run: fail(f"{path}: {job_name} canonicalize step must pass --project {project}") + for required_arg in PYPI_CANONICAL_EXPECTED_ARGS: + if required_arg not in run: + fail(f"{path}: {job_name} canonicalize step must pass {required_arg}") if len(wheels_downloads) != 1: fail(f"{path}: {job_name} must download exactly one {wheel_artifact_pattern} artifact set") @@ -1151,6 +1162,9 @@ def check_publish_pypi( fail(f"{path}: {job_name} PyPI verify step must verify dist") if project is not None and f"--project {project}" not in run: fail(f"{path}: {job_name} PyPI verify step must pass --project {project}") + for required_arg in PYPI_CANONICAL_EXPECTED_ARGS: + if required_arg not in run: + fail(f"{path}: {job_name} PyPI verify step must pass {required_arg}") if len(publish_steps) != 1: fail(f"{path}: {job_name} must have exactly one pypa/gh-action-pypi-publish step") diff --git a/tests/release_pypi_canonical_dist.py b/tests/release_pypi_canonical_dist.py index b0be14f..86de8e8 100644 --- a/tests/release_pypi_canonical_dist.py +++ b/tests/release_pypi_canonical_dist.py @@ -65,6 +65,29 @@ def dist_files(directory: Path) -> dict[str, Path]: return files +def validate_expected_dist( + files: dict[str, Any], + *, + expected_wheels: int | None = None, + expected_sdists: int | None = None, + required_wheel_tags: tuple[str, ...] 
= (), +) -> None: + wheels = sorted(name for name in files if name.endswith(".whl")) + sdists = sorted(name for name in files if name.endswith(".tar.gz")) + if expected_wheels is not None and len(wheels) != expected_wheels: + fail(f"expected {expected_wheels} wheel files, found {len(wheels)}: {wheels!r}") + if expected_sdists is not None and len(sdists) != expected_sdists: + fail(f"expected {expected_sdists} sdist files, found {len(sdists)}: {sdists!r}") + missing_tags = [ + tag for tag in required_wheel_tags if not any(tag in wheel for wheel in wheels) + ] + if missing_tags: + fail( + "wheel dist is missing required platform tag substrings: " + f"missing={missing_tags!r} wheels={wheels!r}" + ) + + def fetch_pypi_payload(project: str, version: str) -> dict[str, Any] | None: url = f"https://pypi.org/pypi/{project}/{version}/json" try: @@ -130,8 +153,23 @@ def ensure_same_filenames(local: dict[str, Path], remote: dict[str, dict[str, st ) -def canonicalize(project: str, version: str, built_dir: Path, out_dir: Path) -> None: +def canonicalize( + project: str, + version: str, + built_dir: Path, + out_dir: Path, + *, + expected_wheels: int | None = None, + expected_sdists: int | None = None, + required_wheel_tags: tuple[str, ...] 
= (), +) -> None: built = dist_files(built_dir) + validate_expected_dist( + built, + expected_wheels=expected_wheels, + expected_sdists=expected_sdists, + required_wheel_tags=required_wheel_tags, + ) prepare_empty_dir(out_dir) try: payload = fetch_pypi_payload(project, version) @@ -151,6 +189,12 @@ def canonicalize(project: str, version: str, built_dir: Path, out_dir: Path) -> except PyPIReadError as exc: fail(str(exc)) ensure_same_filenames(built, remote) + validate_expected_dist( + remote, + expected_wheels=expected_wheels, + expected_sdists=expected_sdists, + required_wheel_tags=required_wheel_tags, + ) mismatched: list[str] = [] for filename, path in built.items(): @@ -180,12 +224,40 @@ def remote_hashes(project: str, version: str) -> dict[str, str] | None: return {name: item["sha256"] for name, item in pypi_dist_map(payload).items()} -def local_hashes(dist_dir: Path) -> dict[str, str]: - return {name: sha256_file(path) for name, path in dist_files(dist_dir).items()} - - -def verify(project: str, version: str, dist_dir: Path, attempts: int, sleep_seconds: float) -> None: - local = local_hashes(dist_dir) +def local_hashes( + dist_dir: Path, + *, + expected_wheels: int | None = None, + expected_sdists: int | None = None, + required_wheel_tags: tuple[str, ...] = (), +) -> dict[str, str]: + files = dist_files(dist_dir) + validate_expected_dist( + files, + expected_wheels=expected_wheels, + expected_sdists=expected_sdists, + required_wheel_tags=required_wheel_tags, + ) + return {name: sha256_file(path) for name, path in files.items()} + + +def verify( + project: str, + version: str, + dist_dir: Path, + attempts: int, + sleep_seconds: float, + *, + expected_wheels: int | None = None, + expected_sdists: int | None = None, + required_wheel_tags: tuple[str, ...] 
= (), +) -> None: + local = local_hashes( + dist_dir, + expected_wheels=expected_wheels, + expected_sdists=expected_sdists, + required_wheel_tags=required_wheel_tags, + ) url = f"https://pypi.org/pypi/{project}/{version}/json" last_error = "not checked" for attempt in range(1, attempts + 1): @@ -212,6 +284,14 @@ def parse_args() -> argparse.Namespace: canonical.add_argument("--version", required=True) canonical.add_argument("--built-dir", required=True, type=Path) canonical.add_argument("--out-dir", required=True, type=Path) + canonical.add_argument("--expected-wheels", type=int) + canonical.add_argument("--expected-sdists", type=int) + canonical.add_argument( + "--required-wheel-tag", + action="append", + default=[], + help="Require at least one wheel filename containing this substring; may be repeated.", + ) verify_parser = subparsers.add_parser("verify") verify_parser.add_argument("--project", default=DEFAULT_PROJECT) @@ -219,6 +299,14 @@ def parse_args() -> argparse.Namespace: verify_parser.add_argument("--dist-dir", required=True, type=Path) verify_parser.add_argument("--attempts", default=24, type=int) verify_parser.add_argument("--sleep-seconds", default=5.0, type=float) + verify_parser.add_argument("--expected-wheels", type=int) + verify_parser.add_argument("--expected-sdists", type=int) + verify_parser.add_argument( + "--required-wheel-tag", + action="append", + default=[], + help="Require at least one wheel filename containing this substring; may be repeated.", + ) return parser.parse_args() @@ -226,10 +314,27 @@ def parse_args() -> argparse.Namespace: def main() -> None: args = parse_args() if args.command == "canonicalize": - canonicalize(args.project, args.version, args.built_dir, args.out_dir) + canonicalize( + args.project, + args.version, + args.built_dir, + args.out_dir, + expected_wheels=args.expected_wheels, + expected_sdists=args.expected_sdists, + required_wheel_tags=tuple(args.required_wheel_tag), + ) return if args.command == "verify": - 
verify(args.project, args.version, args.dist_dir, args.attempts, args.sleep_seconds) + verify( + args.project, + args.version, + args.dist_dir, + args.attempts, + args.sleep_seconds, + expected_wheels=args.expected_wheels, + expected_sdists=args.expected_sdists, + required_wheel_tags=tuple(args.required_wheel_tag), + ) return raise AssertionError(f"unknown command: {args.command}") diff --git a/tests/release_pypi_canonical_dist_tests.py b/tests/release_pypi_canonical_dist_tests.py index 0022cb1..63c76b3 100644 --- a/tests/release_pypi_canonical_dist_tests.py +++ b/tests/release_pypi_canonical_dist_tests.py @@ -24,6 +24,17 @@ def write(path: Path, data: bytes) -> str: return hashlib.sha256(data).hexdigest() +def write_complete_release_dist(directory: Path, project: str = "ordvec") -> dict[str, str]: + files = { + f"{project}-0.3.0.tar.gz": b"sdist", + f"{project}-0.3.0-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl": b"linux x86_64", + f"{project}-0.3.0-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl": b"linux aarch64", + f"{project}-0.3.0-cp310-abi3-macosx_11_0_arm64.whl": b"macos arm64", + f"{project}-0.3.0-cp310-abi3-win_amd64.whl": b"windows amd64", + } + return {name: write(directory / name, data) for name, data in files.items()} + + class CanonicalPyPIDistTests(unittest.TestCase): def test_missing_pypi_release_uses_current_build(self) -> None: with tempfile.TemporaryDirectory() as tmp: @@ -45,6 +56,86 @@ def test_missing_pypi_release_uses_current_build(self) -> None: self.assertEqual((out / "ordvec-0.3.0.tar.gz").read_bytes(), b"fresh sdist") self.assertEqual((out / "ordvec-0.3.0-cp310-abi3-win_amd64.whl").read_bytes(), b"fresh wheel") + def test_missing_pypi_release_accepts_complete_expected_release_dist(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + built = root / "built" + out = root / "out" + built.mkdir() + write_complete_release_dist(built) + + old_fetch = canonical.fetch_pypi_payload + 
canonical.fetch_pypi_payload = lambda project, version: None + try: + with redirect_stdout(io.StringIO()): + canonical.canonicalize( + "ordvec", + "0.3.0", + built, + out, + expected_wheels=4, + expected_sdists=1, + required_wheel_tags=("x86_64", "aarch64", "macosx", "win_amd64"), + ) + finally: + canonical.fetch_pypi_payload = old_fetch + + self.assertEqual(len(list(out.glob("*.whl"))), 4) + self.assertEqual(len(list(out.glob("*.tar.gz"))), 1) + + def test_canonicalize_rejects_incomplete_expected_wheel_set(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + built = root / "built" + out = root / "out" + built.mkdir() + write(built / "ordvec-0.3.0.tar.gz", b"fresh sdist") + write(built / "ordvec-0.3.0-cp310-abi3-win_amd64.whl", b"fresh wheel") + + old_fetch = canonical.fetch_pypi_payload + canonical.fetch_pypi_payload = lambda project, version: self.fail("unexpected PyPI fetch") + try: + with redirect_stderr(io.StringIO()), self.assertRaises(SystemExit): + canonical.canonicalize( + "ordvec", + "0.3.0", + built, + out, + expected_wheels=4, + expected_sdists=1, + required_wheel_tags=("x86_64", "aarch64", "macosx", "win_amd64"), + ) + finally: + canonical.fetch_pypi_payload = old_fetch + + def test_canonicalize_rejects_missing_required_platform_tag(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + built = root / "built" + out = root / "out" + built.mkdir() + write(built / "ordvec-0.3.0.tar.gz", b"fresh sdist") + write(built / "ordvec-0.3.0-cp310-abi3-manylinux_2_17_x86_64.whl", b"linux x86_64") + write(built / "ordvec-0.3.0-cp310-abi3-manylinux_2_17_aarch64.whl", b"linux aarch64") + write(built / "ordvec-0.3.0-cp310-abi3-macosx_11_0_arm64.whl", b"macos arm64") + write(built / "ordvec-0.3.0-cp310-abi3-macosx_12_0_universal2.whl", b"extra macos") + + old_fetch = canonical.fetch_pypi_payload + canonical.fetch_pypi_payload = lambda project, version: self.fail("unexpected PyPI fetch") + try: + with 
redirect_stderr(io.StringIO()), self.assertRaises(SystemExit): + canonical.canonicalize( + "ordvec", + "0.3.0", + built, + out, + expected_wheels=4, + expected_sdists=1, + required_wheel_tags=("x86_64", "aarch64", "macosx", "win_amd64"), + ) + finally: + canonical.fetch_pypi_payload = old_fetch + def test_existing_pypi_release_uses_verified_remote_bytes(self) -> None: with tempfile.TemporaryDirectory() as tmp: root = Path(tmp) @@ -175,6 +266,29 @@ def test_verify_retries_after_empty_pypi_dist_payload(self) -> None: self.assertEqual(sleeps, [0.5]) + def test_verify_rejects_incomplete_local_dist_before_remote_check(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + dist = Path(tmp) + write(dist / "ordvec-0.3.0.tar.gz", b"canonical sdist") + write(dist / "ordvec-0.3.0-cp310-abi3-win_amd64.whl", b"canonical wheel") + + old_fetch = canonical.fetch_pypi_payload + canonical.fetch_pypi_payload = lambda project, version: self.fail("unexpected PyPI fetch") + try: + with redirect_stderr(io.StringIO()), self.assertRaises(SystemExit): + canonical.verify( + "ordvec", + "0.3.0", + dist, + attempts=1, + sleep_seconds=0.0, + expected_wheels=4, + expected_sdists=1, + required_wheel_tags=("x86_64", "aarch64", "macosx", "win_amd64"), + ) + finally: + canonical.fetch_pypi_payload = old_fetch + def test_canonicalize_reports_pypi_read_error(self) -> None: with tempfile.TemporaryDirectory() as tmp: root = Path(tmp) From 41d8cc2839b42860aa8b7209aeb596bfe0e60aa6 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Tue, 9 Jun 2026 16:13:21 -0500 Subject: [PATCH 3/4] Address PR review and fuzz smoke findings Signed-off-by: Nelson Spence --- .../signbitmap_rankquant_twostage.rs | 20 ++-- ordvec-manifest/src/lib.rs | 2 +- ordvec-manifest/src/sqlite.rs | 15 +-- ordvec-manifest/tests/manifest.rs | 97 +++++++++++++++++++ 4 files changed, 119 insertions(+), 15 deletions(-) diff --git a/fuzz/fuzz_targets/signbitmap_rankquant_twostage.rs b/fuzz/fuzz_targets/signbitmap_rankquant_twostage.rs 
index 1e35582..22bd60e 100644 --- a/fuzz/fuzz_targets/signbitmap_rankquant_twostage.rs +++ b/fuzz/fuzz_targets/signbitmap_rankquant_twostage.rs @@ -6,14 +6,15 @@ //! //! The fuzzer builds both indexes over one generated finite corpus, derives a //! bounded structured shape for `(dim, bits, n_vectors, m, k)`, and feeds -//! duplicate candidate IDs into the subset path. When sign candidate generation -//! returns the full corpus (`m >= n`), the target also checks that subset -//! reranking agrees with a full RankQuant search. +//! duplicate candidate IDs into the subset path while preserving the public +//! subset API's corpus-sized candidate-budget contract. When sign candidate +//! generation returns the full corpus (`m >= n`), the target also checks that +//! subset reranking agrees with a full RankQuant search. //! -//! Contract: no panic, abort, or out-of-bounds access on any in-range candidate -//! input, subset reranking must preserve score-descending/doc-ID-ascending -//! ordering, and full-corpus candidate reranking must match full RankQuant -//! search. +//! Contract: no panic, abort, or out-of-bounds access on any bounded in-range +//! candidate input, subset reranking must preserve score-descending/doc-ID- +//! ascending ordering, and full-corpus candidate reranking must match full +//! RankQuant search. 
#![no_main] use libfuzzer_sys::{ @@ -116,12 +117,15 @@ fuzz_target!(|input: TwoStageInput| { 0 => subset_candidates.clear(), 1 => { let id = subset_candidates.first().copied().unwrap_or(0); - subset_candidates.push(id); + if subset_candidates.len() < input.n_vectors { + subset_candidates.push(id); + } } 2 if subset_candidates.is_empty() => subset_candidates.push(0), _ => {} } } + assert!(subset_candidates.len() <= input.n_vectors); let (scores, ids) = rankquant.search_asymmetric_subset(query, &subset_candidates, input.k); diff --git a/ordvec-manifest/src/lib.rs b/ordvec-manifest/src/lib.rs index 901deed..acdfc5a 100644 --- a/ordvec-manifest/src/lib.rs +++ b/ordvec-manifest/src/lib.rs @@ -3823,7 +3823,7 @@ fn validate_row_id_string( errors, limits, format!("row_identity_{field}_invalid_uuid"), - format!("line {line_idx} {field} must be a UUID because row_identity.id_kind is uuid"), + format!("line {line_idx} {field} must be a UUID in v1"), ); } } diff --git a/ordvec-manifest/src/sqlite.rs b/ordvec-manifest/src/sqlite.rs index a84f156..bd6694c 100644 --- a/ordvec-manifest/src/sqlite.rs +++ b/ordvec-manifest/src/sqlite.rs @@ -701,21 +701,18 @@ fn verification_reports_needs_migration(conn: &Connection) -> Result Result bool { + required + .iter() + .all(|required| columns.iter().any(|column| column == required)) +} + fn sqlite_err(err: rusqlite::Error) -> ManifestError { ManifestError::invalid(format!("sqlite error: {err}")) } diff --git a/ordvec-manifest/tests/manifest.rs b/ordvec-manifest/tests/manifest.rs index 1762e47..7c9ff43 100644 --- a/ordvec-manifest/tests/manifest.rs +++ b/ordvec-manifest/tests/manifest.rs @@ -2424,6 +2424,43 @@ fn jsonl_row_identity_rejects_non_uuid_ids() { assert!(codes.contains(&"row_identity_parent_id_invalid_uuid")); } +#[test] +fn jsonl_row_identity_uuid_error_message_is_v1_scoped() { + let temp = tempfile::tempdir().unwrap(); + let index = write_rankquant_index(temp.path(), 1); + let rows = temp.path().join("rows.jsonl"); + 
write_row_map(&rows, &[("doc-a", None)]); + let row_hash = sha256_file(&rows).unwrap(); + let manifest_path = temp.path().join("manifest.json"); + let mut manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + manifest.row_identity = RowIdentity::Jsonl { + path: "rows.jsonl".to_string(), + sha256: row_hash.sha256, + row_count: 1, + id_kind: "u64".to_string(), + db: None, + }; + + let report = verify_manifest_with_base(manifest, temp.path(), VerifyOptions::default()); + let codes = error_codes(&report); + assert!(codes.contains(&"row_identity_id_kind_unsupported")); + let issue = report + .errors + .iter() + .find(|issue| issue.code == "row_identity_db_id_invalid_uuid") + .expect("non-UUID db_id should still report v1 UUID validation"); + assert!(issue.message.contains("must be a UUID in v1")); + assert!(!issue + .message + .contains("because row_identity.id_kind is uuid")); +} + #[test] fn jsonl_row_identity_rejects_reserved_db_metadata() { let temp = tempfile::tempdir().unwrap(); @@ -3073,6 +3110,66 @@ fn sqlite_refuses_to_migrate_unknown_verification_reports_table() { assert_eq!(columns, vec!["id"]); } +#[cfg(feature = "sqlite")] +#[test] +fn sqlite_migrates_legacy_verification_reports_by_required_column_names() { + use rusqlite::Connection; + + let temp = tempfile::tempdir().unwrap(); + let index = write_index(temp.path()); + let manifest_path = temp.path().join("manifest.json"); + let manifest = create_manifest_for_index( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + ) + .unwrap(); + fs::write( + &manifest_path, + serde_json::to_string_pretty(&manifest).unwrap(), + ) + .unwrap(); + let document = load_manifest_file(&manifest_path).unwrap(); + let db = temp.path().join("legacy.sqlite"); + let conn = Connection::open(&db).unwrap(); + conn.execute( + "CREATE TABLE verification_reports( + report_json TEXT, + checked_at TEXT, + extra TEXT, + 
ok INTEGER, + manifest_path TEXT, + manifest_id TEXT + )", + [], + ) + .unwrap(); + drop(conn); + + let report = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest_path, + VerifyOptions::default(), + true, + ) + .unwrap(); + assert!(report.ok, "{:?}", report.errors); + + let conn = Connection::open(&db).unwrap(); + let columns = conn + .prepare("PRAGMA table_info(verification_reports)") + .unwrap() + .query_map([], |row| row.get::<_, String>(1)) + .unwrap() + .collect::<Result<Vec<_>, _>>() + .unwrap(); + assert!(columns.contains(&"report_id".to_string())); + assert!(columns.contains(&"manifest_sha256".to_string())); + assert!(!columns.contains(&"extra".to_string())); +} + #[cfg(feature = "sqlite")] #[test] fn sqlite_cache_is_explicit_and_activation_reverifies_by_default() { From 2d15584c77e1ed4b65a44fc406a57a23f5fc5701 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Tue, 9 Jun 2026 16:35:31 -0500 Subject: [PATCH 4/4] Fail closed on crates.io recovery errors Signed-off-by: Nelson Spence --- .github/workflows/release.yml | 120 ++++++++++++++++++++++++++-- tests/release_publish_invariants.py | 54 +++++++++---- 2 files changed, 154 insertions(+), 20 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 73a4bca..24af2ba 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1089,8 +1089,62 @@ jobs: STATIC_URL="https://static.crates.io/crates/ordvec/ordvec-${VERSION}.crate" CRATES_IO_USER_AGENT="ordvec-release-verify/${VERSION} (https://github.com/Fieldnote-Echo/ordvec)" EXISTING="${RUNNER_TEMP}/existing-ordvec.crate" - if curl -fsSL --user-agent "$CRATES_IO_USER_AGENT" "$API_URL" -o "$EXISTING" \ - || curl -fsSL --user-agent "$CRATES_IO_USER_AGENT" "$STATIC_URL" -o "$EXISTING"; then + API_STATUS_FILE="${RUNNER_TEMP}/existing-ordvec-api-status.txt" + STATIC_STATUS_FILE="${RUNNER_TEMP}/existing-ordvec-static-status.txt" + already_present=false + + rm -f "$EXISTING" "$API_STATUS_FILE" 
"$STATIC_STATUS_FILE" + API_CURL_EXIT=0 + curl -sSL --retry 3 --retry-delay 2 --retry-all-errors --connect-timeout 10 --max-time 60 \ + --user-agent "$CRATES_IO_USER_AGENT" \ + --write-out "%{http_code}" \ + --output "$EXISTING" \ + "$API_URL" > "$API_STATUS_FILE" || API_CURL_EXIT=$? + API_STATUS="$(cat "$API_STATUS_FILE")" + if [ "$API_CURL_EXIT" -ne 0 ]; then + echo "::error::could not determine crates.io status while checking ordvec ${VERSION} at $API_URL (curl exit ${API_CURL_EXIT}). Refusing recovery." + exit 1 + fi + case "$API_STATUS" in + 200) + already_present=true + ;; + 404) + rm -f "$EXISTING" + ;; + *) + echo "::error::unexpected crates.io status ${API_STATUS} while checking ordvec ${VERSION} at $API_URL. Refusing recovery." + exit 1 + ;; + esac + + if [ "$already_present" != true ]; then + STATIC_CURL_EXIT=0 + curl -sSL --retry 3 --retry-delay 2 --retry-all-errors --connect-timeout 10 --max-time 60 \ + --user-agent "$CRATES_IO_USER_AGENT" \ + --write-out "%{http_code}" \ + --output "$EXISTING" \ + "$STATIC_URL" > "$STATIC_STATUS_FILE" || STATIC_CURL_EXIT=$? + STATIC_STATUS="$(cat "$STATIC_STATUS_FILE")" + if [ "$STATIC_CURL_EXIT" -ne 0 ]; then + echo "::error::could not determine crates.io status while checking ordvec ${VERSION} at $STATIC_URL (curl exit ${STATIC_CURL_EXIT}). Refusing recovery." + exit 1 + fi + case "$STATIC_STATUS" in + 200) + already_present=true + ;; + 404) + rm -f "$EXISTING" + ;; + *) + echo "::error::unexpected crates.io status ${STATIC_STATUS} while checking ordvec ${VERSION} at $STATIC_URL. Refusing recovery." + exit 1 + ;; + esac + fi + + if [ "$already_present" = true ]; then E_SHA=$(sha256sum "$EXISTING" | cut -d' ' -f1) echo "attested: $A_SHA" echo "crates.io-served: $E_SHA" @@ -1102,7 +1156,7 @@ jobs: echo "::notice::crates.io already serves byte-identical ordvec ${VERSION}; skipping upload and verifying served bytes." 
else echo "already_published=false" >> "$GITHUB_OUTPUT" - echo "No existing ordvec ${VERSION} .crate found on crates.io; proceeding with publish." + echo "Both crates.io recovery endpoints returned 404 for ordvec ${VERSION}; proceeding with publish." fi # Mint the short-lived crates.io credential immediately before publish so # the ephemeral token's exposure window is minimal. No stored secret. @@ -1386,8 +1440,62 @@ jobs: STATIC_URL="https://static.crates.io/crates/ordvec-manifest/ordvec-manifest-${VERSION}.crate" CRATES_IO_USER_AGENT="ordvec-release-verify/${VERSION} (https://github.com/Fieldnote-Echo/ordvec)" EXISTING="${RUNNER_TEMP}/existing-ordvec-manifest.crate" - if curl -fsSL --user-agent "$CRATES_IO_USER_AGENT" "$API_URL" -o "$EXISTING" \ - || curl -fsSL --user-agent "$CRATES_IO_USER_AGENT" "$STATIC_URL" -o "$EXISTING"; then + API_STATUS_FILE="${RUNNER_TEMP}/existing-ordvec-manifest-api-status.txt" + STATIC_STATUS_FILE="${RUNNER_TEMP}/existing-ordvec-manifest-static-status.txt" + already_present=false + + rm -f "$EXISTING" "$API_STATUS_FILE" "$STATIC_STATUS_FILE" + API_CURL_EXIT=0 + curl -sSL --retry 3 --retry-delay 2 --retry-all-errors --connect-timeout 10 --max-time 60 \ + --user-agent "$CRATES_IO_USER_AGENT" \ + --write-out "%{http_code}" \ + --output "$EXISTING" \ + "$API_URL" > "$API_STATUS_FILE" || API_CURL_EXIT=$? + API_STATUS="$(cat "$API_STATUS_FILE")" + if [ "$API_CURL_EXIT" -ne 0 ]; then + echo "::error::could not determine crates.io status while checking ordvec-manifest ${VERSION} at $API_URL (curl exit ${API_CURL_EXIT}). Refusing recovery." + exit 1 + fi + case "$API_STATUS" in + 200) + already_present=true + ;; + 404) + rm -f "$EXISTING" + ;; + *) + echo "::error::unexpected crates.io status ${API_STATUS} while checking ordvec-manifest ${VERSION} at $API_URL. Refusing recovery." 
+ exit 1 + ;; + esac + + if [ "$already_present" != true ]; then + STATIC_CURL_EXIT=0 + curl -sSL --retry 3 --retry-delay 2 --retry-all-errors --connect-timeout 10 --max-time 60 \ + --user-agent "$CRATES_IO_USER_AGENT" \ + --write-out "%{http_code}" \ + --output "$EXISTING" \ + "$STATIC_URL" > "$STATIC_STATUS_FILE" || STATIC_CURL_EXIT=$? + STATIC_STATUS="$(cat "$STATIC_STATUS_FILE")" + if [ "$STATIC_CURL_EXIT" -ne 0 ]; then + echo "::error::could not determine crates.io status while checking ordvec-manifest ${VERSION} at $STATIC_URL (curl exit ${STATIC_CURL_EXIT}). Refusing recovery." + exit 1 + fi + case "$STATIC_STATUS" in + 200) + already_present=true + ;; + 404) + rm -f "$EXISTING" + ;; + *) + echo "::error::unexpected crates.io status ${STATIC_STATUS} while checking ordvec-manifest ${VERSION} at $STATIC_URL. Refusing recovery." + exit 1 + ;; + esac + fi + + if [ "$already_present" = true ]; then E_SHA=$(sha256sum "$EXISTING" | cut -d' ' -f1) echo "attested: $A_SHA" echo "crates.io-served: $E_SHA" @@ -1399,7 +1507,7 @@ jobs: echo "::notice::crates.io already serves byte-identical ordvec-manifest ${VERSION}; skipping upload and verifying served bytes." else echo "already_published=false" >> "$GITHUB_OUTPUT" - echo "No existing ordvec-manifest ${VERSION} .crate found on crates.io; proceeding with publish." + echo "Both crates.io recovery endpoints returned 404 for ordvec-manifest ${VERSION}; proceeding with publish." 
fi - name: Validate manifest publish dry-run if: steps.manifest_crate_recovery.outputs.already_published != 'true' diff --git a/tests/release_publish_invariants.py b/tests/release_publish_invariants.py index bf98c38..4ba80e2 100644 --- a/tests/release_publish_invariants.py +++ b/tests/release_publish_invariants.py @@ -896,6 +896,38 @@ def readback_curl_uses(words: list[str], url_var: str) -> bool: ) +def recovery_curl_uses(words: list[str], url_var: str) -> bool: + return ( + has_shell_arg(words, shell_vars(url_var)) + and has_shell_option_value(words, {"--user-agent", "-A"}, shell_vars("CRATES_IO_USER_AGENT")) + and has_shell_option_value(words, {"--output", "-o"}, shell_vars("EXISTING")) + and has_shell_option_value(words, {"--write-out", "-w"}, {"%{http_code}"}) + and "--retry" in words + and "--retry-all-errors" in words + ) + + +def check_crate_recovery_status_handling( + recovery_run: str, path: str, job_name: str, package: str +) -> None: + required_fragments = ( + "API_CURL_EXIT=0", + 'if [ "$API_CURL_EXIT" -ne 0 ]; then', + "STATIC_CURL_EXIT=0", + 'if [ "$STATIC_CURL_EXIT" -ne 0 ]; then', + 'case "$API_STATUS" in', + 'case "$STATIC_STATUS" in', + "200)", + "404)", + "could not determine crates.io status", + "unexpected crates.io status", + f"Both crates.io recovery endpoints returned 404 for {package}", + ) + for fragment in required_fragments: + if fragment not in recovery_run: + fail(f"{path}: {job_name} recovery step must contain {fragment!r}") + + def check_hash_requirement_temp_paths(paths: list[str]) -> None: for path in paths: workflow_text = read_text(path) @@ -1295,18 +1327,14 @@ def check_publish_crate_job( ): if required not in recovery_run: fail(f"{path}: {job_name} recovery step must contain {required!r}") + check_crate_recovery_status_handling(recovery_run, path, job_name, package) for url_var in ("API_URL", "STATIC_URL"): if not any( - has_shell_arg(words, shell_vars(url_var)) - and has_shell_option_value( - words, {"--user-agent", "-A"}, 
shell_vars("CRATES_IO_USER_AGENT") - ) - and has_shell_option_value(words, {"--output", "-o"}, shell_vars("EXISTING")) - for words in shell_curl_commands(recovery_run) + recovery_curl_uses(words, url_var) for words in shell_curl_commands(recovery_run) ): fail( f"{path}: {job_name} recovery step must curl ${url_var} " - "with CRATES_IO_USER_AGENT into $EXISTING" + "with CRATES_IO_USER_AGENT into $EXISTING, capture HTTP status, and retry" ) protected_step_names = { @@ -1424,18 +1452,16 @@ def check_publish_crates(workflow: dict[str, Any], path: str) -> None: ): if required not in recovery_run: fail(f"{path}: manifest crate recovery step must contain {required!r}") + check_crate_recovery_status_handling( + recovery_run, path, "publish-manifest-crate", "ordvec-manifest" + ) for url_var in ("API_URL", "STATIC_URL"): if not any( - has_shell_arg(words, shell_vars(url_var)) - and has_shell_option_value( - words, {"--user-agent", "-A"}, shell_vars("CRATES_IO_USER_AGENT") - ) - and has_shell_option_value(words, {"--output", "-o"}, shell_vars("EXISTING")) - for words in shell_curl_commands(recovery_run) + recovery_curl_uses(words, url_var) for words in shell_curl_commands(recovery_run) ): fail( f"{path}: manifest crate recovery step must curl ${url_var} " - "with CRATES_IO_USER_AGENT into $EXISTING" + "with CRATES_IO_USER_AGENT into $EXISTING, capture HTTP status, and retry" ) for index, raw_step in enumerate(manifest_steps): step = mapping(raw_step, f"{path}: jobs.publish-manifest-crate.steps[{index}]")