From 6516d6f814cce7d406b05b49522183f4f7916b63 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Tue, 9 Jun 2026 13:16:02 -0500 Subject: [PATCH 1/4] Add v0.4 manifest sidecars and probe helpers Signed-off-by: Nelson Spence --- .github/workflows/release.yml | 31 +++ docs/INDEX_PROVENANCE.md | 13 +- ordvec-manifest/Cargo.toml | 4 +- ordvec-manifest/README.md | 26 ++- ordvec-manifest/src/lib.rs | 131 +++++++++++- ordvec-manifest/src/main.rs | 73 ++++++- ordvec-manifest/tests/manifest.rs | 300 ++++++++++++++++++++++++---- src/bitmap.rs | 47 ++++- src/lib.rs | 78 +++++++- src/quant.rs | 138 ++++++++++++- src/sign_bitmap.rs | 41 ++++ tests/index/bitmap.rs | 64 ++++++ tests/index/two_stage.rs | 166 ++++++++++++++- tests/release_publish_invariants.py | 54 +++++ 14 files changed, 1115 insertions(+), 51 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0e6599f..4d3c346 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1038,12 +1038,43 @@ jobs: exit 1 fi echo "OK: byte-identity verified ($A_SHA)" + - name: Check for existing ordvec-manifest .crate recovery + id: manifest_crate_recovery + env: + VERSION: ${{ needs.guard.outputs.version }} + run: | + set -euo pipefail + ATTESTED="${RUNNER_TEMP}/attested/ordvec-manifest-${VERSION}.crate" + [ -f "$ATTESTED" ] || { echo "::error::attested .crate missing at $ATTESTED"; exit 1; } + A_SHA=$(sha256sum "$ATTESTED" | cut -d' ' -f1) + API_URL="https://crates.io/api/v1/crates/ordvec-manifest/${VERSION}/download" + STATIC_URL="https://static.crates.io/crates/ordvec-manifest/ordvec-manifest-${VERSION}.crate" + CRATES_IO_USER_AGENT="ordvec-release-verify/${VERSION} (https://github.com/Fieldnote-Echo/ordvec)" + EXISTING="${RUNNER_TEMP}/existing-ordvec-manifest.crate" + if curl -fsSL --user-agent "$CRATES_IO_USER_AGENT" "$API_URL" -o "$EXISTING" \ + || curl -fsSL --user-agent "$CRATES_IO_USER_AGENT" "$STATIC_URL" -o "$EXISTING"; then + E_SHA=$(sha256sum "$EXISTING" | cut -d' ' -f1) + echo "attested: $A_SHA" + echo "crates.io-served: $E_SHA" + if [ "$A_SHA" != "$E_SHA" ]; then + echo "::error::crates.io already serves ordvec-manifest ${VERSION}, but the served .crate is not byte-identical to the SLSA-attested artifact ($E_SHA != $A_SHA). Refusing recovery." + exit 1 + fi + echo "already_published=true" >> "$GITHUB_OUTPUT" + echo "::notice::crates.io already serves byte-identical ordvec-manifest ${VERSION}; skipping upload and verifying served bytes." + else + echo "already_published=false" >> "$GITHUB_OUTPUT" + echo "No existing ordvec-manifest ${VERSION} .crate found on crates.io; proceeding with publish." + fi - name: Validate manifest publish dry-run + if: steps.manifest_crate_recovery.outputs.already_published != 'true' run: cargo publish -p ordvec-manifest --dry-run --locked - name: Mint a short-lived crates.io credential (OIDC) + if: steps.manifest_crate_recovery.outputs.already_published != 'true' id: auth uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4 - name: cargo publish + if: steps.manifest_crate_recovery.outputs.already_published != 'true' env: CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }} run: cargo publish -p ordvec-manifest --locked diff --git a/docs/INDEX_PROVENANCE.md b/docs/INDEX_PROVENANCE.md index 3725fee..aee9bbe 100644 --- a/docs/INDEX_PROVENANCE.md +++ b/docs/INDEX_PROVENANCE.md @@ -47,17 +47,20 @@ loading. The lockstep `ordvec-manifest` crate provides a sidecar verifier for that pre-load step: ```sh -cargo run -p ordvec-manifest -- verify --manifest path/to/index.manifest.json +cargo run -p ordvec-manifest --features cli -- verify --manifest path/to/index.manifest.json ``` The `create` command emits default-verifiable manifests by default: artifact and row-identity paths must resolve under the output manifest directory. If a deployment intentionally keeps those files outside that directory, create with `--allow-path-escape` and verify with the matching path-policy flag. +`create` can also bind caller-owned sidecars with `--aux NAME=PATH` for +required artifacts and `--optional-aux NAME=PATH` for optional artifacts. Rust callers can use `verify_for_load(manifest_path, VerifyOptions)` to get a `VerifiedLoadPlan` containing the canonical artifact path, probed metadata, row-identity summary, auxiliary artifact states, and the full verification -report. Callers that already hold a `ManifestDocument` can use +report, then call `require_auxiliary(name)` for sidecars that must be present +before loading. Callers that already hold a `ManifestDocument` can use `verify_document_for_load(&document, VerifyOptions)` without re-reading the manifest file. The plan helpers do not call an ordvec loader, pin file descriptors, or make mutable shared storage immutable; callers still own the @@ -101,6 +104,12 @@ present or absent, and whether any declared member failed path, size, or digest checks or exceeded the configured auxiliary artifact byte limit. Callers should load sidecars only after the relevant declaration is verified. +OrdinalDB v0.1 should use `row_id_identity` for the ordvec vector row count and +declare `ids.bin` as required auxiliary artifact name `ordinaldb.ids`. The +OrdinalDB `u64` IDs remain caller-owned sidecar bytes. Do not model `ids.bin` +as JSONL row identity: v1 JSONL row identity is UUID-only, and generic row-map +ID formats are deferred until there is a separate schema contract for them. + When present, `encoder_distortion` records a scoped encoder geometry profile: source metric, embedding metric, lower/upper distortion-style bounds when declared, empirical violation statistics when available, evidence kind, and diff --git a/ordvec-manifest/Cargo.toml b/ordvec-manifest/Cargo.toml index 3fc442e..ccca316 100644 --- a/ordvec-manifest/Cargo.toml +++ b/ordvec-manifest/Cargo.toml @@ -22,10 +22,11 @@ path = "src/lib.rs" [[bin]] name = "ordvec-manifest" path = "src/main.rs" +required-features = ["cli"] [dependencies] chrono = { version = "0.4.44", default-features = false, features = ["clock", "std"] } -clap = { version = "4.6.1", features = ["derive"] } +clap = { version = "4.6.1", features = ["derive"], optional = true } hex = "0.4.3" ordvec = { version = "0.4.0", path = ".." } rusqlite = { version = "0.40.0", optional = true } @@ -39,5 +40,6 @@ tempfile = "3.27.0" [features] default = [] +cli = ["dep:clap"] sqlite = ["dep:rusqlite"] sqlite-bundled = ["sqlite", "rusqlite/bundled"] diff --git a/ordvec-manifest/README.md b/ordvec-manifest/README.md index 3b00f48..d294653 100644 --- a/ordvec-manifest/README.md +++ b/ordvec-manifest/README.md @@ -11,13 +11,16 @@ index files, decide deployment trust policy, estimate encoder geometry, compute calibration statistics, or change the C ABI. `ordvec-manifest` is versioned in lockstep with the core `ordvec` crate. From a -workspace checkout, use it with `cargo run -p ordvec-manifest --`; from a -published release, install it with `cargo install ordvec-manifest`. +workspace checkout, use the optional CLI with +`cargo run -p ordvec-manifest --features cli --`; from a published release, +install the binary with `cargo install ordvec-manifest --features cli`. The +library default feature set is empty and does not depend on `clap`. ```sh ordvec-manifest create \ --index path/to/index.tvrq \ --row-id-is-identity \ + --aux ordinaldb.ids=path/to/ids.bin \ --embedding-model bge-small-en-v1.5 \ --out path/to/index.manifest.json @@ -25,7 +28,7 @@ ordvec-manifest verify --manifest path/to/index.manifest.json ``` From a workspace checkout, prefix the same commands with -`cargo run -p ordvec-manifest --`. +`cargo run -p ordvec-manifest --features cli --`. The schema version is `ordvec.index_manifest.v1`. Relative paths resolve from the manifest file's directory, absolute paths are rejected by default, and @@ -53,6 +56,7 @@ Controlled-storage load pattern: ```rust let plan = ordvec_manifest::verify_for_load(&manifest_path, options)?; +let _ordinaldb_ids = plan.require_auxiliary("ordinaldb.ids")?; let index = ordvec::RankQuant::load(plan.artifact_path())?; ``` @@ -132,6 +136,22 @@ by path policy. Optional members are reported as verified when present or as `optional_absent` with a stable reason code when absent. The verifier checks bytes only; application semantics remain with the caller. +`create` can declare sidecars while it hashes them: +`--aux NAME=PATH` creates a required declaration and +`--optional-aux NAME=PATH` creates an optional declaration. Library callers use +`CreateAuxiliaryArtifact { name, path, required }` through +`CreateManifestOptions::auxiliary_artifacts`. `VerifiedLoadPlan` offers +`auxiliary_by_name(name)` for inspection and `require_auxiliary(name)` for +callers that must fail if a named sidecar is not declared and verified. + +For OrdinalDB v0.1, keep the ordvec row identity as +`RowIdentity::RowIdIdentity { row_count }` and declare the OrdinalDB `ids.bin` +file as required auxiliary artifact name `ordinaldb.ids`. That makes the vector +row count an ordvec invariant while leaving OrdinalDB's `u64` document IDs as a +caller-owned sidecar. Do not encode `ids.bin` as `RowIdentity::Jsonl`: v1 JSONL +row identity is UUID-oriented (`id_kind = "uuid"`), and generic row-map ID +formats are intentionally deferred. + The unified JSON report carries per-sidecar audit fields. A successful auxiliary artifact verification includes the manifest path, resolved/canonical paths, declared digest/length, and observed digest/length: diff --git a/ordvec-manifest/src/lib.rs b/ordvec-manifest/src/lib.rs index 556596c..0f84fbc 100644 --- a/ordvec-manifest/src/lib.rs +++ b/ordvec-manifest/src/lib.rs @@ -510,6 +510,13 @@ fn validate_auxiliary_artifact_shape( "auxiliary_artifact_name_empty", "auxiliary artifact name must be non-empty", ); + } else if artifact.name != name { + report.error( + "auxiliary_artifact_name_not_trimmed", + format!( + "auxiliary artifact name {name:?} must not have leading or trailing whitespace" + ), + ); } else if !names.insert(name.to_string()) { report.error( "auxiliary_artifact_name_duplicate", @@ -2843,6 +2850,27 @@ impl VerifiedLoadPlan { &self.auxiliary_artifacts } + pub fn auxiliary_by_name(&self, name: &str) -> Option<&VerifiedAuxiliaryArtifactPlan> { + self.auxiliary_artifacts + .iter() + .find(|artifact| artifact.name() == name) + } + + pub fn require_auxiliary(&self, name: &str) -> Result<&Path, RequireAuxiliaryError> { + let artifact = self.auxiliary_by_name(name).ok_or_else(|| { + RequireAuxiliaryError::MissingDeclaration { + name: name.to_string(), + } + })?; + artifact + .path() + .ok_or_else(|| RequireAuxiliaryError::NotLoadable { + name: name.to_string(), + state: artifact.state(), + reason_code: artifact.reason_code().map(ToOwned::to_owned), + }) + } + pub fn report(&self) -> &VerificationReport { &self.report } @@ -2852,6 +2880,44 @@ impl VerifiedLoadPlan { } } +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum RequireAuxiliaryError { + MissingDeclaration { + name: String, + }, + NotLoadable { + name: String, + state: AuxiliaryArtifactState, + reason_code: Option, + }, +} + +impl fmt::Display for RequireAuxiliaryError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::MissingDeclaration { name } => { + write!(f, "required auxiliary artifact {name:?} is not declared") + } + Self::NotLoadable { + name, + state, + reason_code, + } => { + write!( + f, + "required auxiliary artifact {name:?} is not loadable: state={state:?}" + )?; + if let Some(reason_code) = reason_code { + write!(f, ", reason_code={reason_code}")?; + } + Ok(()) + } + } + } +} + +impl std::error::Error for RequireAuxiliaryError {} + #[derive(Clone, Debug)] pub struct VerifiedRowIdentityPlan { kind: String, @@ -3314,11 +3380,19 @@ pub enum CreateRowIdentity { Jsonl(PathBuf), } +#[derive(Clone, Debug)] +pub struct CreateAuxiliaryArtifact { + pub name: String, + pub path: PathBuf, + pub required: bool, +} + #[derive(Clone, Debug, Default)] pub struct CreateManifestOptions { pub allow_absolute_paths: bool, pub allow_path_escape: bool, pub limits: ResourceLimits, + pub auxiliary_artifacts: Vec, } pub fn create_manifest_for_index( @@ -3417,13 +3491,16 @@ pub fn create_manifest_for_index_with_options( } }; + let auxiliary_artifacts = + create_auxiliary_artifacts(&options.auxiliary_artifacts, out_base, &options)?; + let invocation_id = format!("urn:uuid:{}", Uuid::new_v4()); Ok(IndexManifest { schema_version: SCHEMA_VERSION.to_string(), manifest_id: format!("urn:uuid:{}", Uuid::new_v4()), created_at: Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true), artifact, - auxiliary_artifacts: Vec::new(), + auxiliary_artifacts, embedding: Embedding { model: embedding_model.into(), dim: metadata.dim, @@ -3450,6 +3527,58 @@ pub fn create_manifest_for_index_with_options( }) } +fn create_auxiliary_artifacts( + artifacts: &[CreateAuxiliaryArtifact], + out_base: &Path, + options: &CreateManifestOptions, +) -> Result, ManifestError> { + let count = artifacts.len(); + if count > options.limits.max_auxiliary_artifacts { + return Err(ManifestError::limit_exceeded( + "auxiliary_artifact_count_limit_exceeded", + format!( + "auxiliary_artifacts has {count} entries, exceeding max_auxiliary_artifacts={}", + options.limits.max_auxiliary_artifacts + ), + )); + } + + let mut names = HashSet::new(); + let mut manifest_artifacts = Vec::with_capacity(artifacts.len()); + for artifact in artifacts { + let name = artifact.name.trim(); + if name.is_empty() { + return Err(ManifestError::invalid( + "auxiliary artifact name must be non-empty", + )); + } + if !names.insert(name.to_string()) { + return Err(ManifestError::invalid(format!( + "auxiliary artifact name {name:?} is duplicated" + ))); + } + let hash = sha256_file_bounded( + &artifact.path, + options.limits.max_auxiliary_artifact_bytes, + "auxiliary_artifact_file_too_large", + "auxiliary artifact", + )?; + manifest_artifacts.push(AuxiliaryArtifact { + name: name.to_string(), + path: manifest_path_for_create( + &artifact.path, + out_base, + options, + "auxiliary artifact", + )?, + sha256: hash.sha256, + file_size_bytes: hash.size_bytes, + required: artifact.required, + }); + } + Ok(manifest_artifacts) +} + pub fn write_manifest_file( manifest: &IndexManifest, path: impl AsRef, diff --git a/ordvec-manifest/src/main.rs b/ordvec-manifest/src/main.rs index 767b99d..50d7724 100644 --- a/ordvec-manifest/src/main.rs +++ b/ordvec-manifest/src/main.rs @@ -1,9 +1,9 @@ use clap::{Args, Parser, Subcommand}; use ordvec_manifest::{ create_manifest_for_index_with_options, load_manifest_file_with_options, sha256_file, - verify_manifest, write_manifest_file, CreateManifestOptions, CreateRowIdentity, - ManifestDocument, ManifestError, NullModelSpec, ProfileParameterization, ResourceLimits, - VerifyOptions, + verify_manifest, write_manifest_file, CreateAuxiliaryArtifact, CreateManifestOptions, + CreateRowIdentity, ManifestDocument, ManifestError, NullModelSpec, ProfileParameterization, + ResourceLimits, VerifyOptions, }; use serde_json::json; use std::fs; @@ -57,6 +57,10 @@ enum Commands { row_map: Option, #[arg(long)] row_id_is_identity: bool, + #[arg(long = "aux", value_name = "NAME=PATH", value_parser = parse_auxiliary_artifact_arg)] + auxiliary_artifacts: Vec, + #[arg(long = "optional-aux", value_name = "NAME=PATH", value_parser = parse_auxiliary_artifact_arg)] + optional_auxiliary_artifacts: Vec, #[arg(long)] embedding_model: String, #[arg(long)] @@ -75,6 +79,41 @@ enum Commands { }, } +#[derive(Clone, Debug)] +struct AuxiliaryArtifactArg { + name: String, + path: PathBuf, +} + +fn parse_auxiliary_artifact_arg(value: &str) -> Result { + let (name, path) = value + .split_once('=') + .ok_or_else(|| "expected NAME=PATH".to_string())?; + if name.trim().is_empty() { + return Err("auxiliary artifact name must be non-empty".to_string()); + } + if path.trim().is_empty() { + return Err("auxiliary artifact path must be non-empty".to_string()); + } + Ok(AuxiliaryArtifactArg { + name: name.trim().to_string(), + path: PathBuf::from(path.trim()), + }) +} + +#[cfg(test)] +mod tests { + use super::parse_auxiliary_artifact_arg; + use std::path::PathBuf; + + #[test] + fn auxiliary_artifact_arg_trims_name_and_path() { + let parsed = parse_auxiliary_artifact_arg(" ordinaldb.ids = ids.bin ").unwrap(); + assert_eq!(parsed.name, "ordinaldb.ids"); + assert_eq!(parsed.path, PathBuf::from("ids.bin")); + } +} + #[cfg(feature = "sqlite")] #[derive(Subcommand)] enum SqliteCommands { @@ -259,6 +298,8 @@ fn run() -> Result { index, row_map, row_id_is_identity, + auxiliary_artifacts, + optional_auxiliary_artifacts, embedding_model, out, allow_absolute_paths, @@ -282,6 +323,8 @@ fn run() -> Result { if let Some(parent) = out.parent().filter(|p| !p.as_os_str().is_empty()) { fs::create_dir_all(parent)?; } + let auxiliary_artifacts = + create_auxiliary_options(auxiliary_artifacts, optional_auxiliary_artifacts); let manifest = create_manifest_for_index_with_options( &index, row_identity, @@ -291,6 +334,7 @@ fn run() -> Result { allow_absolute_paths, allow_path_escape, limits: limits.resource_limits(), + auxiliary_artifacts, }, )?; write_manifest_file(&manifest, &out)?; @@ -302,6 +346,29 @@ fn run() -> Result { } } +fn create_auxiliary_options( + required: Vec, + optional: Vec, +) -> Vec { + required + .into_iter() + .map(|artifact| CreateAuxiliaryArtifact { + name: artifact.name, + path: artifact.path, + required: true, + }) + .chain( + optional + .into_iter() + .map(|artifact| CreateAuxiliaryArtifact { + name: artifact.name, + path: artifact.path, + required: false, + }), + ) + .collect() +} + #[cfg(feature = "sqlite")] fn run_sqlite(command: SqliteCommands) -> Result { match command { diff --git a/ordvec-manifest/tests/manifest.rs b/ordvec-manifest/tests/manifest.rs index 523696b..c1dfc80 100644 --- a/ordvec-manifest/tests/manifest.rs +++ b/ordvec-manifest/tests/manifest.rs @@ -3,10 +3,11 @@ use ordvec_manifest::{ create_manifest_for_index, create_manifest_for_index_with_options, load_manifest_file, load_manifest_file_with_options, sha256_file, verify_document_for_load, verify_for_load, verify_index_manifest, verify_manifest_with_base, AuxiliaryArtifact, AuxiliaryArtifactState, - CalibrationOrdinalization, CalibrationProfileRef, CreateManifestOptions, CreateRowIdentity, - DistortionBounds, DistortionEvidence, DistortionEvidenceKind, DistortionProfileArtifactRef, - DistortionScope, EncoderDistortionProfileRef, EncoderSpec, ManifestIndexKind, - ManifestIndexParams, MetricSpec, NullModelSpec, ProfileArtifactRef, ProfileParameterization, + CalibrationOrdinalization, CalibrationProfileRef, CreateAuxiliaryArtifact, + CreateManifestOptions, CreateRowIdentity, DistortionBounds, DistortionEvidence, + DistortionEvidenceKind, DistortionProfileArtifactRef, DistortionScope, + EncoderDistortionProfileRef, EncoderSpec, ManifestIndexKind, ManifestIndexParams, MetricSpec, + NullModelSpec, ProfileArtifactRef, ProfileParameterization, RequireAuxiliaryError, ResourceLimits, RowIdentity, VerifiedLoadPlanError, VerifyOptions, CALIBRATION_SCHEMA_VERSION, ENCODER_DISTORTION_SCHEMA_VERSION, }; @@ -14,6 +15,7 @@ use serde_json::json; use std::fs; use std::io::Write; use std::path::{Path, PathBuf}; +#[cfg(feature = "cli")] use std::process::Command; fn write_index(dir: &Path) -> PathBuf { @@ -314,6 +316,207 @@ fn create_manifest_creates_output_parent_for_programmatic_callers() { assert_eq!(manifest.row_identity.row_count(), 2); } +#[test] +fn create_manifest_declares_auxiliary_artifacts_for_load_plan_lookup() { + let temp = tempfile::tempdir().unwrap(); + let index = write_index(temp.path()); + let ids = temp.path().join("ids.bin"); + let optional = temp.path().join("optional.json"); + fs::write(&ids, 7u64.to_le_bytes()).unwrap(); + fs::write(&optional, br#"{"optional":true}"#).unwrap(); + let ids_hash = sha256_file(&ids).unwrap(); + let optional_hash = sha256_file(&optional).unwrap(); + let manifest_path = temp.path().join("manifest.json"); + + let manifest = create_manifest_for_index_with_options( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + CreateManifestOptions { + auxiliary_artifacts: vec![ + CreateAuxiliaryArtifact { + name: " ordinaldb.ids ".to_string(), + path: ids.clone(), + required: true, + }, + CreateAuxiliaryArtifact { + name: "optional.stats".to_string(), + path: optional.clone(), + required: false, + }, + ], + ..CreateManifestOptions::default() + }, + ) + .unwrap(); + + assert_eq!(manifest.auxiliary_artifacts.len(), 2); + assert_eq!(manifest.auxiliary_artifacts[0].name, "ordinaldb.ids"); + assert_eq!(manifest.auxiliary_artifacts[0].path, "ids.bin"); + assert_eq!(manifest.auxiliary_artifacts[0].sha256, ids_hash.sha256); + assert_eq!( + manifest.auxiliary_artifacts[0].file_size_bytes, + ids_hash.size_bytes + ); + assert!(manifest.auxiliary_artifacts[0].required); + assert_eq!(manifest.auxiliary_artifacts[1].name, "optional.stats"); + assert_eq!(manifest.auxiliary_artifacts[1].path, "optional.json"); + assert_eq!(manifest.auxiliary_artifacts[1].sha256, optional_hash.sha256); + assert!(!manifest.auxiliary_artifacts[1].required); + + fs::write( + &manifest_path, + serde_json::to_string_pretty(&manifest).unwrap(), + ) + .unwrap(); + fs::remove_file(&optional).unwrap(); + + let plan = verify_for_load(&manifest_path, VerifyOptions::default()).unwrap(); + assert_eq!( + plan.require_auxiliary("ordinaldb.ids").unwrap(), + fs::canonicalize(&ids).unwrap().as_path() + ); + assert_eq!( + plan.auxiliary_by_name("optional.stats").unwrap().state(), + AuxiliaryArtifactState::OptionalAbsent + ); + assert!(matches!( + plan.require_auxiliary("missing"), + Err(RequireAuxiliaryError::MissingDeclaration { .. }) + )); +} + +#[test] +fn create_manifest_rejects_invalid_auxiliary_artifact_declarations() { + let root = tempfile::tempdir().unwrap(); + let case = tempfile::tempdir_in(root.path()).unwrap(); + let index = write_index(case.path()); + let sidecar = case.path().join("ids.bin"); + fs::write(&sidecar, b"sidecar").unwrap(); + let manifest_path = case.path().join("manifest.json"); + + let err = create_manifest_for_index_with_options( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + CreateManifestOptions { + auxiliary_artifacts: vec![CreateAuxiliaryArtifact { + name: " ".to_string(), + path: sidecar.clone(), + required: true, + }], + ..CreateManifestOptions::default() + }, + ) + .unwrap_err(); + assert!(err.to_string().contains("name must be non-empty")); + + let err = create_manifest_for_index_with_options( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + CreateManifestOptions { + auxiliary_artifacts: vec![ + CreateAuxiliaryArtifact { + name: "dup".to_string(), + path: sidecar.clone(), + required: true, + }, + CreateAuxiliaryArtifact { + name: "dup".to_string(), + path: sidecar.clone(), + required: false, + }, + ], + ..CreateManifestOptions::default() + }, + ) + .unwrap_err(); + assert!(err.to_string().contains("duplicated")); + + let err = create_manifest_for_index_with_options( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + CreateManifestOptions { + limits: ResourceLimits { + max_auxiliary_artifacts: 0, + ..ResourceLimits::default() + }, + auxiliary_artifacts: vec![CreateAuxiliaryArtifact { + name: "ids".to_string(), + path: sidecar.clone(), + required: true, + }], + ..CreateManifestOptions::default() + }, + ) + .unwrap_err(); + assert_eq!(err.code(), Some("auxiliary_artifact_count_limit_exceeded")); + + let err = create_manifest_for_index_with_options( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + CreateManifestOptions { + limits: ResourceLimits { + max_auxiliary_artifact_bytes: 1, + ..ResourceLimits::default() + }, + auxiliary_artifacts: vec![CreateAuxiliaryArtifact { + name: "ids".to_string(), + path: sidecar.clone(), + required: true, + }], + ..CreateManifestOptions::default() + }, + ) + .unwrap_err(); + assert_eq!(err.code(), Some("auxiliary_artifact_file_too_large")); + + let missing = case.path().join("missing.bin"); + let err = create_manifest_for_index_with_options( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + CreateManifestOptions { + auxiliary_artifacts: vec![CreateAuxiliaryArtifact { + name: "missing".to_string(), + path: missing, + required: true, + }], + ..CreateManifestOptions::default() + }, + ) + .unwrap_err(); + assert!(err.to_string().contains("No such file") || err.to_string().contains("not found")); + + let outside = root.path().join("outside.bin"); + fs::write(&outside, b"outside").unwrap(); + let err = create_manifest_for_index_with_options( + &index, + CreateRowIdentity::RowIdIdentity, + "test-embedding", + &manifest_path, + CreateManifestOptions { + auxiliary_artifacts: vec![CreateAuxiliaryArtifact { + name: "outside".to_string(), + path: outside, + required: true, + }], + ..CreateManifestOptions::default() + }, + ) + .unwrap_err(); + assert!(err.to_string().contains("outside manifest directory")); +} + #[test] fn schema_rejects_unknown_fields_and_bad_extension_keys() { let root = tempfile::tempdir().unwrap(); @@ -2296,11 +2499,21 @@ fn auxiliary_artifact_schema_rejects_unknown_fields_and_duplicate_names() { manifest.auxiliary_artifacts = vec![ auxiliary_artifact("duplicate", "sidecar.bin", sidecar_hash.clone(), true), - auxiliary_artifact("duplicate", "sidecar.bin", sidecar_hash, false), + auxiliary_artifact("duplicate", "sidecar.bin", sidecar_hash.clone(), false), ]; let report = verify_manifest_with_base(manifest.clone(), temp.path(), VerifyOptions::default()); assert!(error_codes(&report).contains(&"auxiliary_artifact_name_duplicate")); + let mut padded = manifest.clone(); + padded.auxiliary_artifacts = vec![auxiliary_artifact( + " duplicate ", + "sidecar.bin", + sidecar_hash, + true, + )]; + let report = verify_manifest_with_base(padded, temp.path(), VerifyOptions::default()); + assert!(error_codes(&report).contains(&"auxiliary_artifact_name_not_trimmed")); + let mut value = serde_json::to_value(&manifest).unwrap(); value["auxiliary_artifacts"][0]["unexpected"] = json!(true); let parsed = serde_json::from_value::(value); @@ -2435,24 +2648,33 @@ fn attestation_shape_requires_matching_subject_sha256() { ); } +#[cfg(feature = "cli")] #[test] fn cli_create_verify_and_exit_codes() { let temp = tempfile::tempdir().unwrap(); let index = write_index(temp.path()); + let ids = temp.path().join("ids.bin"); + let optional = temp.path().join("optional.json"); + fs::write(&ids, 7u64.to_le_bytes()).unwrap(); + fs::write(&optional, br#"{"optional":true}"#).unwrap(); + let aux_arg = format!("ordinaldb.ids={}", ids.display()); + let optional_aux_arg = format!("optional.stats={}", optional.display()); let manifest = temp.path().join("manifest.json"); let bin = env!("CARGO_BIN_EXE_ordvec-manifest"); let output = Command::new(bin) - .args([ - "create", - "--index", - index.to_str().unwrap(), - "--row-id-is-identity", - "--embedding-model", - "test-embedding", - "--out", - manifest.to_str().unwrap(), - ]) + .arg("create") + .arg("--index") + .arg(index.to_str().unwrap()) + .arg("--row-id-is-identity") + .arg("--aux") + .arg(&aux_arg) + .arg("--optional-aux") + .arg(&optional_aux_arg) + .arg("--embedding-model") + .arg("test-embedding") + .arg("--out") + .arg(manifest.to_str().unwrap()) .output() .unwrap(); assert!( @@ -2596,6 +2818,7 @@ fn cli_create_verify_and_exit_codes() { ); } +#[cfg(feature = "cli")] #[test] fn create_outside_manifest_dir_requires_explicit_path_policy() { let temp = tempfile::tempdir().unwrap(); @@ -2809,28 +3032,31 @@ fn sqlite_cache_is_explicit_and_activation_reverifies_by_default() { .iter() .any(|issue| issue.code == "sqlite_activation_forced")); - let bin = env!("CARGO_BIN_EXE_ordvec-manifest"); - let output = Command::new(bin) - .args([ - "sqlite", - "activate", - "--db", - db.to_str().unwrap(), - "--manifest", - manifest_path.to_str().unwrap(), - "--force", - "--json", - ]) - .output() - .unwrap(); - assert_eq!(output.status.code(), Some(0)); - let forced_report: ordvec_manifest::VerificationReport = - serde_json::from_slice(&output.stdout).unwrap(); - assert!(!forced_report.ok); - assert!(forced_report - .warnings - .iter() - .any(|issue| issue.code == "sqlite_activation_forced")); + #[cfg(feature = "cli")] + { + let bin = env!("CARGO_BIN_EXE_ordvec-manifest"); + let output = Command::new(bin) + .args([ + "sqlite", + "activate", + "--db", + db.to_str().unwrap(), + "--manifest", + manifest_path.to_str().unwrap(), + "--force", + "--json", + ]) + .output() + .unwrap(); + assert_eq!(output.status.code(), Some(0)); + let forced_report: ordvec_manifest::VerificationReport = + serde_json::from_slice(&output.stdout).unwrap(); + assert!(!forced_report.ok); + assert!(forced_report + .warnings + .iter() + .any(|issue| issue.code == "sqlite_activation_forced")); + } } #[cfg(feature = "sqlite")] diff --git a/src/bitmap.rs b/src/bitmap.rs index 0daca1a..3e7c4f4 100644 --- a/src/bitmap.rs +++ b/src/bitmap.rs @@ -28,7 +28,7 @@ use rayon::prelude::*; use crate::rank::rank_transform; use crate::util::{and_popcount, assert_all_finite, result_buffer_len, TopK}; -use crate::SearchResults; +use crate::{OrdvecError, SearchResults}; /// Top-bucket bitmap index for constant-composition coarse scoring. /// @@ -47,6 +47,37 @@ pub struct Bitmap { } impl Bitmap { + pub fn validate_params(dim: usize, n_top: usize) -> Result<(), OrdvecError> { + if dim == 0 { + return Err(OrdvecError::InvalidParameter { + name: "dim", + message: "must be > 0".to_string(), + }); + } + if !dim.is_multiple_of(64) { + return Err(OrdvecError::InvalidParameter { + name: "dim", + message: "must be a multiple of 64".to_string(), + }); + } + if dim > crate::rank_io::MAX_DIM { + return Err(OrdvecError::InvalidParameter { + name: "dim", + message: format!( + "must be <= {} (u16 rank invariant)", + crate::rank_io::MAX_DIM + ), + }); + } + if !(n_top > 0 && n_top < dim) { + return Err(OrdvecError::InvalidParameter { + name: "n_top", + message: "must satisfy 0 < n_top < dim".to_string(), + }); + } + Ok(()) + } + pub fn new(dim: usize, n_top: usize) -> Self { assert_eq!(dim % 64, 0, "dim must be a multiple of 64"); // Bitmap rank-transforms each document (u16 ranks) and indexes the @@ -464,6 +495,20 @@ impl Bitmap { self.bitmaps.len() * std::mem::size_of::() } + pub fn swap_remove(&mut self, idx: usize) -> usize { + assert!(idx < self.n_vectors, "index out of bounds"); + let last = self.n_vectors - 1; + let qpv = self.qwords_per_vec; + if idx != last { + let src = last * qpv; + let dst = idx * qpv; + self.bitmaps.copy_within(src..src + qpv, dst); + } + self.bitmaps.truncate(last * qpv); + self.n_vectors -= 1; + last + } + /// Persist to a `.tvbm` file. Format: 17-byte header + u64 bitmaps LE. pub fn write(&self, path: impl AsRef) -> std::io::Result<()> { crate::rank_io::write_bitmap(path, self.dim, self.n_top, self.n_vectors, &self.bitmaps) diff --git a/src/lib.rs b/src/lib.rs index 06ea646..8515824 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -48,6 +48,8 @@ // (THREAT_MODEL.md, THREAT-SIMD-001). #![deny(unsafe_op_in_unsafe_fn)] +use std::fmt; + mod bitmap; mod fastscan; #[cfg(feature = "experimental")] @@ -61,7 +63,7 @@ pub mod sign_bitmap; mod util; pub use bitmap::Bitmap; -pub use quant::{rankquant_eval_search, RankQuant}; +pub use quant::{rankquant_eval_search, RankQuant, TwoStageCandidatePolicy}; pub use rank::Rank; pub use rank_io::{probe_index_metadata, IndexKind, IndexMetadata, IndexParams}; pub use sign_bitmap::SignBitmap; @@ -109,6 +111,80 @@ pub type MultiBucketBitmapIndex = MultiBucketBitmap; #[deprecated(since = "0.2.0", note = "renamed to `RankQuantFastscan`")] pub type RankQuantFastscanIndex = RankQuantFastscan; +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum OrdvecError { + InvalidParameter { + name: &'static str, + message: String, + }, + InvalidLength { + name: &'static str, + len: usize, + dim: usize, + }, + InvalidVectorLength { + name: &'static str, + len: usize, + expected: usize, + }, + CandidateIdOutOfRange { + id: u32, + n_vectors: usize, + }, +} + +impl fmt::Display for OrdvecError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::InvalidParameter { name, message } => { + write!(f, "invalid {name}: {message}") + } + Self::InvalidLength { name, len, dim } => { + write!(f, "{name} length {len} must be a multiple of dim {dim}") + } + Self::InvalidVectorLength { + name, + len, + expected, + } => { + write!(f, "{name} length {len} must equal dim {expected}") + } + Self::CandidateIdOutOfRange { id, n_vectors } => { + write!( + f, + "candidate id {id} out of range for n_vectors {n_vectors}" + ) + } + } + } +} + +impl std::error::Error for OrdvecError {} + +pub fn validate_flat_vectors_len(len: usize, dim: usize) -> Result { + if dim == 0 { + return Err(OrdvecError::InvalidParameter { + name: "dim", + message: "must be > 0".to_string(), + }); + } + if !len.is_multiple_of(dim) { + return Err(OrdvecError::InvalidLength { + name: "vectors", + len, + dim, + }); + } + Ok(len / dim) +} + +pub fn validate_candidate_ids(candidates: &[u32], n_vectors: usize) -> Result<(), OrdvecError> { + if let Some(&id) = candidates.iter().find(|&&id| (id as usize) >= n_vectors) { + return Err(OrdvecError::CandidateIdOutOfRange { id, n_vectors }); + } + Ok(()) +} + /// Top-k search results, laid out as `nq` contiguous blocks of `k`. /// /// `scores` and `indices` are flat row-major buffers of length `nq * k`; diff --git a/src/quant.rs b/src/quant.rs index cc40553..ccd65bd 100644 --- a/src/quant.rs +++ b/src/quant.rs @@ -23,8 +23,9 @@ use crate::rank::{ bucket_centre, bucket_ranks, pack_buckets, rank_to_bucket, rank_transform, rankquant_bytes_per_vec, rankquant_norm, }; +use crate::sign_bitmap::SignBitmap; use crate::util::{assert_all_finite, l2_normalise, result_buffer_len, TopK}; -use crate::SearchResults; +use crate::{validate_candidate_ids, OrdvecError, SearchResults}; fn check_eval_bits(bits: u8) { assert!((1..=7).contains(&bits), "bits must be in 1..=7"); @@ -79,6 +80,36 @@ pub struct RankQuant { pub(crate) packed: Vec, } +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct TwoStageCandidatePolicy { + pub min_candidates: usize, + pub k_multiplier: usize, + pub max_candidates: Option, +} + +impl TwoStageCandidatePolicy { + pub fn candidate_count(&self, k: usize, search_space: usize) -> usize { + if k == 0 || search_space == 0 { + return 0; + } + let mut count = self.min_candidates.max(k.saturating_mul(self.k_multiplier)); + if let Some(max_candidates) = self.max_candidates { + count = count.min(max_candidates); + } + count.min(search_space) + } +} + +impl Default for TwoStageCandidatePolicy { + fn default() -> Self { + Self { + min_candidates: 256, + k_multiplier: 32, + max_candidates: None, + } + } +} + /// SIMD dispatch tier for the asymmetric scan kernels. /// /// Tier selection is gated on *both* runtime CPU features and the @@ -147,6 +178,44 @@ fn select_simd_tier(dim: usize, bits: u8) -> SimdTier { } impl RankQuant { + pub fn validate_params(dim: usize, bits: u8) -> Result<(), OrdvecError> { + if !matches!(bits, 1 | 2 | 4) { + return Err(OrdvecError::InvalidParameter { + name: "bits", + message: "must be 1, 2, or 4".to_string(), + }); + } + if dim < 2 { + return Err(OrdvecError::InvalidParameter { + name: "dim", + message: "must be >= 2".to_string(), + }); + } + if dim > u16::MAX as usize { + return Err(OrdvecError::InvalidParameter { + name: "dim", + message: "must fit in u16".to_string(), + }); + } + let codes_per_byte = (8 / bits) as usize; + if !dim.is_multiple_of(codes_per_byte) { + return Err(OrdvecError::InvalidParameter { + name: "dim", + message: format!("must be a multiple of {codes_per_byte} for bits = {bits}"), + }); + } + let n_buckets = 1usize << bits; + if !dim.is_multiple_of(n_buckets) { + return Err(OrdvecError::InvalidParameter { + name: "dim", + message: format!( + "must be divisible by 2^bits = {n_buckets} so every bucket receives exactly dim / 2^bits rank entries" + ), + }); + } + Ok(()) + } + pub fn new(dim: usize, bits: u8) -> Self { assert!(matches!(bits, 1 | 2 | 4), "bits must be 1, 2, or 4"); assert!(dim >= 2, "dim must be >= 2"); @@ -661,6 +730,73 @@ impl RankQuant { .collect(); (scores, global_indices) } + + pub fn try_search_with_sign_probe( + &self, + sign_probe: &SignBitmap, + query: &[f32], + k: usize, + ) -> Result<(Vec, Vec), OrdvecError> { + self.try_search_with_sign_probe_with_policy( + sign_probe, + query, + k, + TwoStageCandidatePolicy::default(), + ) + } + + pub fn try_search_with_sign_probe_with_policy( + &self, + sign_probe: &SignBitmap, + query: &[f32], + k: usize, + policy: TwoStageCandidatePolicy, + ) -> Result<(Vec, Vec), OrdvecError> { + if sign_probe.dim() != self.dim { + return Err(OrdvecError::InvalidParameter { + name: "sign_probe.dim", + message: format!("must match RankQuant dim {}", self.dim), + }); + } + if sign_probe.len() != self.n_vectors { + return Err(OrdvecError::InvalidParameter { + name: "sign_probe.len", + message: format!("must match RankQuant len {}", self.n_vectors), + }); + } + if query.len() != self.dim { + return Err(OrdvecError::InvalidVectorLength { + name: "query", + len: query.len(), + expected: self.dim, + }); + } + validate_finite(query, "query")?; + let candidate_count = policy.candidate_count(k, self.n_vectors); + let candidates = sign_probe.top_m_candidates(query, candidate_count); + validate_candidate_ids(&candidates, self.n_vectors)?; + Ok(self.search_asymmetric_subset(query, &candidates, k)) + } + + pub fn search_with_sign_probe( + &self, + sign_probe: &SignBitmap, + query: &[f32], + k: usize, + ) -> (Vec, Vec) { + self.try_search_with_sign_probe(sign_probe, query, k) + .expect("search_with_sign_probe validation failed") + } +} + +fn validate_finite(values: &[f32], name: &'static str) -> Result<(), OrdvecError> { + if values.iter().any(|value| !value.is_finite()) { + return Err(OrdvecError::InvalidParameter { + name, + message: "must contain only finite values".to_string(), + }); + } + Ok(()) } /// Standalone symmetric RankQuant-style eval search for arbitrary bit widths. diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs index 8efd63a..8cae9a1 100644 --- a/src/sign_bitmap.rs +++ b/src/sign_bitmap.rs @@ -29,6 +29,8 @@ use rayon::prelude::*; +use crate::OrdvecError; + /// Index storing a 1-bit sign-cosine fingerprint per document. /// /// Storage: `dim / 8` bytes per doc. Dim must be a multiple of 64 @@ -44,6 +46,31 @@ pub struct SignBitmap { } impl SignBitmap { + pub fn validate_dim(dim: usize) -> Result<(), OrdvecError> { + if dim == 0 { + return Err(OrdvecError::InvalidParameter { + name: "dim", + message: "must be > 0".to_string(), + }); + } + if !dim.is_multiple_of(64) { + return Err(OrdvecError::InvalidParameter { + name: "dim", + message: "must be a multiple of 64".to_string(), + }); + } + if dim > crate::rank_io::MAX_SIGN_BITMAP_DIM { + return Err(OrdvecError::InvalidParameter { + name: "dim", + message: format!( + "must be <= MAX_SIGN_BITMAP_DIM (= {})", + crate::rank_io::MAX_SIGN_BITMAP_DIM + ), + }); + } + Ok(()) + } + /// Build an empty index for `dim`-dimensional embeddings. /// /// `dim` must be a multiple of 64 in @@ -303,6 +330,20 @@ impl SignBitmap { self.bitmaps.len() * std::mem::size_of::() } + pub fn swap_remove(&mut self, idx: usize) -> usize { + assert!(idx < self.n_vectors, "index out of bounds"); + let last = self.n_vectors - 1; + let qpv = self.qwords_per_vec; + if idx != last { + let src = last * qpv; + let dst = idx * qpv; + self.bitmaps.copy_within(src..src + qpv, dst); + } + self.bitmaps.truncate(last * qpv); + self.n_vectors -= 1; + last + } + /// Persist to a `.tvsb` file. Format: 13-byte header + LE u64 bitmaps. pub fn write(&self, path: impl AsRef) -> std::io::Result<()> { crate::rank_io::write_sign_bitmap(path, self.dim, self.n_vectors, &self.bitmaps) diff --git a/tests/index/bitmap.rs b/tests/index/bitmap.rs index 2468903..3d632f3 100644 --- a/tests/index/bitmap.rs +++ b/tests/index/bitmap.rs @@ -9,6 +9,17 @@ use rand_chacha::ChaCha8Rng; use crate::{make_corpus, D, N}; +fn corpus_after_swap_remove(corpus: &[f32], rows: usize, remove: usize) -> Vec { + let mut expected = corpus[..rows * D].to_vec(); + let last = rows - 1; + if remove != last { + let src = last * D..(last + 1) * D; + expected.copy_within(src, remove * D); + } + expected.truncate(last * D); + expected +} + #[test] fn rank_io_round_trip_bitmap_index() { let corpus = make_corpus(42); @@ -30,6 +41,59 @@ fn rank_io_round_trip_bitmap_index() { assert_eq!(r1.indices_for_query(0), r2.indices_for_query(0)); } +#[test] +fn bitmap_swap_remove_cases_match_rebuilt_probe() { + let corpus = make_corpus(40_001); + let query = &make_corpus(40_002)[..D]; + + for (rows, remove) in [(1usize, 0usize), (8, 0), (8, 3), (8, 7)] { + let mut index = Bitmap::new(D, D / 4); + index.add(&corpus[..rows * D]); + let moved = index.swap_remove(remove); + + let expected_corpus = corpus_after_swap_remove(&corpus, rows, remove); + let mut rebuilt = Bitmap::new(D, D / 4); + rebuilt.add(&expected_corpus); + + assert_eq!(moved, rows - 1); + assert_eq!(index.len(), rows - 1); + assert_eq!(index.byte_size(), (rows - 1) * index.bytes_per_vec()); + assert_eq!( + index.top_m_candidates(query, rows), + rebuilt.top_m_candidates(query, rows), + "remove={remove} rows={rows}" + ); + assert_eq!( + index.search(query, rows).indices_for_query(0), + rebuilt.search(query, rows).indices_for_query(0), + "remove={remove} rows={rows}" + ); + } +} + +#[test] +fn bitmap_write_then_load_after_swap_remove_preserves_probe() { + let corpus = make_corpus(40_003); + let mut index = Bitmap::new(D, D / 4); + index.add(&corpus); + index.swap_remove(17); + + let tmp = std::env::temp_dir().join(format!( + "ordvec_bitmap_after_swap_remove_{}.tvbm", + std::process::id() + )); + index.write(&tmp).expect("write"); + let loaded = Bitmap::load(&tmp).expect("load"); + std::fs::remove_file(&tmp).ok(); + + let query = &make_corpus(40_004)[..D]; + assert_eq!(loaded.len(), index.len()); + assert_eq!( + loaded.top_m_candidates(query, 32), + index.top_m_candidates(query, 32) + ); +} + #[test] fn bitmap_index_constant_composition_invariant() { // Every doc bitmap should have exactly n_top bits set. diff --git a/tests/index/two_stage.rs b/tests/index/two_stage.rs index b63f477..5b0ad19 100644 --- a/tests/index/two_stage.rs +++ b/tests/index/two_stage.rs @@ -1,7 +1,21 @@ -use ordvec::{RankQuant, SignBitmap}; +use ordvec::{ + validate_candidate_ids, validate_flat_vectors_len, Bitmap, OrdvecError, RankQuant, SignBitmap, + TwoStageCandidatePolicy, +}; use crate::{make_corpus, D, N}; +fn corpus_after_swap_remove(corpus: &[f32], rows: usize, remove: usize) -> Vec { + let mut expected = corpus[..rows * D].to_vec(); + let last = rows - 1; + if remove != last { + let src = last * D..(last + 1) * D; + expected.copy_within(src, remove * D); + } + expected.truncate(last * D); + expected +} + fn build_two_stage(bits: u8) -> (SignBitmap, RankQuant, Vec) { let corpus = make_corpus(15_001); let mut sign = SignBitmap::new(D); @@ -32,6 +46,94 @@ fn assert_score_then_id_order(scores: &[f32], ids: &[i64]) { } } +#[test] +fn core_validation_helpers_report_errors_without_panicking() { + assert!(RankQuant::validate_params(D, 2).is_ok()); + assert!(RankQuant::validate_params(D, 3).is_err()); + assert!(RankQuant::validate_params(1, 2).is_err()); + assert!(RankQuant::validate_params(D + 1, 2).is_err()); + + assert!(Bitmap::validate_params(D, D / 4).is_ok()); + assert!(Bitmap::validate_params(D + 1, D / 4).is_err()); + assert!(Bitmap::validate_params(D, 0).is_err()); + assert!(Bitmap::validate_params(D, D).is_err()); + + assert!(SignBitmap::validate_dim(D).is_ok()); + assert!(SignBitmap::validate_dim(0).is_err()); + assert!(SignBitmap::validate_dim(D + 1).is_err()); + + assert_eq!(validate_flat_vectors_len(D * 3, D).unwrap(), 3); + assert!(validate_flat_vectors_len(D * 3 + 1, D).is_err()); + assert!(validate_candidate_ids(&[0, 7, (N - 1) as u32], N).is_ok()); + assert!(validate_candidate_ids(&[N as u32], N).is_err()); +} + +#[test] +fn two_stage_candidate_policy_is_overflow_safe_and_clamped() { + let default = TwoStageCandidatePolicy::default(); + assert_eq!(default.candidate_count(0, N), 0); + assert_eq!(default.candidate_count(1, N), 256.min(N)); + assert_eq!(default.candidate_count(10, N), N); + + let capped = TwoStageCandidatePolicy { + min_candidates: 4, + k_multiplier: usize::MAX, + max_candidates: Some(37), + }; + assert_eq!(capped.candidate_count(usize::MAX, N), 37.min(N)); + assert_eq!(capped.candidate_count(10, 9), 9); +} + +#[test] +fn sign_bitmap_swap_remove_cases_match_rebuilt_probe() { + let corpus = make_corpus(15_010); + let query = &make_corpus(15_011)[..D]; + + for (rows, remove) in [(1usize, 0usize), (8, 0), (8, 3), (8, 7)] { + let mut index = SignBitmap::new(D); + index.add(&corpus[..rows * D]); + let moved = index.swap_remove(remove); + + let expected_corpus = corpus_after_swap_remove(&corpus, rows, remove); + let mut rebuilt = SignBitmap::new(D); + rebuilt.add(&expected_corpus); + + assert_eq!(moved, rows - 1); + assert_eq!(index.len(), rows - 1); + assert_eq!(index.byte_size(), (rows - 1) * index.bytes_per_vec()); + assert_eq!(index.score_all(query), rebuilt.score_all(query)); + assert_eq!( + index.top_m_candidates(query, rows), + rebuilt.top_m_candidates(query, rows), + "remove={remove} rows={rows}" + ); + } +} + +#[test] +fn sign_bitmap_write_then_load_after_swap_remove_preserves_probe() { + let corpus = make_corpus(15_012); + let mut index = SignBitmap::new(D); + index.add(&corpus); + index.swap_remove(17); + + let tmp = std::env::temp_dir().join(format!( + "ordvec_sign_bitmap_after_swap_remove_{}.tvsb", + std::process::id() + )); + index.write(&tmp).expect("write"); + let loaded = SignBitmap::load(&tmp).expect("load"); + std::fs::remove_file(&tmp).ok(); + + let query = &make_corpus(15_013)[..D]; + assert_eq!(loaded.len(), index.len()); + assert_eq!(loaded.score_all(query), index.score_all(query)); + assert_eq!( + loaded.top_m_candidates(query, 32), + index.top_m_candidates(query, 32) + ); +} + #[test] fn sign_rankquant_pipeline_handles_edge_candidate_and_k_shapes() { let (sign, rankquant, _corpus) = build_two_stage(2); @@ -67,6 +169,68 @@ fn sign_rankquant_pipeline_handles_edge_candidate_and_k_shapes() { assert_score_then_id_order(&scores, &ids); } +#[test] +fn rankquant_sign_probe_helper_matches_manual_candidates() { + let (sign, rankquant, _corpus) = build_two_stage(2); + let query = &make_corpus(15_004)[..D]; + let policy = TwoStageCandidatePolicy { + min_candidates: 0, + k_multiplier: 4, + max_candidates: Some(37), + }; + let k = 5; + let candidates = sign.top_m_candidates(query, policy.candidate_count(k, rankquant.len())); + let manual = rankquant.search_asymmetric_subset(query, &candidates, k); + let helper = rankquant + .try_search_with_sign_probe_with_policy(&sign, query, k, policy) + .unwrap(); + + assert_eq!(helper.1, manual.1); + assert_eq!(helper.0.len(), manual.0.len()); + for (helper, manual) in helper.0.iter().zip(manual.0.iter()) { + assert!((helper - manual).abs() <= 1e-6); + } + + let default_try = rankquant + .try_search_with_sign_probe(&sign, query, k) + .unwrap(); + let default_panic = rankquant.search_with_sign_probe(&sign, query, k); + assert_eq!(default_try.1, default_panic.1); +} + +#[test] +fn rankquant_sign_probe_helper_validates_probe_and_query_shape() { + let (sign, rankquant, _corpus) = build_two_stage(2); + let query = &make_corpus(15_005)[..D]; + let wrong_dim = SignBitmap::new(D * 2); + assert!(rankquant + .try_search_with_sign_probe(&wrong_dim, query, 5) + .is_err()); + + let mut short_probe = SignBitmap::new(D); + short_probe.add(&make_corpus(15_006)[..(N - 1) * D]); + assert!(rankquant + .try_search_with_sign_probe(&short_probe, query, 5) + .is_err()); + + assert!(matches!( + rankquant + .try_search_with_sign_probe(&sign, &query[..D - 1], 5) + .unwrap_err(), + OrdvecError::InvalidVectorLength { + name: "query", + len, + expected: D, + } if len == D - 1 + )); + + let mut bad_query = query.to_vec(); + bad_query[0] = f32::NAN; + assert!(rankquant + .try_search_with_sign_probe(&sign, &bad_query, 5) + .is_err()); +} + #[test] fn sign_rankquant_full_candidate_set_matches_full_rankquant_search() { let (sign, rankquant, _corpus) = build_two_stage(4); diff --git a/tests/release_publish_invariants.py b/tests/release_publish_invariants.py index bc7f82d..6aacd63 100644 --- a/tests/release_publish_invariants.py +++ b/tests/release_publish_invariants.py @@ -1209,6 +1209,60 @@ def check_publish_crates(workflow: dict[str, Any], path: str) -> None: manifest_job = mapping(jobs.get("publish-manifest-crate"), f"{path}: jobs.publish-manifest-crate") if not has_need(manifest_job, "publish-crate"): fail(f"{path}: publish-manifest-crate must need publish-crate so ordvec publishes first") + manifest_steps = sequence( + manifest_job.get("steps"), f"{path}: jobs.publish-manifest-crate.steps" + ) + manifest_recovery_steps: list[tuple[int, dict[str, Any]]] = [] + for index, raw_step in enumerate(manifest_steps): + step = mapping(raw_step, f"{path}: jobs.publish-manifest-crate.steps[{index}]") + if step.get("name") == "Check for existing ordvec-manifest .crate recovery": + manifest_recovery_steps.append((index, step)) + if len(manifest_recovery_steps) != 1: + fail( + f"{path}: publish-manifest-crate must have exactly one first-publish recovery check" + ) + recovery_index, recovery_step = manifest_recovery_steps[0] + if recovery_step.get("id") != "manifest_crate_recovery": + fail(f"{path}: manifest crate recovery step must have id manifest_crate_recovery") + recovery_run = recovery_step.get("run") + if not isinstance(recovery_run, str): + fail(f"{path}: manifest crate recovery step must be a run step") + for required in ( + "already_published=true", + "already_published=false", + "Refusing recovery", + "crates.io already serves byte-identical ordvec-manifest", + ): + if required not in recovery_run: + fail(f"{path}: manifest crate recovery step must contain {required!r}") + for url_var in ("API_URL", "STATIC_URL"): + if not any( + has_shell_arg(words, shell_vars(url_var)) + and has_shell_option_value( + words, {"--user-agent", "-A"}, shell_vars("CRATES_IO_USER_AGENT") + ) + and has_shell_option_value(words, {"--output", "-o"}, shell_vars("EXISTING")) + for words in shell_curl_commands(recovery_run) + ): + fail( + f"{path}: manifest crate recovery step must curl ${url_var} " + "with CRATES_IO_USER_AGENT into $EXISTING" + ) + for index, raw_step in enumerate(manifest_steps): + step = mapping(raw_step, f"{path}: jobs.publish-manifest-crate.steps[{index}]") + name = step.get("name") + if name in { + "Validate manifest publish dry-run", + "Mint a short-lived crates.io credential (OIDC)", + "cargo publish", + }: + if index < recovery_index: + fail(f"{path}: {name} must run after the manifest crate recovery check") + if step.get("if") != "steps.manifest_crate_recovery.outputs.already_published != 'true'": + fail( + f"{path}: {name} must be skipped when manifest crate recovery found " + "byte-identical existing bytes" + ) build_manifest_job = mapping(jobs.get("build-manifest-crate"), f"{path}: jobs.build-manifest-crate") if not has_need(build_manifest_job, "publish-crate"): fail(f"{path}: build-manifest-crate must need publish-crate so lockstep ordvec exists") From 083a0912024711f3b1e66e406e1126e643e016b5 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Tue, 9 Jun 2026 13:18:13 -0500 Subject: [PATCH 2/4] Add ordvec-manifest Python package release lane Signed-off-by: Nelson Spence --- .github/workflows/python.yml | 43 +- .github/workflows/release.yml | 362 +++++++++- Cargo.lock | 10 + Cargo.toml | 6 +- ordvec-manifest-python/Cargo.toml | 20 + ordvec-manifest-python/README.md | 47 ++ ordvec-manifest-python/pyproject.toml | 50 ++ .../python/ordvec_manifest/__init__.py | 51 ++ ordvec-manifest-python/src/lib.rs | 621 ++++++++++++++++++ .../tests/test_manifest_bindings.py | 134 ++++ tests/release_publish_invariants.py | 165 ++++- tests/release_pypi_canonical_dist.py | 32 +- tests/release_pypi_canonical_dist_tests.py | 24 +- tests/release_signed_release_invariants.sh | 48 +- 14 files changed, 1525 insertions(+), 88 deletions(-) create mode 100644 ordvec-manifest-python/Cargo.toml create mode 100644 ordvec-manifest-python/README.md create mode 100644 ordvec-manifest-python/pyproject.toml create mode 100644 ordvec-manifest-python/python/ordvec_manifest/__init__.py create mode 100644 ordvec-manifest-python/src/lib.rs create mode 100644 ordvec-manifest-python/tests/test_manifest_bindings.py diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 17aa6ee..b1f77d2 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -1,10 +1,11 @@ name: python -# Bindings CI: lint + build the ordvec-python extension with maturin and run the -# pytest suite across OS and Python versions. Kept separate from ci.yml (the -# core-crate gates) so the binding's pyo3/numpy/ndarray toolchain never entangles -# the core checks, and so a green run proves the bindings build cold from a -# checkout (the paper's reproducibility requirement). +# Bindings CI: lint + build the ordvec-python and ordvec-manifest-python +# extensions with maturin and run their pytest suites across OS and Python +# versions. Kept separate from ci.yml (the core-crate gates) so the binding +# pyo3/numpy/ndarray toolchains never entangle the core checks, and so a green +# run proves the bindings build cold from a checkout (the paper's +# reproducibility requirement). on: # Push runs on EVERY commit to main (no paths filter): release.yml's # require-ci-green gate asserts a successful python.yml run exists for the exact @@ -17,6 +18,8 @@ on: pull_request: paths: - "ordvec-python/**" + - "ordvec-manifest-python/**" + - "ordvec-manifest/**" - "src/**" - "Cargo.toml" - "Cargo.lock" @@ -48,7 +51,9 @@ jobs: # The core ci.yml clippy is scoped to the core crate via default-members, # so the binding gets its quality gate here instead. - run: cargo fmt -p ordvec-python --check + - run: cargo fmt -p ordvec-manifest-python --check - run: cargo clippy -p ordvec-python --all-targets -- -D warnings + - run: cargo clippy -p ordvec-manifest-python --all-targets -- -D warnings test: name: py${{ matrix.python }} on ${{ matrix.os }} @@ -119,3 +124,31 @@ jobs: python -m pip install --require-hashes --no-index --no-deps -r "$REQ_FILE" - name: pytest run: python -m pytest ordvec-python/tests -q + - name: build the manifest wheel (abi3, release) + working-directory: ordvec-manifest-python + run: maturin build --release --out dist + - name: install the built manifest wheel + shell: bash + run: | + set -euo pipefail + WHEEL="$(python - <<'PY' + from pathlib import Path + wheels = sorted(Path("ordvec-manifest-python/dist").glob("*.whl")) + if len(wheels) != 1: + raise SystemExit(f"expected exactly one manifest wheel, found {wheels}") + print(wheels[0]) + PY + )" + REQ_FILE="${RUNNER_TEMP:?RUNNER_TEMP must be set}/ordvec-manifest-wheel-requirements.txt" + python - <<'PY' "$WHEEL" > "$REQ_FILE" + import hashlib + import sys + from pathlib import Path + + wheel = Path(sys.argv[1]).resolve() + digest = hashlib.sha256(wheel.read_bytes()).hexdigest() + print(f"ordvec-manifest @ {wheel.as_uri()} --hash=sha256:{digest}") + PY + python -m pip install --require-hashes --no-index --no-deps -r "$REQ_FILE" + - name: pytest manifest bindings + run: python -m pytest ordvec-manifest-python/tests -q diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4d3c346..b5089d4 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,4 +1,4 @@ -# Unified, tag-triggered release pipeline for ordvec (Rust crates + Python wheel). +# Unified, tag-triggered release pipeline for ordvec (Rust crates + Python wheels). # # Cutting a stable `vMAJOR.MINOR.PATCH` tag fully automates: build (crates + # wheels + sdist) -> canonicalize the Python dist (current build for new @@ -6,11 +6,12 @@ # version) -> attest / SLSA-provenance the files this run actually built -> # stage core/Python assets on the DRAFT GitHub Release (`release-assets-draft`) # -> publish the core crate -> build/attest/stage the lockstep manifest crate -# (now that the core version exists on crates.io) -> gated registry publishes / -# verification -> un-draft ONLY after all registry jobs succeed -# (`publish-github-release`). The registry gates (two crates.io jobs plus PyPI) -# are bound to GitHub Environments with Required Reviewers, so they pause for a -# human. +# plus `ordvec-manifest` Python assets (now that the core version exists on +# crates.io) -> gated registry publishes / verification -> un-draft ONLY after +# all registry jobs succeed +# (`publish-github-release`). The registry gates (two crates.io jobs plus two +# PyPI jobs) are bound to GitHub Environments with Required Reviewers, so they +# pause for a human. # # The un-draft-after-publish ordering is deliberate: it prevents a public # GitHub Release from existing for a version that crates.io / PyPI later @@ -75,6 +76,8 @@ # `crates-io`). # * PyPI: ordvec > publishing > edit the GitHub publisher -> workflow = # `release.yml` (env stays `pypi`). +# * PyPI: ordvec-manifest > publishing > add or edit the GitHub publisher +# -> workflow = `release.yml` (env stays `pypi`). # * GitHub Environments `crates-io` AND `pypi`: # - keep "Required reviewers" (the human publish gate); # - set "Deployment branches and tags" to **Selected branches and tags** @@ -525,6 +528,140 @@ jobs: path: ordvec-python/ordvec-python.cdx.json if-no-files-found: error + build-manifest-wheels: + name: manifest wheel ${{ matrix.platform.target }} (${{ matrix.platform.runner }}) + needs: guard + if: needs.guard.outputs.ok == 'true' + runs-on: ${{ matrix.platform.runner }} + strategy: + fail-fast: false + matrix: + platform: + - { runner: ubuntu-latest, target: x86_64, manylinux: auto } + - { runner: ubuntu-latest, target: aarch64, manylinux: auto } + - { runner: macos-latest, target: aarch64, manylinux: auto } + - { runner: windows-latest, target: x64, manylinux: auto } + steps: + - name: Harden the runner + uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4 + with: + egress-policy: audit + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + persist-credentials: false + - name: Build manifest abi3 wheel + uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 + with: + working-directory: ordvec-manifest-python + target: ${{ matrix.platform.target }} + manylinux: ${{ matrix.platform.manylinux }} + args: --release --out dist + - name: Set up Python to test the built manifest wheel + if: ${{ !(matrix.platform.runner == 'ubuntu-latest' && matrix.platform.target == 'aarch64') }} + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.13" + - name: Install the built manifest wheel and run pytest + if: ${{ !(matrix.platform.runner == 'ubuntu-latest' && matrix.platform.target == 'aarch64') }} + shell: bash + run: | + set -euo pipefail + WHEEL="$(python - <<'PY' + from pathlib import Path + wheels = sorted(Path("ordvec-manifest-python/dist").glob("*.whl")) + if len(wheels) != 1: + raise SystemExit(f"expected exactly one manifest wheel, found {wheels}") + print(wheels[0]) + PY + )" + python -m pip install --require-hashes -r ordvec-python/requirements-dev.txt + REQ_FILE="${RUNNER_TEMP:?RUNNER_TEMP must be set}/ordvec-manifest-wheel-requirements.txt" + python - <<'PY' "$WHEEL" > "$REQ_FILE" + import hashlib + import sys + from pathlib import Path + + wheel = Path(sys.argv[1]).resolve() + digest = hashlib.sha256(wheel.read_bytes()).hexdigest() + print(f"ordvec-manifest @ {wheel.as_uri()} --hash=sha256:{digest}") + PY + python -m pip install --require-hashes --no-index --no-deps -r "$REQ_FILE" + python -m pytest ordvec-manifest-python/tests -q + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: wheels-manifest-${{ matrix.platform.runner }}-${{ matrix.platform.target }} + path: ordvec-manifest-python/dist/*.whl + if-no-files-found: error + + build-manifest-sdist: + name: build manifest sdist + SBOM + needs: guard + if: needs.guard.outputs.ok == 'true' + runs-on: ubuntu-latest + steps: + - name: Harden the runner + uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4 + with: + egress-policy: audit + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + persist-credentials: false + - name: Build the manifest sdist + uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 + with: + working-directory: ordvec-manifest-python + command: sdist + args: --out dist + - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable (2026-03-27) + with: + toolchain: stable + - name: Set up Python to test the manifest sdist + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.13" + - name: Install from the manifest sdist and run pytest + shell: bash + run: | + set -euo pipefail + SDIST="$(python - <<'PY' + from pathlib import Path + sdists = sorted(Path("ordvec-manifest-python/dist").glob("*.tar.gz")) + if len(sdists) != 1: + raise SystemExit(f"expected exactly one manifest sdist, found {sdists}") + print(sdists[0]) + PY + )" + python -m pip install --require-hashes -r ordvec-python/requirements-dev.txt + REQ_FILE="${RUNNER_TEMP:?RUNNER_TEMP must be set}/ordvec-manifest-sdist-requirements.txt" + python - <<'PY' "$SDIST" > "$REQ_FILE" + import hashlib + import sys + from pathlib import Path + + sdist = Path(sys.argv[1]).resolve() + digest = hashlib.sha256(sdist.read_bytes()).hexdigest() + print(f"ordvec-manifest @ {sdist.as_uri()} --hash=sha256:{digest}") + PY + python -m pip install --require-hashes --no-index --no-deps --no-build-isolation -r "$REQ_FILE" + python -m pytest ordvec-manifest-python/tests -q + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: sdist-manifest + path: ordvec-manifest-python/dist/*.tar.gz + if-no-files-found: error + - name: Install cargo-cyclonedx + shell: bash + run: cargo install cargo-cyclonedx --version 0.5.9 --locked + - name: Generate CycloneDX SBOM for the manifest binding crate + shell: bash + run: cargo cyclonedx --manifest-path ordvec-manifest-python/Cargo.toml --format json + - name: Upload the manifest binding SBOM + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: sbom-manifest-python + path: ordvec-manifest-python/ordvec-manifest-python.cdx.json + if-no-files-found: error + pypi-canonical-dist: name: canonicalize Python dist for PyPI/GitHub Release needs: [guard, build-wheels, build-sdist] @@ -571,6 +708,118 @@ jobs: canonical-dist/*.tar.gz if-no-files-found: error + pypi-manifest-canonical-dist: + name: canonicalize ordvec-manifest Python dist for PyPI/GitHub Release + needs: [guard, build-manifest-wheels, build-manifest-sdist] + if: needs.guard.outputs.ok == 'true' + runs-on: ubuntu-latest + outputs: + source: ${{ steps.canonicalize.outputs.source }} + pypi_exists: ${{ steps.canonicalize.outputs.pypi_exists }} + steps: + - name: Harden the runner + uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4 + with: + egress-policy: audit + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + persist-credentials: false + - name: Collect the built manifest wheels + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + pattern: wheels-manifest-* + path: built-dist + merge-multiple: true + - name: Collect the built manifest sdist + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: sdist-manifest + path: built-dist + - name: Select canonical ordvec-manifest Python dist + id: canonicalize + env: + VERSION: ${{ needs.guard.outputs.version }} + run: | + set -euo pipefail + python3 tests/release_pypi_canonical_dist.py canonicalize \ + --project ordvec-manifest \ + --version "$VERSION" \ + --built-dir built-dist \ + --out-dir canonical-dist + - name: Upload the canonical ordvec-manifest Python dist + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: pypi-manifest-canonical-dist + path: | + canonical-dist/*.whl + canonical-dist/*.tar.gz + if-no-files-found: error + + smoke-linux-aarch64-manifest-wheel: + name: smoke linux/aarch64 ordvec-manifest wheel + needs: [guard, pypi-manifest-canonical-dist] + if: needs.guard.outputs.ok == 'true' + runs-on: ubuntu-24.04-arm + steps: + - name: Harden the runner + uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4 + with: + egress-policy: audit + - name: Set up Python to test the canonical manifest wheel + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.13" + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + persist-credentials: false + - name: Download the canonical manifest Python dist + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: pypi-manifest-canonical-dist + path: wheelhouse + - name: Install exact manifest wheel and run tiny verifier smoke + shell: bash + run: | + set -euo pipefail + WHEEL="$(python - <<'PY' + from pathlib import Path + wheels = sorted( + wheel + for wheel in Path("wheelhouse").glob("*.whl") + if "aarch64" in wheel.name and ("manylinux" in wheel.name or "musllinux" in wheel.name) + ) + if len(wheels) != 1: + raise SystemExit(f"expected exactly one linux/aarch64 manifest wheel, found {wheels}") + print(wheels[0]) + PY + )" + python -m pip install --require-hashes -r ordvec-python/requirements-dev.txt + REQ_FILE="${RUNNER_TEMP:?RUNNER_TEMP must be set}/ordvec-manifest-aarch64-wheel-requirements.txt" + python - <<'PY' "$WHEEL" > "$REQ_FILE" + import hashlib + import sys + from pathlib import Path + + wheel = Path(sys.argv[1]).resolve() + digest = hashlib.sha256(wheel.read_bytes()).hexdigest() + print(f"ordvec-manifest @ {wheel.as_uri()} --hash=sha256:{digest}") + PY + python -m pip install --require-hashes --no-index --no-deps -r "$REQ_FILE" + python - <<'PY' + import hashlib + from pathlib import Path + import tempfile + + import ordvec_manifest + + with tempfile.TemporaryDirectory() as tmp: + path = Path(tmp) / "artifact.bin" + path.write_bytes(b"ordvec-manifest smoke") + got = ordvec_manifest.sha256_file(path) + assert got["sha256"] == hashlib.sha256(b"ordvec-manifest smoke").hexdigest() + assert got["size_bytes"] == len(b"ordvec-manifest smoke") + PY + attest: name: GitHub artifact attestation (+ .sigstore.json bundle) needs: [guard, build-crate, pypi-canonical-dist] @@ -867,7 +1116,7 @@ jobs: attest-manifest: name: GitHub artifact attestation for ordvec-manifest - needs: [guard, build-manifest-crate] + needs: [guard, build-manifest-crate, pypi-manifest-canonical-dist] if: needs.guard.outputs.ok == 'true' runs-on: ubuntu-latest permissions: @@ -885,14 +1134,29 @@ jobs: with: name: dist-manifest-crate path: dist - - name: Attest build provenance for manifest crate - id: attest_manifest + - name: Collect the canonical ordvec-manifest Python dist + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: pypi-manifest-canonical-dist + path: dist + - name: Attest build provenance for manifest crate + canonical manifest wheels + sdist + id: attest_manifest_all + if: needs.pypi-manifest-canonical-dist.outputs.source == 'build' + uses: actions/attest-build-provenance@a2bbfa25375fe432b6a289bc6b6cd05ecd0c4c32 # v4.1.0 + with: + subject-path: | + dist/*.crate + dist/*.whl + dist/*.tar.gz + - name: Attest build provenance for manifest crate only + id: attest_manifest_crate + if: needs.pypi-manifest-canonical-dist.outputs.source == 'pypi' uses: actions/attest-build-provenance@a2bbfa25375fe432b6a289bc6b6cd05ecd0c4c32 # v4.1.0 with: subject-path: dist/*.crate - name: Stage the manifest Sigstore bundle as a release asset env: - BUNDLE: ${{ steps.attest_manifest.outputs.bundle-path }} + BUNDLE: ${{ steps.attest_manifest_all.outputs.bundle-path || steps.attest_manifest_crate.outputs.bundle-path }} VERSION: ${{ needs.guard.outputs.version }} run: cp "$BUNDLE" "ordvec-manifest-${VERSION}.sigstore.json" - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -902,8 +1166,8 @@ jobs: if-no-files-found: error combine-manifest-hash: - name: combine ordvec-manifest digest for SLSA provenance - needs: [guard, build-manifest-crate] + name: combine ordvec-manifest digests for SLSA provenance + needs: [guard, build-manifest-crate, pypi-manifest-canonical-dist] if: needs.guard.outputs.ok == 'true' runs-on: ubuntu-latest outputs: @@ -918,12 +1182,28 @@ jobs: with: name: dist-manifest-crate path: dist - - name: Compute base64 sha256sum over manifest SLSA subject + - name: Collect the canonical ordvec-manifest Python dist + if: needs.pypi-manifest-canonical-dist.outputs.source == 'build' + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: pypi-manifest-canonical-dist + path: dist + - name: Compute base64 sha256sum over manifest SLSA subjects id: hash working-directory: dist + env: + PYPI_SOURCE: ${{ needs.pypi-manifest-canonical-dist.outputs.source }} run: | set -euo pipefail - echo "hashes=$(sha256sum ./*.crate | base64 -w0)" >> "$GITHUB_OUTPUT" + if [ "$PYPI_SOURCE" = "build" ]; then + echo "hashes=$(sha256sum ./*.crate ./*.whl ./*.tar.gz | base64 -w0)" >> "$GITHUB_OUTPUT" + elif [ "$PYPI_SOURCE" = "pypi" ]; then + echo "::notice::ordvec-manifest PyPI dist already exists; SLSA subjects are limited to the manifest crate built by this run." + echo "hashes=$(sha256sum ./*.crate | base64 -w0)" >> "$GITHUB_OUTPUT" + else + echo "::error::unexpected pypi-manifest-canonical-dist source: $PYPI_SOURCE" + exit 1 + fi manifest-provenance: name: SLSA provenance for ordvec-manifest (.intoto.jsonl) @@ -941,7 +1221,7 @@ jobs: release-manifest-assets-draft: name: stage ordvec-manifest assets on the DRAFT Release (does NOT un-draft) - needs: [guard, build-manifest-crate, attest-manifest, manifest-provenance] + needs: [guard, build-manifest-crate, attest-manifest, manifest-provenance, pypi-manifest-canonical-dist, smoke-linux-aarch64-manifest-wheel] if: needs.guard.outputs.ok == 'true' runs-on: ubuntu-latest permissions: @@ -961,6 +1241,11 @@ jobs: with: name: sigstore-bundle-manifest path: dist + - name: Collect the canonical ordvec-manifest Python dist + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: pypi-manifest-canonical-dist + path: dist - name: Collect workflow artifacts for manifest SLSA provenance uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: @@ -986,6 +1271,8 @@ jobs: set -euo pipefail gh release upload "$TAG_NAME" \ dist/*.crate \ + dist/*.whl \ + dist/*.tar.gz \ dist/*.sigstore.json \ dist/*.intoto.jsonl \ --clobber @@ -1109,6 +1396,49 @@ jobs: fi echo "OK: crates.io-served manifest .crate is byte-identical to the SLSA-attested artifact ($A_SHA)." + publish-manifest-pypi: + name: publish ordvec-manifest to PyPI + needs: [guard, pypi-manifest-canonical-dist, release-manifest-assets-draft] + if: needs.guard.outputs.ok == 'true' + runs-on: ubuntu-latest + environment: + name: pypi # MANUAL GATE — Required reviewer + url: https://pypi.org/p/ordvec-manifest + permissions: + contents: read + id-token: write # Trusted Publishing (OIDC); PEP 740 attestations on by default + steps: + - name: Harden the runner + uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4 + with: + egress-policy: audit + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + persist-credentials: false + - name: Collect the canonical ordvec-manifest Python dist + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: pypi-manifest-canonical-dist + path: dist + - name: Skip ordvec-manifest PyPI upload when the immutable version already exists + if: needs.pypi-manifest-canonical-dist.outputs.source == 'pypi' + run: | + echo "::notice::PyPI already serves ordvec-manifest for this version; verifying existing canonical files instead of uploading." + - name: Publish ordvec-manifest to PyPI (Trusted Publishing; PEP 740 attestations on by default) + if: needs.pypi-manifest-canonical-dist.outputs.source == 'build' + uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0 + with: + packages-dir: dist + - name: Verify ordvec-manifest PyPI hashes match canonical dist + env: + VERSION: ${{ needs.guard.outputs.version }} + run: | + set -euo pipefail + python3 tests/release_pypi_canonical_dist.py verify \ + --project ordvec-manifest \ + --version "$VERSION" \ + --dist-dir dist + publish-pypi: name: publish to PyPI needs: [guard, pypi-canonical-dist, release-assets-draft] @@ -1153,7 +1483,7 @@ jobs: publish-github-release: name: un-draft the GitHub Release (only after all registry publishes succeed) - needs: [guard, publish-crate, publish-manifest-crate, publish-pypi] + needs: [guard, publish-crate, publish-manifest-crate, publish-pypi, publish-manifest-pypi] if: needs.guard.outputs.ok == 'true' runs-on: ubuntu-latest permissions: diff --git a/Cargo.lock b/Cargo.lock index d6940de..54d4378 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -631,6 +631,16 @@ dependencies = [ "uuid", ] +[[package]] +name = "ordvec-manifest-python" +version = "0.4.0" +dependencies = [ + "ordvec-manifest", + "pyo3", + "serde", + "serde_json", +] + [[package]] name = "ordvec-python" version = "0.4.0" diff --git a/Cargo.toml b/Cargo.toml index a2ae503..11f4f60 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ exclude = [ "ordvec-ffi/", "ordvec-go/", "ordvec-manifest/", + "ordvec-manifest-python/", "ordvec-python/", "tests/__pycache__/", "tests/release_environment_settings.sh", @@ -79,14 +80,15 @@ codegen-units = 1 opt-level = 3 # Workspace: the `ordvec-python` member holds the PyO3/maturin bindings shipped to -# PyPI as `ordvec`; `ordvec-ffi` holds the C ABI; and `ordvec-manifest` is a +# PyPI as `ordvec`; `ordvec-manifest-python` holds the PyPI bindings shipped as +# `ordvec-manifest`; `ordvec-ffi` holds the C ABI; and `ordvec-manifest` is a # lockstep manifest verifier crate. `default-members = ["."]` keeps bare # `cargo build/test/clippy` scoped to the core crate, so the existing CI gates # are unaffected; non-core members get explicit CI lanes. The single workspace # `Cargo.lock` carries their transitive dependencies. [workspace] resolver = "2" -members = ["ordvec-python", "ordvec-ffi", "ordvec-manifest"] +members = ["ordvec-python", "ordvec-ffi", "ordvec-manifest", "ordvec-manifest-python"] default-members = ["."] # fuzz/ is a cargo-fuzz crate built only via `cargo +nightly fuzz`. Keep it out of # the workspace so it stays a standalone crate (its own Cargo.lock) and `cargo fuzz` diff --git a/ordvec-manifest-python/Cargo.toml b/ordvec-manifest-python/Cargo.toml new file mode 100644 index 0000000..a65015f --- /dev/null +++ b/ordvec-manifest-python/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "ordvec-manifest-python" +version = "0.4.0" +edition = "2021" +rust-version = "1.89" +description = "Python bindings for ordvec-manifest index provenance verification" +license = "MIT OR Apache-2.0" +repository = "https://github.com/Fieldnote-Echo/ordvec" +# Ships to PyPI as `ordvec-manifest` via maturin, never to crates.io. +publish = false + +[lib] +name = "_ordvec_manifest" +crate-type = ["cdylib"] + +[dependencies] +ordvec_manifest_core = { package = "ordvec-manifest", path = "../ordvec-manifest", default-features = false } +pyo3 = { version = "0.27.0", features = ["extension-module", "abi3-py310"] } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" diff --git a/ordvec-manifest-python/README.md b/ordvec-manifest-python/README.md new file mode 100644 index 0000000..9b7b53d --- /dev/null +++ b/ordvec-manifest-python/README.md @@ -0,0 +1,47 @@ +# ordvec-manifest + +Python bindings for the `ordvec-manifest` verifier. + +Install from PyPI: + +```bash +python -m pip install ordvec-manifest +``` + +Import as `ordvec_manifest`. The package exposes the Rust manifest verifier as +dict-returning Python functions: + +```python +import ordvec_manifest + +report = ordvec_manifest.verify_manifest("index.manifest.json") +if not report["ok"]: + raise RuntimeError(report["errors"]) +``` + +Create manifests with caller-owned sidecars by passing dictionaries with +`name`, `path`, and optional `required`: + +```python +manifest = ordvec_manifest.create_manifest( + "index.tvrq", + "index.manifest.json", + "bge-small-en-v1.5", + row_id_is_identity=True, + auxiliary_artifacts=[ + {"name": "ordinaldb.ids", "path": "ids.bin"}, + {"name": "optional.stats", "path": "stats.json", "required": False}, + ], +) +plan = ordvec_manifest.verify_for_load("index.manifest.json") +``` + +For OrdinalDB v0.1, keep `row_id_identity` for the ordvec row count and declare +`ids.bin` as required auxiliary artifact name `ordinaldb.ids`. Do not encode +`ids.bin` as JSONL row identity; the v1 JSONL row-map contract is UUID-only. + +The verifier checks manifest shape, declared artifact digests and sizes, probed +ordvec index metadata, row identity, auxiliary artifact state, optional +calibration profiles, optional encoder-distortion profiles, and attestation +shape metadata. It does not sign artifacts, manage keys, call networks, mutate +index files, or decide deployment policy. diff --git a/ordvec-manifest-python/pyproject.toml b/ordvec-manifest-python/pyproject.toml new file mode 100644 index 0000000..283ec39 --- /dev/null +++ b/ordvec-manifest-python/pyproject.toml @@ -0,0 +1,50 @@ +[build-system] +requires = ["maturin>=1.13.3,<2.0"] +build-backend = "maturin" + +[project] +name = "ordvec-manifest" +version = "0.4.0" +description = "Python bindings for ordvec index manifest verification" +readme = "README.md" +requires-python = ">=3.10" +license = { text = "MIT OR Apache-2.0" } +authors = [ + { name = "Nelson Spence" }, +] +keywords = [ + "vector-search", + "manifest", + "provenance", + "verification", + "quantization", +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "License :: OSI Approved :: Apache Software License", + "Operating System :: POSIX :: Linux", + "Operating System :: MacOS", + "Operating System :: Microsoft :: Windows", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Rust", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries :: Python Modules", +] + +[project.urls] +Homepage = "https://github.com/Fieldnote-Echo/ordvec" +Repository = "https://github.com/Fieldnote-Echo/ordvec" +Issues = "https://github.com/Fieldnote-Echo/ordvec/issues" + +[tool.maturin] +manifest-path = "Cargo.toml" +features = ["pyo3/extension-module"] +python-source = "python" +module-name = "ordvec_manifest._ordvec_manifest" diff --git a/ordvec-manifest-python/python/ordvec_manifest/__init__.py b/ordvec-manifest-python/python/ordvec_manifest/__init__.py new file mode 100644 index 0000000..dca8363 --- /dev/null +++ b/ordvec-manifest-python/python/ordvec_manifest/__init__.py @@ -0,0 +1,51 @@ +"""Python bindings for the ordvec manifest verifier. + +The package wraps the Rust ``ordvec-manifest`` crate and returns plain Python +``dict`` objects for JSON-shaped verifier outputs. It is intentionally a +verification API, not a policy engine: callers still decide where to store +artifacts, how to trust keys, and when to load verified bytes. +""" + +from ._ordvec_manifest import ( + CALIBRATION_SCHEMA_VERSION, + DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES, + DEFAULT_MAX_AUXILIARY_ARTIFACTS, + DEFAULT_MAX_CACHED_REPORT_BYTES, + DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES, + DEFAULT_MAX_MANIFEST_BYTES, + DEFAULT_MAX_REPORT_ISSUES, + DEFAULT_MAX_ROW_IDENTITY_JSONL_LINE_BYTES, + DEFAULT_MAX_ROW_IDENTITY_ROWS, + DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES, + ENCODER_DISTORTION_SCHEMA_VERSION, + SCHEMA_VERSION, + create_manifest, + default_resource_limits, + inspect_manifest, + sha256_file, + verify_for_load, + verify_manifest, +) + +__all__ = [ + "SCHEMA_VERSION", + "CALIBRATION_SCHEMA_VERSION", + "ENCODER_DISTORTION_SCHEMA_VERSION", + "DEFAULT_MAX_MANIFEST_BYTES", + "DEFAULT_MAX_ROW_IDENTITY_JSONL_LINE_BYTES", + "DEFAULT_MAX_ROW_IDENTITY_ROWS", + "DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES", + "DEFAULT_MAX_AUXILIARY_ARTIFACTS", + "DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES", + "DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES", + "DEFAULT_MAX_REPORT_ISSUES", + "DEFAULT_MAX_CACHED_REPORT_BYTES", + "default_resource_limits", + "sha256_file", + "inspect_manifest", + "verify_manifest", + "verify_for_load", + "create_manifest", +] + +__version__ = "0.4.0" diff --git a/ordvec-manifest-python/src/lib.rs b/ordvec-manifest-python/src/lib.rs new file mode 100644 index 0000000..2b57399 --- /dev/null +++ b/ordvec-manifest-python/src/lib.rs @@ -0,0 +1,621 @@ +//! Python bindings for the `ordvec-manifest` verifier crate. + +use ordvec_manifest_core::{ + create_manifest_for_index_with_options, load_manifest_file_with_options, + sha256_file as hash_file, write_manifest_file, CreateAuxiliaryArtifact, CreateManifestOptions, + CreateRowIdentity, ManifestError, ResourceLimits, VerifiedLoadPlanError, VerifyOptions, + CALIBRATION_SCHEMA_VERSION, DEFAULT_MAX_AUXILIARY_ARTIFACTS, + DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES, DEFAULT_MAX_CACHED_REPORT_BYTES, + DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES, DEFAULT_MAX_MANIFEST_BYTES, + DEFAULT_MAX_REPORT_ISSUES, DEFAULT_MAX_ROW_IDENTITY_JSONL_LINE_BYTES, + DEFAULT_MAX_ROW_IDENTITY_ROWS, DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES, + ENCODER_DISTORTION_SCHEMA_VERSION, SCHEMA_VERSION, +}; +use pyo3::exceptions::PyKeyError; +use pyo3::prelude::*; +use pyo3::types::PyModule; +use pyo3::wrap_pyfunction; +use serde::Serialize; +use std::path::{Path, PathBuf}; + +fn manifest_error(err: ManifestError) -> PyErr { + match err { + ManifestError::Io(err) => pyo3::exceptions::PyOSError::new_err(err.to_string()), + ManifestError::Json(err) => pyo3::exceptions::PyValueError::new_err(err.to_string()), + ManifestError::Invalid(message) => pyo3::exceptions::PyValueError::new_err(message), + ManifestError::LimitExceeded { code, message } => { + pyo3::exceptions::PyValueError::new_err(format!("{code}: {message}")) + } + } +} + +fn value_error(err: impl std::fmt::Display) -> PyErr { + pyo3::exceptions::PyValueError::new_err(err.to_string()) +} + +fn verified_load_plan_error(err: VerifiedLoadPlanError) -> PyErr { + match err { + VerifiedLoadPlanError::Manifest(err) => manifest_error(err), + VerifiedLoadPlanError::VerificationFailed(_) + | VerifiedLoadPlanError::IncompletePlan { .. } => value_error(err), + } +} + +fn json_to_py(py: Python<'_>, value: &T) -> PyResult> { + let text = serde_json::to_string(value).map_err(value_error)?; + let json = PyModule::import(py, pyo3::intern!(py, "json"))?; + let loads = json.getattr(pyo3::intern!(py, "loads"))?; + Ok(loads.call1((text,))?.unbind()) +} + +fn path_to_string(path: &Path) -> String { + strip_windows_verbatim_prefix(path.to_string_lossy().into_owned()) +} + +fn strip_windows_verbatim_prefix(path: String) -> String { + if let Some(rest) = path.strip_prefix("\\\\?\\UNC\\") { + format!("\\\\{rest}") + } else if let Some(rest) = path.strip_prefix("\\\\?\\") { + rest.to_string() + } else { + path + } +} + +#[cfg(test)] +mod tests { + use super::strip_windows_verbatim_prefix; + + #[test] + fn strips_windows_verbatim_drive_prefix() { + assert_eq!( + strip_windows_verbatim_prefix(r"\\?\C:\tmp\ids.bin".to_string()), + r"C:\tmp\ids.bin" + ); + } + + #[test] + fn strips_windows_verbatim_unc_prefix() { + assert_eq!( + strip_windows_verbatim_prefix(r"\\?\UNC\server\share\ids.bin".to_string()), + r"\\server\share\ids.bin" + ); + } +} + +#[allow(clippy::too_many_arguments)] +fn resource_limits( + max_manifest_bytes: Option, + max_row_map_line_bytes: Option, + max_row_map_rows: Option, + max_row_map_tracked_id_bytes: Option, + max_auxiliary_artifacts: Option, + max_auxiliary_artifact_bytes: Option, + max_encoder_distortion_profile_bytes: Option, + max_report_issues: Option, + max_cached_report_bytes: Option, +) -> ResourceLimits { + let mut limits = ResourceLimits::default(); + if let Some(value) = max_manifest_bytes { + limits.max_manifest_bytes = value; + } + if let Some(value) = max_row_map_line_bytes { + limits.max_row_identity_jsonl_line_bytes = value; + } + if let Some(value) = max_row_map_rows { + limits.max_row_identity_rows = value; + } + if let Some(value) = max_row_map_tracked_id_bytes { + limits.max_row_identity_tracked_db_id_bytes = value; + } + if let Some(value) = max_auxiliary_artifacts { + limits.max_auxiliary_artifacts = value; + } + if let Some(value) = max_auxiliary_artifact_bytes { + limits.max_auxiliary_artifact_bytes = value; + } + if let Some(value) = max_encoder_distortion_profile_bytes { + limits.max_encoder_distortion_profile_bytes = value; + } + if let Some(value) = max_report_issues { + limits.max_report_issues = value; + } + if let Some(value) = max_cached_report_bytes { + limits.max_cached_report_bytes = value; + } + limits +} + +#[derive(Serialize)] +struct PythonResourceLimits { + max_manifest_bytes: u64, + max_row_map_line_bytes: usize, + max_row_map_rows: usize, + max_row_map_tracked_id_bytes: usize, + max_auxiliary_artifacts: usize, + max_auxiliary_artifact_bytes: u64, + max_encoder_distortion_profile_bytes: u64, + max_report_issues: usize, + max_cached_report_bytes: u64, +} + +impl From for PythonResourceLimits { + fn from(limits: ResourceLimits) -> Self { + Self { + max_manifest_bytes: limits.max_manifest_bytes, + max_row_map_line_bytes: limits.max_row_identity_jsonl_line_bytes, + max_row_map_rows: limits.max_row_identity_rows, + max_row_map_tracked_id_bytes: limits.max_row_identity_tracked_db_id_bytes, + max_auxiliary_artifacts: limits.max_auxiliary_artifacts, + max_auxiliary_artifact_bytes: limits.max_auxiliary_artifact_bytes, + max_encoder_distortion_profile_bytes: limits.max_encoder_distortion_profile_bytes, + max_report_issues: limits.max_report_issues, + max_cached_report_bytes: limits.max_cached_report_bytes, + } + } +} + +#[allow(clippy::too_many_arguments)] +fn verify_options( + index: Option, + allow_absolute_paths: bool, + allow_path_escape: bool, + allow_duplicate_db_ids: bool, + max_manifest_bytes: Option, + max_row_map_line_bytes: Option, + max_row_map_rows: Option, + max_row_map_tracked_id_bytes: Option, + max_auxiliary_artifacts: Option, + max_auxiliary_artifact_bytes: Option, + max_encoder_distortion_profile_bytes: Option, + max_report_issues: Option, + max_cached_report_bytes: Option, +) -> VerifyOptions { + VerifyOptions { + allow_absolute_paths, + allow_path_escape, + allow_duplicate_db_ids, + index_override: index, + limits: resource_limits( + max_manifest_bytes, + max_row_map_line_bytes, + max_row_map_rows, + max_row_map_tracked_id_bytes, + max_auxiliary_artifacts, + max_auxiliary_artifact_bytes, + max_encoder_distortion_profile_bytes, + max_report_issues, + max_cached_report_bytes, + ), + } +} + +#[allow(clippy::too_many_arguments)] +fn create_options( + allow_absolute_paths: bool, + allow_path_escape: bool, + max_manifest_bytes: Option, + max_row_map_line_bytes: Option, + max_row_map_rows: Option, + max_row_map_tracked_id_bytes: Option, + max_auxiliary_artifacts: Option, + max_auxiliary_artifact_bytes: Option, + max_encoder_distortion_profile_bytes: Option, + max_report_issues: Option, + max_cached_report_bytes: Option, + auxiliary_artifacts: Vec, +) -> CreateManifestOptions { + CreateManifestOptions { + allow_absolute_paths, + allow_path_escape, + limits: resource_limits( + max_manifest_bytes, + max_row_map_line_bytes, + max_row_map_rows, + max_row_map_tracked_id_bytes, + max_auxiliary_artifacts, + max_auxiliary_artifact_bytes, + max_encoder_distortion_profile_bytes, + max_report_issues, + max_cached_report_bytes, + ), + auxiliary_artifacts, + } +} + +fn parse_auxiliary_artifacts( + py: Python<'_>, + auxiliary_artifacts: Option>, +) -> PyResult> { + let Some(auxiliary_artifacts) = auxiliary_artifacts else { + return Ok(Vec::new()); + }; + let auxiliary_artifacts = auxiliary_artifacts.bind(py); + let mut parsed = Vec::new(); + for item in auxiliary_artifacts.try_iter()? { + let item = item?; + let name = item.get_item("name")?.extract::()?; + let path = item.get_item("path")?.extract::()?; + let required = match item.get_item("required") { + Ok(value) => value.extract::()?, + Err(err) if err.is_instance_of::(py) => true, + Err(err) => return Err(err), + }; + parsed.push(CreateAuxiliaryArtifact { + name, + path, + required, + }); + } + Ok(parsed) +} + +#[pyfunction] +fn default_resource_limits(py: Python<'_>) -> PyResult> { + json_to_py(py, &PythonResourceLimits::from(ResourceLimits::default())) +} + +#[pyfunction] +fn sha256_file(py: Python<'_>, path: PathBuf) -> PyResult> { + let hash = py + .detach(|| hash_file(path)) + .map_err(|err| pyo3::exceptions::PyOSError::new_err(err.to_string()))?; + json_to_py(py, &hash) +} + +#[pyfunction] +#[pyo3(signature = ( + manifest, + *, + max_manifest_bytes = None, + max_row_map_line_bytes = None, + max_row_map_rows = None, + max_row_map_tracked_id_bytes = None, + max_auxiliary_artifacts = None, + max_auxiliary_artifact_bytes = None, + max_encoder_distortion_profile_bytes = None, + max_report_issues = None, + max_cached_report_bytes = None +))] +#[allow(clippy::too_many_arguments)] +fn inspect_manifest( + py: Python<'_>, + manifest: PathBuf, + max_manifest_bytes: Option, + max_row_map_line_bytes: Option, + max_row_map_rows: Option, + max_row_map_tracked_id_bytes: Option, + max_auxiliary_artifacts: Option, + max_auxiliary_artifact_bytes: Option, + max_encoder_distortion_profile_bytes: Option, + max_report_issues: Option, + max_cached_report_bytes: Option, +) -> PyResult> { + let options = verify_options( + None, + false, + false, + false, + max_manifest_bytes, + max_row_map_line_bytes, + max_row_map_rows, + max_row_map_tracked_id_bytes, + max_auxiliary_artifacts, + max_auxiliary_artifact_bytes, + max_encoder_distortion_profile_bytes, + max_report_issues, + max_cached_report_bytes, + ); + let document = py + .detach(|| load_manifest_file_with_options(manifest, &options)) + .map_err(manifest_error)?; + json_to_py(py, &document.manifest) +} + +#[pyfunction] +#[pyo3(signature = ( + manifest, + *, + index = None, + allow_absolute_paths = false, + allow_path_escape = false, + allow_duplicate_db_ids = false, + max_manifest_bytes = None, + max_row_map_line_bytes = None, + max_row_map_rows = None, + max_row_map_tracked_id_bytes = None, + max_auxiliary_artifacts = None, + max_auxiliary_artifact_bytes = None, + max_encoder_distortion_profile_bytes = None, + max_report_issues = None, + max_cached_report_bytes = None +))] +#[allow(clippy::too_many_arguments)] +fn verify_manifest( + py: Python<'_>, + manifest: PathBuf, + index: Option, + allow_absolute_paths: bool, + allow_path_escape: bool, + allow_duplicate_db_ids: bool, + max_manifest_bytes: Option, + max_row_map_line_bytes: Option, + max_row_map_rows: Option, + max_row_map_tracked_id_bytes: Option, + max_auxiliary_artifacts: Option, + max_auxiliary_artifact_bytes: Option, + max_encoder_distortion_profile_bytes: Option, + max_report_issues: Option, + max_cached_report_bytes: Option, +) -> PyResult> { + let options = verify_options( + index, + allow_absolute_paths, + allow_path_escape, + allow_duplicate_db_ids, + max_manifest_bytes, + max_row_map_line_bytes, + max_row_map_rows, + max_row_map_tracked_id_bytes, + max_auxiliary_artifacts, + max_auxiliary_artifact_bytes, + max_encoder_distortion_profile_bytes, + max_report_issues, + max_cached_report_bytes, + ); + let report = py + .detach(|| { + let document = load_manifest_file_with_options(manifest, &options)?; + Ok::<_, ManifestError>(ordvec_manifest_core::verify_manifest(&document, options)) + }) + .map_err(manifest_error)?; + json_to_py(py, &report) +} + +#[derive(Serialize)] +struct PythonVerifiedLoadPlan { + manifest_path: Option, + artifact_path: String, + metadata: ordvec_manifest_core::MetadataReport, + row_identity: PythonVerifiedRowIdentityPlan, + auxiliary_artifacts: Vec, + report: ordvec_manifest_core::VerificationReport, +} + +#[derive(Serialize)] +struct PythonVerifiedRowIdentityPlan { + kind: String, + path: Option, + row_count: usize, + validated_rows: Option, + sha256: Option, +} + +#[derive(Serialize)] +struct PythonVerifiedAuxiliaryArtifactPlan { + name: String, + path: Option, + required: bool, + state: ordvec_manifest_core::AuxiliaryArtifactState, + reason_code: Option, + sha256: Option, + size_bytes: Option, +} + +#[pyfunction] +#[pyo3(signature = ( + manifest, + *, + index = None, + allow_absolute_paths = false, + allow_path_escape = false, + allow_duplicate_db_ids = false, + max_manifest_bytes = None, + max_row_map_line_bytes = None, + max_row_map_rows = None, + max_row_map_tracked_id_bytes = None, + max_auxiliary_artifacts = None, + max_auxiliary_artifact_bytes = None, + max_encoder_distortion_profile_bytes = None, + max_report_issues = None, + max_cached_report_bytes = None +))] +#[allow(clippy::too_many_arguments)] +fn verify_for_load( + py: Python<'_>, + manifest: PathBuf, + index: Option, + allow_absolute_paths: bool, + allow_path_escape: bool, + allow_duplicate_db_ids: bool, + max_manifest_bytes: Option, + max_row_map_line_bytes: Option, + max_row_map_rows: Option, + max_row_map_tracked_id_bytes: Option, + max_auxiliary_artifacts: Option, + max_auxiliary_artifact_bytes: Option, + max_encoder_distortion_profile_bytes: Option, + max_report_issues: Option, + max_cached_report_bytes: Option, +) -> PyResult> { + let options = verify_options( + index, + allow_absolute_paths, + allow_path_escape, + allow_duplicate_db_ids, + max_manifest_bytes, + max_row_map_line_bytes, + max_row_map_rows, + max_row_map_tracked_id_bytes, + max_auxiliary_artifacts, + max_auxiliary_artifact_bytes, + max_encoder_distortion_profile_bytes, + max_report_issues, + max_cached_report_bytes, + ); + let plan = py + .detach(|| ordvec_manifest_core::verify_for_load(manifest, options)) + .map_err(verified_load_plan_error)?; + let row_identity = plan.row_identity(); + let value = PythonVerifiedLoadPlan { + manifest_path: plan.manifest_path().map(path_to_string), + artifact_path: path_to_string(plan.artifact_path()), + metadata: plan.metadata().clone(), + row_identity: PythonVerifiedRowIdentityPlan { + kind: row_identity.kind().to_string(), + path: row_identity.path().map(path_to_string), + row_count: row_identity.row_count(), + validated_rows: row_identity.validated_rows(), + sha256: row_identity.sha256().map(str::to_string), + }, + auxiliary_artifacts: plan + .auxiliary_artifacts() + .iter() + .map(|artifact| PythonVerifiedAuxiliaryArtifactPlan { + name: artifact.name().to_string(), + path: artifact.path().map(path_to_string), + required: artifact.required(), + state: artifact.state(), + reason_code: artifact.reason_code().map(str::to_string), + sha256: artifact.sha256().map(str::to_string), + size_bytes: artifact.size_bytes(), + }) + .collect(), + report: plan.report().clone(), + }; + json_to_py(py, &value) +} + +#[pyfunction] +#[pyo3(signature = ( + index, + out, + embedding_model, + *, + row_map = None, + row_id_is_identity = false, + auxiliary_artifacts = None, + allow_absolute_paths = false, + allow_path_escape = false, + max_manifest_bytes = None, + max_row_map_line_bytes = None, + max_row_map_rows = None, + max_row_map_tracked_id_bytes = None, + max_auxiliary_artifacts = None, + max_auxiliary_artifact_bytes = None, + max_encoder_distortion_profile_bytes = None, + max_report_issues = None, + max_cached_report_bytes = None +))] +#[allow(clippy::too_many_arguments)] +fn create_manifest( + py: Python<'_>, + index: PathBuf, + out: PathBuf, + embedding_model: String, + row_map: Option, + row_id_is_identity: bool, + auxiliary_artifacts: Option>, + allow_absolute_paths: bool, + allow_path_escape: bool, + max_manifest_bytes: Option, + max_row_map_line_bytes: Option, + max_row_map_rows: Option, + max_row_map_tracked_id_bytes: Option, + max_auxiliary_artifacts: Option, + max_auxiliary_artifact_bytes: Option, + max_encoder_distortion_profile_bytes: Option, + max_report_issues: Option, + max_cached_report_bytes: Option, +) -> PyResult> { + let row_identity = match (row_map, row_id_is_identity) { + (Some(_), true) => { + return Err(pyo3::exceptions::PyValueError::new_err( + "use either row_map or row_id_is_identity, not both", + )); + } + (Some(path), false) => CreateRowIdentity::Jsonl(path), + (None, true) => CreateRowIdentity::RowIdIdentity, + (None, false) => { + return Err(pyo3::exceptions::PyValueError::new_err( + "one of row_map or row_id_is_identity=True is required", + )); + } + }; + let auxiliary_artifacts = parse_auxiliary_artifacts(py, auxiliary_artifacts)?; + let options = create_options( + allow_absolute_paths, + allow_path_escape, + max_manifest_bytes, + max_row_map_line_bytes, + max_row_map_rows, + max_row_map_tracked_id_bytes, + max_auxiliary_artifacts, + max_auxiliary_artifact_bytes, + max_encoder_distortion_profile_bytes, + max_report_issues, + max_cached_report_bytes, + auxiliary_artifacts, + ); + let manifest = py + .detach(|| { + let manifest = create_manifest_for_index_with_options( + index, + row_identity, + embedding_model, + &out, + options, + )?; + write_manifest_file(&manifest, out)?; + Ok::<_, ManifestError>(manifest) + }) + .map_err(manifest_error)?; + json_to_py(py, &manifest) +} + +#[pymodule] +fn _ordvec_manifest(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add("SCHEMA_VERSION", SCHEMA_VERSION)?; + m.add("CALIBRATION_SCHEMA_VERSION", CALIBRATION_SCHEMA_VERSION)?; + m.add( + "ENCODER_DISTORTION_SCHEMA_VERSION", + ENCODER_DISTORTION_SCHEMA_VERSION, + )?; + m.add("DEFAULT_MAX_MANIFEST_BYTES", DEFAULT_MAX_MANIFEST_BYTES)?; + m.add( + "DEFAULT_MAX_ROW_IDENTITY_JSONL_LINE_BYTES", + DEFAULT_MAX_ROW_IDENTITY_JSONL_LINE_BYTES, + )?; + m.add( + "DEFAULT_MAX_ROW_IDENTITY_ROWS", + DEFAULT_MAX_ROW_IDENTITY_ROWS, + )?; + m.add( + "DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES", + DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES, + )?; + m.add( + "DEFAULT_MAX_AUXILIARY_ARTIFACTS", + DEFAULT_MAX_AUXILIARY_ARTIFACTS, + )?; + m.add( + "DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES", + DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES, + )?; + m.add( + "DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES", + DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES, + )?; + m.add("DEFAULT_MAX_REPORT_ISSUES", DEFAULT_MAX_REPORT_ISSUES)?; + m.add( + "DEFAULT_MAX_CACHED_REPORT_BYTES", + DEFAULT_MAX_CACHED_REPORT_BYTES, + )?; + m.add_function(wrap_pyfunction!(default_resource_limits, m)?)?; + m.add_function(wrap_pyfunction!(sha256_file, m)?)?; + m.add_function(wrap_pyfunction!(inspect_manifest, m)?)?; + m.add_function(wrap_pyfunction!(verify_manifest, m)?)?; + m.add_function(wrap_pyfunction!(verify_for_load, m)?)?; + m.add_function(wrap_pyfunction!(create_manifest, m)?)?; + Ok(()) +} diff --git a/ordvec-manifest-python/tests/test_manifest_bindings.py b/ordvec-manifest-python/tests/test_manifest_bindings.py new file mode 100644 index 0000000..9ecefd9 --- /dev/null +++ b/ordvec-manifest-python/tests/test_manifest_bindings.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +import hashlib +import json +from pathlib import Path + +import pytest + +import ordvec_manifest + + +def write_rankquant_index(path: Path, *, dim: int = 16, rows: int = 2, bits: int = 2): + bytes_per_vec = dim * bits // 8 + path.write_bytes( + b"TVRQ" + + bytes([1, bits]) + + dim.to_bytes(4, "little") + + rows.to_bytes(4, "little") + + (b"\x00" * (rows * bytes_per_vec)) + ) + + +def write_unloadable_manifest(tmp_path): + artifact = tmp_path / "index.tvrq" + artifact.write_bytes(b"not an ordvec index") + digest = hashlib.sha256(artifact.read_bytes()).hexdigest() + manifest = { + "schema_version": ordvec_manifest.SCHEMA_VERSION, + "manifest_id": "urn:uuid:7c66ad6e-bdde-49a8-b420-f1136d04f5bd", + "created_at": "2026-06-09T00:00:00Z", + "artifact": { + "path": artifact.name, + "sha256": digest, + "kind": "rank_quant", + "format_version": 1, + "dim": 16, + "vector_count": 1, + "bytes_per_vec": 4, + "params": {"kind": "rank_quant", "bits": 2}, + "file_size_bytes": artifact.stat().st_size, + }, + "embedding": {"model": "test-embedding", "dim": 16}, + "row_identity": {"kind": "row_id_identity", "row_count": 1}, + } + manifest_path = tmp_path / "manifest.json" + manifest_path.write_text(json.dumps(manifest), encoding="utf-8") + return artifact, manifest_path + + +def test_hash_and_limits(tmp_path): + path = tmp_path / "artifact.bin" + path.write_bytes(b"manifest bindings") + + result = ordvec_manifest.sha256_file(path) + + assert result == { + "sha256": hashlib.sha256(b"manifest bindings").hexdigest(), + "size_bytes": len(b"manifest bindings"), + } + limits = ordvec_manifest.default_resource_limits() + assert limits["max_manifest_bytes"] == ordvec_manifest.DEFAULT_MAX_MANIFEST_BYTES + assert "max_row_map_line_bytes" in limits + assert "max_row_identity_jsonl_line_bytes" not in limits + + _, manifest_path = write_unloadable_manifest(tmp_path) + report = ordvec_manifest.verify_manifest(manifest_path, **limits) + assert report["ok"] is False + + +def test_inspect_and_verify_return_dicts(tmp_path): + _, manifest_path = write_unloadable_manifest(tmp_path) + + manifest = ordvec_manifest.inspect_manifest(manifest_path) + report = ordvec_manifest.verify_manifest(manifest_path) + + assert manifest["schema_version"] == ordvec_manifest.SCHEMA_VERSION + assert report["ok"] is False + assert report["artifact"]["sha256"] == manifest["artifact"]["sha256"] + assert any(error["code"] == "artifact_probe_failed" for error in report["errors"]) + + +def test_verify_for_load_raises_when_report_is_not_loadable(tmp_path): + _, manifest_path = write_unloadable_manifest(tmp_path) + + with pytest.raises(ValueError, match="manifest verification failed"): + ordvec_manifest.verify_for_load(manifest_path) + + +def test_verify_for_load_preserves_manifest_io_errors(tmp_path): + with pytest.raises(OSError): + ordvec_manifest.verify_for_load(tmp_path / "missing.json") + + +def test_create_manifest_requires_explicit_row_identity(tmp_path): + index = tmp_path / "index.tvrq" + index.write_bytes(b"not an ordvec index") + + with pytest.raises(ValueError, match="row_map or row_id_is_identity"): + ordvec_manifest.create_manifest(index, tmp_path / "manifest.json", "model") + + +def test_create_manifest_accepts_auxiliary_artifacts(tmp_path): + index = tmp_path / "index.tvrq" + ids = tmp_path / "ids.bin" + optional = tmp_path / "optional.json" + manifest_path = tmp_path / "manifest.json" + write_rankquant_index(index) + ids.write_bytes((7).to_bytes(8, "little") + (9).to_bytes(8, "little")) + optional.write_text('{"optional": true}', encoding="utf-8") + + manifest = ordvec_manifest.create_manifest( + index, + manifest_path, + "model", + row_id_is_identity=True, + auxiliary_artifacts=[ + {"name": "ordinaldb.ids", "path": ids}, + {"name": "optional.stats", "path": optional, "required": False}, + ], + ) + + assert manifest["row_identity"] == {"kind": "row_id_identity", "row_count": 2} + assert manifest["auxiliary_artifacts"][0]["name"] == "ordinaldb.ids" + assert manifest["auxiliary_artifacts"][0]["path"] == "ids.bin" + assert manifest["auxiliary_artifacts"][0].get("required", True) is True + assert manifest["auxiliary_artifacts"][1]["name"] == "optional.stats" + assert manifest["auxiliary_artifacts"][1]["required"] is False + + optional.unlink() + plan = ordvec_manifest.verify_for_load(manifest_path) + auxiliary = {artifact["name"]: artifact for artifact in plan["auxiliary_artifacts"]} + assert auxiliary["ordinaldb.ids"]["state"] == "verified" + assert Path(auxiliary["ordinaldb.ids"]["path"]) == ids.resolve() + assert auxiliary["optional.stats"]["state"] == "optional_absent" diff --git a/tests/release_publish_invariants.py b/tests/release_publish_invariants.py index 6aacd63..e34c3a6 100644 --- a/tests/release_publish_invariants.py +++ b/tests/release_publish_invariants.py @@ -369,6 +369,15 @@ def check_release_version_sync() -> None: "ordvec-python/python/ordvec/__init__.py __version__": python_init_version( "ordvec-python/python/ordvec/__init__.py" ), + "ordvec-manifest-python/Cargo.toml package.version": package_version( + "ordvec-manifest-python/Cargo.toml" + ), + "ordvec-manifest-python/pyproject.toml project.version": project_version( + "ordvec-manifest-python/pyproject.toml" + ), + "ordvec-manifest-python/python/ordvec_manifest/__init__.py __version__": python_init_version( + "ordvec-manifest-python/python/ordvec_manifest/__init__.py" + ), "ordvec-manifest/Cargo.toml package.version": package_version("ordvec-manifest/Cargo.toml"), "ordvec-ffi/Cargo.toml package.version": package_version("ordvec-ffi/Cargo.toml"), } @@ -415,6 +424,7 @@ def check_release_compatibility_sync() -> None: for path in ( "ordvec-manifest/Cargo.toml", "ordvec-python/Cargo.toml", + "ordvec-manifest-python/Cargo.toml", "ordvec-ffi/Cargo.toml", ): rust_version = package_rust_version(path) @@ -450,6 +460,7 @@ def check_publication_model() -> None: "Cargo.toml": True, "ordvec-manifest/Cargo.toml": True, "ordvec-python/Cargo.toml": False, + "ordvec-manifest-python/Cargo.toml": False, "ordvec-ffi/Cargo.toml": False, "fuzz/Cargo.toml": False, } @@ -484,6 +495,44 @@ def check_python_package_metadata() -> None: if feature not in pyo3_features: fail(f"ordvec-python/Cargo.toml: pyo3 features must include {feature}") + manifest_pyproject = load_toml("ordvec-manifest-python/pyproject.toml") + manifest_project = mapping( + manifest_pyproject.get("project"), + "ordvec-manifest-python/pyproject.toml: project", + ) + if manifest_project.get("name") != "ordvec-manifest": + fail("ordvec-manifest-python/pyproject.toml: project.name must be 'ordvec-manifest'") + if manifest_project.get("requires-python") != ">=3.10": + fail("ordvec-manifest-python/pyproject.toml: project.requires-python must be >=3.10") + + manifest_cargo = load_toml("ordvec-manifest-python/Cargo.toml") + manifest_dependencies = mapping( + manifest_cargo.get("dependencies"), + "ordvec-manifest-python/Cargo.toml: dependencies", + ) + manifest_pyo3 = mapping( + manifest_dependencies.get("pyo3"), + "ordvec-manifest-python/Cargo.toml: dependencies.pyo3", + ) + manifest_pyo3_features = sequence( + manifest_pyo3.get("features"), + "ordvec-manifest-python/Cargo.toml: dependencies.pyo3.features", + ) + for feature in ("extension-module", "abi3-py310"): + if feature not in manifest_pyo3_features: + fail(f"ordvec-manifest-python/Cargo.toml: pyo3 features must include {feature}") + + manifest_tool = mapping( + manifest_pyproject.get("tool"), + "ordvec-manifest-python/pyproject.toml: tool", + ) + manifest_maturin = mapping( + manifest_tool.get("maturin"), + "ordvec-manifest-python/pyproject.toml: tool.maturin", + ) + if manifest_maturin.get("module-name") != "ordvec_manifest._ordvec_manifest": + fail("ordvec-manifest-python/pyproject.toml: tool.maturin.module-name must target ordvec_manifest._ordvec_manifest") + readme = read_text("README.md") py_readme = read_text("ordvec-python/README.md") for path, text in (("README.md", readme), ("ordvec-python/README.md", py_readme)): @@ -618,6 +667,7 @@ def check_package_contents() -> None: "ordvec-ffi/", "ordvec-go/", "ordvec-manifest/", + "ordvec-manifest-python/", "ordvec-python/", "target/", "tests/release_", @@ -654,6 +704,7 @@ def check_package_contents() -> None: "ordvec-ffi/", "ordvec-go/", "ordvec-manifest/", + "ordvec-manifest-python/", "ordvec-python/", "target/", "tests/release_", @@ -882,6 +933,7 @@ def check_release_security_gates(workflow: dict[str, Any], path: str) -> None: "manifest-provenance", "publish-manifest-crate", "publish-pypi", + "publish-manifest-pypi", } for job_name, raw_job in jobs.items(): if not isinstance(job_name, str): @@ -903,6 +955,7 @@ def check_release_security_gates(workflow: dict[str, Any], path: str) -> None: ("publish-crate", "crates-io"), ("publish-manifest-crate", "crates-io"), ("publish-pypi", "pypi"), + ("publish-manifest-pypi", "pypi"), ): job = mapping(jobs.get(job_name), f"{path}: jobs.{job_name}") raw_environment = job.get("environment") @@ -937,18 +990,29 @@ def check_aarch64_smoke_selector(workflow: dict[str, Any], path: str) -> None: fail(f"{path}: linux/aarch64 wheel selector must match architecture and assert exactly one wheel") -def check_pypi_canonical_dist(workflow: dict[str, Any], path: str) -> None: +def check_pypi_canonical_dist( + workflow: dict[str, Any], + path: str, + *, + job_name: str = "pypi-canonical-dist", + wheel_build_job: str = "build-wheels", + sdist_build_job: str = "build-sdist", + wheel_artifact_pattern: str = "wheels-*", + sdist_artifact_name: str = "sdist", + canonical_artifact_name: str = "pypi-canonical-dist", + project: str | None = None, +) -> None: jobs = mapping(workflow.get("jobs"), f"{path}: jobs") - job = mapping(jobs.get("pypi-canonical-dist"), f"{path}: jobs.pypi-canonical-dist") - steps = sequence(job.get("steps"), f"{path}: jobs.pypi-canonical-dist.steps") + job = mapping(jobs.get(job_name), f"{path}: jobs.{job_name}") + steps = sequence(job.get("steps"), f"{path}: jobs.{job_name}.steps") - for needed in ("build-wheels", "build-sdist"): + for needed in (wheel_build_job, sdist_build_job): if not has_need(job, needed): - fail(f"{path}: pypi-canonical-dist must need {needed}") + fail(f"{path}: {job_name} must need {needed}") - outputs = mapping(job.get("outputs"), f"{path}: jobs.pypi-canonical-dist.outputs") + outputs = mapping(job.get("outputs"), f"{path}: jobs.{job_name}.outputs") if outputs.get("source") != "${{ steps.canonicalize.outputs.source }}": - fail(f"{path}: pypi-canonical-dist must expose the canonical source output") + fail(f"{path}: {job_name} must expose the canonical source output") wheels_downloads: list[int] = [] sdist_downloads: list[int] = [] @@ -956,38 +1020,40 @@ def check_pypi_canonical_dist(workflow: dict[str, Any], path: str) -> None: uploads: list[tuple[int, dict[str, Any], dict[str, Any]]] = [] for index, raw_step in enumerate(steps): - step = mapping(raw_step, f"{path}: jobs.pypi-canonical-dist.steps[{index}]") + step = mapping(raw_step, f"{path}: jobs.{job_name}.steps[{index}]") action = action_name(step) if action == "actions/download-artifact": with_map = mapping(step.get("with", {}), f"{path}: {step_label(index, step)} with") artifact_path = norm_path(with_map.get("path")) - if with_map.get("pattern") == "wheels-*" and boolish_true(with_map.get("merge-multiple")): + if with_map.get("pattern") == wheel_artifact_pattern and boolish_true(with_map.get("merge-multiple")): if artifact_path != "built-dist": - fail(f"{path}: canonical wheel download must target built-dist") + fail(f"{path}: {job_name} canonical wheel download must target built-dist") wheels_downloads.append(index) - elif with_map.get("name") == "sdist": + elif with_map.get("name") == sdist_artifact_name: if artifact_path != "built-dist": - fail(f"{path}: canonical sdist download must target built-dist") + fail(f"{path}: {job_name} canonical sdist download must target built-dist") sdist_downloads.append(index) elif action == "actions/upload-artifact": with_map = mapping(step.get("with", {}), f"{path}: {step_label(index, step)} with") - if with_map.get("name") == "pypi-canonical-dist": + if with_map.get("name") == canonical_artifact_name: uploads.append((index, step, with_map)) run = step.get("run") if contains_text(run, "tests/release_pypi_canonical_dist.py canonicalize"): canonicalize_steps.append(step) if "--built-dir built-dist" not in run or "--out-dir canonical-dist" not in run: - fail(f"{path}: canonicalize step must read built-dist and write canonical-dist") + fail(f"{path}: {job_name} canonicalize step must read built-dist and write canonical-dist") + if project is not None and f"--project {project}" not in run: + fail(f"{path}: {job_name} canonicalize step must pass --project {project}") if len(wheels_downloads) != 1: - fail(f"{path}: pypi-canonical-dist must download exactly one wheels-* artifact set") + fail(f"{path}: {job_name} must download exactly one {wheel_artifact_pattern} artifact set") if len(sdist_downloads) != 1: - fail(f"{path}: pypi-canonical-dist must download exactly one sdist artifact") + fail(f"{path}: {job_name} must download exactly one {sdist_artifact_name} artifact") if len(canonicalize_steps) != 1: - fail(f"{path}: pypi-canonical-dist must run release_pypi_canonical_dist.py canonicalize") + fail(f"{path}: {job_name} must run release_pypi_canonical_dist.py canonicalize") if len(uploads) != 1: - fail(f"{path}: pypi-canonical-dist must upload exactly one pypi-canonical-dist artifact") + fail(f"{path}: {job_name} must upload exactly one {canonical_artifact_name} artifact") _, _, upload_with = uploads[0] upload_path = upload_with.get("path") @@ -995,30 +1061,38 @@ def check_pypi_canonical_dist(workflow: dict[str, Any], path: str) -> None: contains_text(upload_path, "canonical-dist/*.whl") and contains_text(upload_path, "canonical-dist/*.tar.gz") ): - fail(f"{path}: pypi-canonical-dist upload must include canonical wheels and sdist") + fail(f"{path}: {job_name} upload must include canonical wheels and sdist") -def check_publish_pypi(workflow: dict[str, Any], path: str) -> None: +def check_publish_pypi( + workflow: dict[str, Any], + path: str, + *, + job_name: str = "publish-pypi", + canonical_job: str = "pypi-canonical-dist", + canonical_artifact_name: str = "pypi-canonical-dist", + project: str | None = None, +) -> None: jobs = mapping(workflow.get("jobs"), f"{path}: jobs") - job = mapping(jobs.get("publish-pypi"), f"{path}: jobs.publish-pypi") - steps = sequence(job.get("steps"), f"{path}: jobs.publish-pypi.steps") + job = mapping(jobs.get(job_name), f"{path}: jobs.{job_name}") + steps = sequence(job.get("steps"), f"{path}: jobs.{job_name}.steps") - if not has_need(job, "pypi-canonical-dist"): - fail(f"{path}: publish-pypi must need pypi-canonical-dist") + if not has_need(job, canonical_job): + fail(f"{path}: {job_name} must need {canonical_job}") publish_steps: list[tuple[int, dict[str, Any]]] = [] canonical_downloads: list[tuple[int, dict[str, Any], dict[str, Any]]] = [] verify_steps: list[dict[str, Any]] = [] for index, raw_step in enumerate(steps): - step = mapping(raw_step, f"{path}: jobs.publish-pypi.steps[{index}]") + step = mapping(raw_step, f"{path}: jobs.{job_name}.steps[{index}]") action = action_name(step) if action == "pypa/gh-action-pypi-publish": publish_steps.append((index, step)) if action == "actions/download-artifact": with_block = step.get("with", {}) with_map = mapping(with_block, f"{path}: {step_label(index, step)} with") - if with_map.get("name") == "pypi-canonical-dist": + if with_map.get("name") == canonical_artifact_name: canonical_downloads.append((index, step, with_map)) elif norm_path(with_map.get("path")) == "dist": fail(f"{path}: {step_label(index, step)} downloads a non-canonical artifact into dist") @@ -1027,30 +1101,32 @@ def check_publish_pypi(workflow: dict[str, Any], path: str) -> None: if contains_text(run, "tests/release_pypi_canonical_dist.py verify"): verify_steps.append(step) if "--dist-dir dist" not in run: - fail(f"{path}: PyPI verify step must verify dist") + fail(f"{path}: {job_name} PyPI verify step must verify dist") + if project is not None and f"--project {project}" not in run: + fail(f"{path}: {job_name} PyPI verify step must pass --project {project}") if len(publish_steps) != 1: - fail(f"{path}: publish-pypi must have exactly one pypa/gh-action-pypi-publish step") + fail(f"{path}: {job_name} must have exactly one pypa/gh-action-pypi-publish step") publish_index, publish_step = publish_steps[0] - if publish_step.get("if") != "needs.pypi-canonical-dist.outputs.source == 'build'": - fail(f"{path}: PyPI publish step must only run when canonical source is the current build") + if publish_step.get("if") != f"needs.{canonical_job}.outputs.source == 'build'": + fail(f"{path}: {job_name} PyPI publish step must only run when canonical source is the current build") publish_with = mapping( publish_step.get("with", {}), f"{path}: {step_label(publish_index, publish_step)} with" ) if norm_path(publish_with.get("packages-dir")) != "dist": - fail(f"{path}: PyPI publish step must upload packages-dir: dist") + fail(f"{path}: {job_name} PyPI publish step must upload packages-dir: dist") if len(canonical_downloads) != 1: - fail(f"{path}: publish-pypi must download exactly one pypi-canonical-dist artifact") + fail(f"{path}: {job_name} must download exactly one {canonical_artifact_name} artifact") download_index, download_step, download_with = canonical_downloads[0] if download_index > publish_index: fail(f"{path}: {step_label(download_index, download_step)} must run before the PyPI publish step") if norm_path(download_with.get("path")) != "dist": - fail(f"{path}: publish-pypi must download pypi-canonical-dist into dist") + fail(f"{path}: {job_name} must download {canonical_artifact_name} into dist") if len(verify_steps) != 1: - fail(f"{path}: publish-pypi must run release_pypi_canonical_dist.py verify exactly once") + fail(f"{path}: {job_name} must run release_pypi_canonical_dist.py verify exactly once") for index, step in enumerate(steps): if action_name(step) != "actions/download-artifact": @@ -1058,7 +1134,7 @@ def check_publish_pypi(workflow: dict[str, Any], path: str) -> None: with_map = mapping(step.get("with", {}), f"{path}: {step_label(index, step)} with") label = step_label(index, step) artifact_path = norm_path(with_map.get("path")) - if artifact_path == "dist" and with_map.get("name") != "pypi-canonical-dist": + if artifact_path == "dist" and with_map.get("name") != canonical_artifact_name: fail(f"{path}: {label} must not place non-canonical artifacts in dist") @@ -1472,9 +1548,28 @@ def main() -> None: check_release_security_gates(workflow, WORKFLOW_PATH) check_aarch64_smoke_selector(workflow, WORKFLOW_PATH) check_pypi_canonical_dist(workflow, WORKFLOW_PATH) + check_pypi_canonical_dist( + workflow, + WORKFLOW_PATH, + job_name="pypi-manifest-canonical-dist", + wheel_build_job="build-manifest-wheels", + sdist_build_job="build-manifest-sdist", + wheel_artifact_pattern="wheels-manifest-*", + sdist_artifact_name="sdist-manifest", + canonical_artifact_name="pypi-manifest-canonical-dist", + project="ordvec-manifest", + ) check_publish_crates(workflow, WORKFLOW_PATH) check_ci_manifest_package_defer(load_workflow(CI_WORKFLOW_PATH), CI_WORKFLOW_PATH) check_publish_pypi(workflow, WORKFLOW_PATH) + check_publish_pypi( + workflow, + WORKFLOW_PATH, + job_name="publish-manifest-pypi", + canonical_job="pypi-manifest-canonical-dist", + canonical_artifact_name="pypi-manifest-canonical-dist", + project="ordvec-manifest", + ) check_sde_cache_invariants() diff --git a/tests/release_pypi_canonical_dist.py b/tests/release_pypi_canonical_dist.py index 6946c08..b0be14f 100644 --- a/tests/release_pypi_canonical_dist.py +++ b/tests/release_pypi_canonical_dist.py @@ -22,7 +22,7 @@ from typing import Any -PROJECT = "ordvec" +DEFAULT_PROJECT = "ordvec" DIST_SUFFIXES = (".whl", ".tar.gz") @@ -65,8 +65,8 @@ def dist_files(directory: Path) -> dict[str, Path]: return files -def fetch_pypi_payload(version: str) -> dict[str, Any] | None: - url = f"https://pypi.org/pypi/{PROJECT}/{version}/json" +def fetch_pypi_payload(project: str, version: str) -> dict[str, Any] | None: + url = f"https://pypi.org/pypi/{project}/{version}/json" try: with urllib.request.urlopen(url, timeout=20) as response: return json.load(response) @@ -130,11 +130,11 @@ def ensure_same_filenames(local: dict[str, Path], remote: dict[str, dict[str, st ) -def canonicalize(version: str, built_dir: Path, out_dir: Path) -> None: +def canonicalize(project: str, version: str, built_dir: Path, out_dir: Path) -> None: built = dist_files(built_dir) prepare_empty_dir(out_dir) try: - payload = fetch_pypi_payload(version) + payload = fetch_pypi_payload(project, version) except PyPIReadError as exc: fail(str(exc)) @@ -143,7 +143,7 @@ def canonicalize(version: str, built_dir: Path, out_dir: Path) -> None: shutil.copy2(path, out_dir / filename) set_output("source", "build") set_output("pypi_exists", "false") - print(f"OK: PyPI has no {PROJECT} {version}; canonical dist uses current build") + print(f"OK: PyPI has no {project} {version}; canonical dist uses current build") return try: @@ -170,11 +170,11 @@ def canonicalize(version: str, built_dir: Path, out_dir: Path) -> None: set_output("source", "pypi") set_output("pypi_exists", "true") - print(f"OK: PyPI already has {PROJECT} {version}; canonical dist uses verified PyPI files") + print(f"OK: PyPI already has {project} {version}; canonical dist uses verified PyPI files") -def remote_hashes(version: str) -> dict[str, str] | None: - payload = fetch_pypi_payload(version) +def remote_hashes(project: str, version: str) -> dict[str, str] | None: + payload = fetch_pypi_payload(project, version) if payload is None: return None return {name: item["sha256"] for name, item in pypi_dist_map(payload).items()} @@ -184,15 +184,15 @@ def local_hashes(dist_dir: Path) -> dict[str, str]: return {name: sha256_file(path) for name, path in dist_files(dist_dir).items()} -def verify(version: str, dist_dir: Path, attempts: int, sleep_seconds: float) -> None: +def verify(project: str, version: str, dist_dir: Path, attempts: int, sleep_seconds: float) -> None: local = local_hashes(dist_dir) - url = f"https://pypi.org/pypi/{PROJECT}/{version}/json" + url = f"https://pypi.org/pypi/{project}/{version}/json" last_error = "not checked" for attempt in range(1, attempts + 1): try: - remote = remote_hashes(version) + remote = remote_hashes(project, version) if remote == local: - print(f"OK: PyPI-served hashes match canonical dist for {PROJECT} {version}") + print(f"OK: PyPI-served hashes match canonical dist for {project} {version}") return last_error = f"local={local!r} remote={remote!r}" except PyPIReadError as exc: @@ -208,11 +208,13 @@ def parse_args() -> argparse.Namespace: subparsers = parser.add_subparsers(dest="command", required=True) canonical = subparsers.add_parser("canonicalize") + canonical.add_argument("--project", default=DEFAULT_PROJECT) canonical.add_argument("--version", required=True) canonical.add_argument("--built-dir", required=True, type=Path) canonical.add_argument("--out-dir", required=True, type=Path) verify_parser = subparsers.add_parser("verify") + verify_parser.add_argument("--project", default=DEFAULT_PROJECT) verify_parser.add_argument("--version", required=True) verify_parser.add_argument("--dist-dir", required=True, type=Path) verify_parser.add_argument("--attempts", default=24, type=int) @@ -224,10 +226,10 @@ def parse_args() -> argparse.Namespace: def main() -> None: args = parse_args() if args.command == "canonicalize": - canonicalize(args.version, args.built_dir, args.out_dir) + canonicalize(args.project, args.version, args.built_dir, args.out_dir) return if args.command == "verify": - verify(args.version, args.dist_dir, args.attempts, args.sleep_seconds) + verify(args.project, args.version, args.dist_dir, args.attempts, args.sleep_seconds) return raise AssertionError(f"unknown command: {args.command}") diff --git a/tests/release_pypi_canonical_dist_tests.py b/tests/release_pypi_canonical_dist_tests.py index 0bcbf13..0022cb1 100644 --- a/tests/release_pypi_canonical_dist_tests.py +++ b/tests/release_pypi_canonical_dist_tests.py @@ -35,10 +35,10 @@ def test_missing_pypi_release_uses_current_build(self) -> None: write(built / "ordvec-0.3.0-cp310-abi3-win_amd64.whl", b"fresh wheel") old_fetch = canonical.fetch_pypi_payload - canonical.fetch_pypi_payload = lambda version: None + canonical.fetch_pypi_payload = lambda project, version: None try: with redirect_stdout(io.StringIO()): - canonical.canonicalize("0.3.0", built, out) + canonical.canonicalize("ordvec", "0.3.0", built, out) finally: canonical.fetch_pypi_payload = old_fetch @@ -75,10 +75,10 @@ def test_existing_pypi_release_uses_verified_remote_bytes(self) -> None: } old_fetch = canonical.fetch_pypi_payload - canonical.fetch_pypi_payload = lambda version: payload + canonical.fetch_pypi_payload = lambda project, version: payload try: with redirect_stdout(io.StringIO()): - canonical.canonicalize("0.3.0", built, out) + canonical.canonicalize("ordvec", "0.3.0", built, out) finally: canonical.fetch_pypi_payload = old_fetch @@ -104,10 +104,10 @@ def test_existing_pypi_release_rejects_filename_drift(self) -> None: } old_fetch = canonical.fetch_pypi_payload - canonical.fetch_pypi_payload = lambda version: payload + canonical.fetch_pypi_payload = lambda project, version: payload try: with redirect_stderr(io.StringIO()), self.assertRaises(SystemExit): - canonical.canonicalize("0.3.0", built, out) + canonical.canonicalize("ordvec", "0.3.0", built, out) finally: canonical.fetch_pypi_payload = old_fetch @@ -129,7 +129,7 @@ def test_verify_retries_after_transient_pypi_fetch_error(self) -> None: old_fetch = canonical.fetch_pypi_payload old_sleep = canonical.time.sleep - def fetch(version: str) -> dict[str, object] | None: + def fetch(project: str, version: str) -> dict[str, object] | None: response = responses.pop(0) if isinstance(response, Exception): raise response @@ -139,7 +139,7 @@ def fetch(version: str) -> dict[str, object] | None: canonical.time.sleep = sleeps.append try: with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()): - canonical.verify("0.3.0", dist, attempts=2, sleep_seconds=0.25) + canonical.verify("ordvec", "0.3.0", dist, attempts=2, sleep_seconds=0.25) finally: canonical.fetch_pypi_payload = old_fetch canonical.time.sleep = old_sleep @@ -164,11 +164,11 @@ def test_verify_retries_after_empty_pypi_dist_payload(self) -> None: old_fetch = canonical.fetch_pypi_payload old_sleep = canonical.time.sleep - canonical.fetch_pypi_payload = lambda version: responses.pop(0) + canonical.fetch_pypi_payload = lambda project, version: responses.pop(0) canonical.time.sleep = sleeps.append try: with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()): - canonical.verify("0.3.0", dist, attempts=2, sleep_seconds=0.5) + canonical.verify("ordvec", "0.3.0", dist, attempts=2, sleep_seconds=0.5) finally: canonical.fetch_pypi_payload = old_fetch canonical.time.sleep = old_sleep @@ -184,12 +184,12 @@ def test_canonicalize_reports_pypi_read_error(self) -> None: write(built / "ordvec-0.3.0.tar.gz", b"fresh sdist") old_fetch = canonical.fetch_pypi_payload - canonical.fetch_pypi_payload = lambda version: (_ for _ in ()).throw( + canonical.fetch_pypi_payload = lambda project, version: (_ for _ in ()).throw( canonical.PyPIReadError("temporary PyPI 503") ) try: with redirect_stderr(io.StringIO()), self.assertRaises(SystemExit): - canonical.canonicalize("0.3.0", built, out) + canonical.canonicalize("ordvec", "0.3.0", built, out) finally: canonical.fetch_pypi_payload = old_fetch diff --git a/tests/release_signed_release_invariants.sh b/tests/release_signed_release_invariants.sh index 99bcc6a..e709f1a 100755 --- a/tests/release_signed_release_invariants.sh +++ b/tests/release_signed_release_invariants.sh @@ -7,9 +7,11 @@ # unsigned releases may keep the score below 10 temporarily. The same graph # keeps the build-attest-publish chain honest: # -# build-{crate,manifest-crate,wheels,sdist} (raw artifacts) +# build-{crate,manifest-crate,wheels,manifest-wheels,sdist,manifest-sdist} +# (raw artifacts) # | # +-> pypi-canonical-dist (current build, or verified immutable PyPI files) +# +-> pypi-manifest-canonical-dist (same for ordvec-manifest) # | # +-> attest (id-token + attestations + .sigstore.json; # | Rust-crates-only when PyPI files already exist) @@ -23,9 +25,10 @@ # +--> publish-pypi (Trusted Publishing, or existing-file verification) # | # +-> build/attest/provenance/release-manifest-assets-draft -# | (after publish-crate; uploads manifest .crate/.sigstore.json/.intoto.jsonl) +# | (after publish-crate; uploads manifest .crate/.whl/.tar.gz/.sigstore.json/.intoto.jsonl) # | # +--> publish-manifest-crate (same byte-identity proof after manifest assets stage) +# +--> publish-manifest-pypi (Trusted Publishing, or existing-file verification) # | # v # publish-github-release (un-draft, ONLY after all registry publishes succeed) @@ -126,12 +129,22 @@ job_needs release-manifest-assets-draft attest-manifest \ || fail "release-manifest-assets-draft must \`needs: attest-manifest\`" job_needs release-manifest-assets-draft manifest-provenance \ || fail "release-manifest-assets-draft must \`needs: manifest-provenance\`" +job_needs release-manifest-assets-draft pypi-manifest-canonical-dist \ + || fail "release-manifest-assets-draft must \`needs: pypi-manifest-canonical-dist\`" +job_needs release-manifest-assets-draft smoke-linux-aarch64-manifest-wheel \ + || fail "release-manifest-assets-draft must \`needs: smoke-linux-aarch64-manifest-wheel\`" job_downloads_artifact_to_path release-manifest-assets-draft dist-manifest-crate dist \ || fail "release-manifest-assets-draft must download the manifest dist-manifest-crate artifact into dist" job_downloads_artifact_to_path release-manifest-assets-draft sigstore-bundle-manifest dist \ || fail "release-manifest-assets-draft must download the manifest Sigstore bundle into dist" +job_downloads_artifact_to_path release-manifest-assets-draft pypi-manifest-canonical-dist dist \ + || fail "release-manifest-assets-draft must download the canonical manifest Python dist into dist" printf '%s\n' "$body_manifest_draft" | grep -qE 'dist/\*\.crate([^a-zA-Z]|$)' \ || fail "release-manifest-assets-draft must upload dist/*.crate" +printf '%s\n' "$body_manifest_draft" | grep -qE 'dist/\*\.whl([^a-zA-Z]|$)' \ + || fail "release-manifest-assets-draft must upload dist/*.whl" +printf '%s\n' "$body_manifest_draft" | grep -qE 'dist/\*\.tar\.gz([^a-zA-Z]|$)' \ + || fail "release-manifest-assets-draft must upload dist/*.tar.gz" printf '%s\n' "$body_manifest_draft" | grep -qE 'dist/\*\.sigstore\.json([^a-zA-Z]|$)' \ || fail "release-manifest-assets-draft must upload dist/*.sigstore.json" printf '%s\n' "$body_manifest_draft" | grep -qE 'dist/\*\.intoto\.jsonl([^a-zA-Z]|$)' \ @@ -195,15 +208,23 @@ printf '%s\n' "$att_manifest" | grep -qE '^[[:space:]]+attestations:[[:space:]]* || fail "attest-manifest job must grant \`attestations: write\` (persist to the GitHub attestation store)" job_needs attest-manifest build-manifest-crate \ || fail "attest-manifest must \`needs: build-manifest-crate\`" +job_needs attest-manifest pypi-manifest-canonical-dist \ + || fail "attest-manifest must \`needs: pypi-manifest-canonical-dist\`" job_downloads_artifact_to_path attest-manifest dist-manifest-crate dist \ || fail "attest-manifest must download the dist-manifest-crate artifact into dist" +job_downloads_artifact_to_path attest-manifest pypi-manifest-canonical-dist dist \ + || fail "attest-manifest must download the canonical manifest Python dist into dist" comb="$(job_body combine-hashes)" comb_manifest="$(job_body combine-manifest-hash)" job_needs combine-manifest-hash build-manifest-crate \ || fail "combine-manifest-hash must \`needs: build-manifest-crate\` so the manifest .crate is a SLSA subject" +job_needs combine-manifest-hash pypi-manifest-canonical-dist \ + || fail "combine-manifest-hash must \`needs: pypi-manifest-canonical-dist\` so manifest Python dist can be SLSA subjects" job_downloads_artifact_to_path combine-manifest-hash dist-manifest-crate dist \ || fail "combine-manifest-hash must download the dist-manifest-crate artifact into dist" +job_downloads_artifact_to_path combine-manifest-hash pypi-manifest-canonical-dist dist \ + || fail "combine-manifest-hash must download the canonical manifest Python dist when it is built by this run" build_manifest="$(job_body build-manifest-crate)" job_needs build-manifest-crate publish-crate \ @@ -231,6 +252,11 @@ job_needs publish-manifest-crate release-manifest-assets-draft \ || fail "publish-manifest-crate must \`needs: release-manifest-assets-draft\`" job_needs publish-manifest-crate publish-crate \ || fail "publish-manifest-crate must \`needs: publish-crate\`" +body="$(job_body publish-manifest-pypi)" +printf '%s\n' "$body" | grep -qE '^[[:space:]]+id-token:[[:space:]]*write' \ + || fail "publish-manifest-pypi must grant \`id-token: write\` (Trusted Publishing OIDC)" +job_needs publish-manifest-pypi release-manifest-assets-draft \ + || fail "publish-manifest-pypi must \`needs: release-manifest-assets-draft\`" # ---------------------------------------------------------------------- # (9) Rust crate publish jobs prove byte-identity vs the attested .crate on BOTH @@ -291,6 +317,13 @@ printf '%s\n' "$pcd" | grep -qE 'release_pypi_canonical_dist\.py canonicalize' \ || fail "pypi-canonical-dist must canonicalize Python artifacts before attestation/release upload" printf '%s\n' "$pcd" | grep -qE 'name:[[:space:]]*pypi-canonical-dist' \ || fail "pypi-canonical-dist must upload the canonical Python dist artifact" +pcd_manifest="$(job_body pypi-manifest-canonical-dist)" +printf '%s\n' "$pcd_manifest" | grep -qE 'release_pypi_canonical_dist\.py canonicalize' \ + || fail "pypi-manifest-canonical-dist must canonicalize manifest Python artifacts before attestation/release upload" +printf '%s\n' "$pcd_manifest" | grep -qE -- '--project[[:space:]]+ordvec-manifest' \ + || fail "pypi-manifest-canonical-dist must canonicalize the ordvec-manifest PyPI project" +printf '%s\n' "$pcd_manifest" | grep -qE 'name:[[:space:]]*pypi-manifest-canonical-dist' \ + || fail "pypi-manifest-canonical-dist must upload the canonical manifest Python dist artifact" ppb="$(job_body publish-pypi)" job_needs publish-pypi pypi-canonical-dist \ @@ -299,13 +332,22 @@ printf '%s\n' "$ppb" | grep -qE 'name:[[:space:]]*pypi-canonical-dist' \ || fail "publish-pypi must consume pypi-canonical-dist, not raw rebuilt wheel/sdist artifacts" printf '%s\n' "$ppb" | grep -qE 'release_pypi_canonical_dist\.py verify' \ || fail "publish-pypi must verify PyPI-served wheel/sdist hashes against canonical dist" +mppb="$(job_body publish-manifest-pypi)" +job_needs publish-manifest-pypi pypi-manifest-canonical-dist \ + || fail "publish-manifest-pypi must \`needs: pypi-manifest-canonical-dist\` (publish/verify exactly the canonical manifest files)" +printf '%s\n' "$mppb" | grep -qE 'name:[[:space:]]*pypi-manifest-canonical-dist' \ + || fail "publish-manifest-pypi must consume pypi-manifest-canonical-dist, not raw rebuilt wheel/sdist artifacts" +printf '%s\n' "$mppb" | grep -qE 'release_pypi_canonical_dist\.py verify' \ + || fail "publish-manifest-pypi must verify PyPI-served manifest wheel/sdist hashes against canonical dist" +printf '%s\n' "$mppb" | grep -qE -- '--project[[:space:]]+ordvec-manifest' \ + || fail "publish-manifest-pypi must verify the ordvec-manifest PyPI project" grep -q 'pypi.org/pypi' tests/release_pypi_canonical_dist.py \ || fail "release_pypi_canonical_dist.py must query PyPI for served file hashes" # ---------------------------------------------------------------------- # (10) publish-github-release un-drafts ONLY AFTER all registry publishes succeed. # ---------------------------------------------------------------------- -for dep in publish-crate publish-manifest-crate publish-pypi; do +for dep in publish-crate publish-manifest-crate publish-pypi publish-manifest-pypi; do job_needs publish-github-release "$dep" \ || fail "publish-github-release must \`needs: $dep\` (un-draft only after all registry publishes succeed)" done From b4e7a76163373377fa8f06c60f2e3cc65cac8125 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Tue, 9 Jun 2026 15:02:00 -0500 Subject: [PATCH 3/4] Avoid manifest wheel artifact collision Signed-off-by: Nelson Spence --- .github/workflows/release.yml | 4 ++-- tests/release_publish_invariants.py | 20 +++++++++++++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b5089d4..ea5baee 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -589,7 +589,7 @@ jobs: python -m pytest ordvec-manifest-python/tests -q - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: wheels-manifest-${{ matrix.platform.runner }}-${{ matrix.platform.target }} + name: manifest-wheels-${{ matrix.platform.runner }}-${{ matrix.platform.target }} path: ordvec-manifest-python/dist/*.whl if-no-files-found: error @@ -727,7 +727,7 @@ jobs: - name: Collect the built manifest wheels uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: - pattern: wheels-manifest-* + pattern: manifest-wheels-* path: built-dist merge-multiple: true - name: Collect the built manifest sdist diff --git a/tests/release_publish_invariants.py b/tests/release_publish_invariants.py index e34c3a6..d91d181 100644 --- a/tests/release_publish_invariants.py +++ b/tests/release_publish_invariants.py @@ -11,6 +11,7 @@ import shutil import subprocess import sys +from fnmatch import fnmatchcase from typing import Any try: @@ -1010,6 +1011,23 @@ def check_pypi_canonical_dist( if not has_need(job, needed): fail(f"{path}: {job_name} must need {needed}") + wheel_job = mapping(jobs.get(wheel_build_job), f"{path}: jobs.{wheel_build_job}") + wheel_steps = sequence(wheel_job.get("steps"), f"{path}: jobs.{wheel_build_job}.steps") + wheel_upload_names: list[str] = [] + for index, raw_step in enumerate(wheel_steps): + step = mapping(raw_step, f"{path}: jobs.{wheel_build_job}.steps[{index}]") + if action_name(step) != "actions/upload-artifact": + continue + with_map = mapping(step.get("with", {}), f"{path}: {step_label(index, step)} with") + name = with_map.get("name") + if isinstance(name, str) and fnmatchcase(name, wheel_artifact_pattern): + wheel_upload_names.append(name) + if len(wheel_upload_names) != 1: + fail( + f"{path}: {wheel_build_job} must upload exactly one artifact matching " + f"{wheel_artifact_pattern}; got {wheel_upload_names!r}" + ) + outputs = mapping(job.get("outputs"), f"{path}: jobs.{job_name}.outputs") if outputs.get("source") != "${{ steps.canonicalize.outputs.source }}": fail(f"{path}: {job_name} must expose the canonical source output") @@ -1554,7 +1572,7 @@ def main() -> None: job_name="pypi-manifest-canonical-dist", wheel_build_job="build-manifest-wheels", sdist_build_job="build-manifest-sdist", - wheel_artifact_pattern="wheels-manifest-*", + wheel_artifact_pattern="manifest-wheels-*", sdist_artifact_name="sdist-manifest", canonical_artifact_name="pypi-manifest-canonical-dist", project="ordvec-manifest", From 358e6b7e4cdb5649ba5806aaff79abd46ba8b259 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Tue, 9 Jun 2026 15:06:24 -0500 Subject: [PATCH 4/4] Harden auxiliary load-plan lookup Signed-off-by: Nelson Spence --- ordvec-manifest/src/lib.rs | 3 ++- ordvec-manifest/tests/manifest.rs | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ordvec-manifest/src/lib.rs b/ordvec-manifest/src/lib.rs index 0f84fbc..f542748 100644 --- a/ordvec-manifest/src/lib.rs +++ b/ordvec-manifest/src/lib.rs @@ -2851,9 +2851,10 @@ impl VerifiedLoadPlan { } pub fn auxiliary_by_name(&self, name: &str) -> Option<&VerifiedAuxiliaryArtifactPlan> { + let name = name.trim(); self.auxiliary_artifacts .iter() - .find(|artifact| artifact.name() == name) + .find(|artifact| artifact.name().trim() == name) } pub fn require_auxiliary(&self, name: &str) -> Result<&Path, RequireAuxiliaryError> { diff --git a/ordvec-manifest/tests/manifest.rs b/ordvec-manifest/tests/manifest.rs index c1dfc80..a3414d8 100644 --- a/ordvec-manifest/tests/manifest.rs +++ b/ordvec-manifest/tests/manifest.rs @@ -377,6 +377,10 @@ fn create_manifest_declares_auxiliary_artifacts_for_load_plan_lookup() { plan.require_auxiliary("ordinaldb.ids").unwrap(), fs::canonicalize(&ids).unwrap().as_path() ); + assert_eq!( + plan.require_auxiliary(" ordinaldb.ids ").unwrap(), + fs::canonicalize(&ids).unwrap().as_path() + ); assert_eq!( plan.auxiliary_by_name("optional.stats").unwrap().state(), AuxiliaryArtifactState::OptionalAbsent