From c13ac690a6a9022946e2ff238467249b38561df4 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 19 Jun 2026 13:56:20 -0500 Subject: [PATCH] Fix release audit blockers --- .github/workflows/ci.yml | 22 ++-- .github/workflows/coverage.yml | 17 ++-- CHANGELOG.md | 83 ++++++++------- README.md | 13 ++- THREAT_MODEL.md | 13 ++- docs/INDEX_PROVENANCE.md | 13 ++- docs/PERSISTED_FORMAT.md | 8 +- fuzz/fuzz_targets/load_fastscan.rs | 13 ++- ordvec-ffi/src/lib.rs | 12 +++ ordvec-manifest/README.md | 12 ++- ordvec-manifest/src/lib.rs | 151 ++++++++++++++++++---------- ordvec-manifest/src/main.rs | 5 + ordvec-manifest/src/sqlite.rs | 13 ++- ordvec-manifest/tests/manifest.rs | 33 ++++++ ordvec-python/README.md | 4 + src/fastscan.rs | 87 +++++++++++++++- src/lib.rs | 6 +- src/quant.rs | 6 +- src/rank_io.rs | 125 +++++++++++++++++++---- tests/index/fastscan.rs | 74 +++++++++++++- tests/release_publish_invariants.py | 31 ++++-- 21 files changed, 568 insertions(+), 173 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 59574460..ee4354d3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -357,10 +357,11 @@ jobs: # # Pattern adapted from microsoft/DiskANN's CI (also a vector-search crate). # The local setup-intel-sde action owns the fixed Intel downloadmirror build, - # SHA256 verification, and x86_64 runner guard. The SHA gate still fails - # closed for any archive we extract. While Intel's CloudFront/WAF challenge - # blocks GitHub-hosted runners, this PR temporarily lets the job report an - # explicit unavailable state and skip SDE-dependent steps. + # SHA256 verification, and x86_64 runner guard. The SHA gate fails closed for + # any archive we extract. Pull requests may soft-skip during Intel mirror + # outages, but push/workflow_dispatch runs fail closed; the release gate only + # accepts the post-merge push workflow result, so a release cannot proceed + # without the SDE probe and AVX-512 tests actually executing on main. avx512: name: avx512 (Intel SDE / Sapphire Rapids) runs-on: ubuntu-24.04 @@ -393,12 +394,13 @@ jobs: with: version: ${{ env.SDE_VERSION }} sha256: ${{ env.SDE_SHA256 }} - allow-unavailable: true - - name: note Intel SDE unavailable - if: steps.sde.outputs.sde-available != 'true' - run: echo "::notice::Intel SDE archive unavailable; temporarily skipping AVX-512 SDE coverage." + allow-unavailable: ${{ github.event_name == 'pull_request' }} + - name: note Intel SDE unavailable on PR + if: ${{ github.event_name == 'pull_request' && steps.sde.outputs.sde-available != 'true' }} + run: | + echo "::warning::Intel SDE archive unavailable on this pull request; push and release-gated runs fail closed." - name: sanity-check AVX-512 detection under SDE - if: steps.sde.outputs.sde-available == 'true' + if: ${{ steps.sde.outputs.sde-available == 'true' }} env: SDE_PATH: ${{ steps.sde.outputs.sde-path }} run: | @@ -432,7 +434,7 @@ jobs: "${SDE_PATH}" -spr -- \ "${RUNNER_TEMP}/sde-probe/target/release/sde-probe" - name: cargo test under SDE (AVX-512 kernels) - if: steps.sde.outputs.sde-available == 'true' + if: ${{ steps.sde.outputs.sde-available == 'true' }} env: CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER: ${{ steps.sde.outputs.sde-path }} -spr -- # Cause any AVX-512 test that would silently skip on a non-AVX-512 host diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 50064581..e68c1f79 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -54,15 +54,16 @@ jobs: with: version: ${{ env.SDE_VERSION }} sha256: ${{ env.SDE_SHA256 }} - allow-unavailable: true - - name: Note Intel SDE unavailable - if: steps.sde.outputs.sde-available != 'true' - run: echo "::notice::Intel SDE archive unavailable; temporarily skipping SDE-backed coverage." + allow-unavailable: ${{ github.event_name == 'pull_request' }} + - name: note Intel SDE unavailable on PR + if: ${{ github.event_name == 'pull_request' && steps.sde.outputs.sde-available != 'true' }} + run: | + echo "::warning::Intel SDE archive unavailable on this pull request; push and release-gated runs fail closed." - name: Install cargo-llvm-cov (pinned) - if: steps.sde.outputs.sde-available == 'true' + if: ${{ steps.sde.outputs.sde-available == 'true' }} run: cargo install cargo-llvm-cov --version 0.8.7 --locked - name: Sanity-check AVX-512 detection under SDE - if: steps.sde.outputs.sde-available == 'true' + if: ${{ steps.sde.outputs.sde-available == 'true' }} env: SDE_PATH: ${{ steps.sde.outputs.sde-path }} run: | @@ -99,12 +100,12 @@ jobs: # feature detection reaches the AVX-512 kernels. That makes the coverage # floor reflect the same exercised code as the dedicated ci.yml avx512 job. - name: Generate coverage (lcov) + enforce floor - if: steps.sde.outputs.sde-available == 'true' + if: ${{ steps.sde.outputs.sde-available == 'true' }} env: CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER: ${{ steps.sde.outputs.sde-path }} -spr -- run: cargo llvm-cov --all-features --target x86_64-unknown-linux-gnu --fail-under-lines 85 --lcov --output-path lcov.info - name: Upload coverage to Codecov - if: steps.sde.outputs.sde-available == 'true' + if: ${{ steps.sde.outputs.sde-available == 'true' }} uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f # v7.0.0 with: files: lcov.info diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a055246..a526c0fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,10 +5,21 @@ All notable changes to this project are documented here. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## Unreleased +## 0.5.0 - 2026-06-19 ### Security +- **Hardened `.ovfs` FastScan loading before the format's first stable + release.** `RankQuantFastscan` now rejects invalid FastScan payload bytes + (`byte & 0xf0 != 0`), rows that violate b=2 constant composition, and + nonzero block-tail padding across the path, reader, and byte-slice load APIs. + Loader fuzzing now runs a safe `search()` after every successful `.ovfs` load, + and persisted-input tests compare the dispatch path against the scalar + FastScan reference (AVX-512 under SDE, scalar otherwise). +- **Bounded calibration-profile hashing in `ordvec-manifest`.** Verification now + applies `max_calibration_profile_bytes` (64 MiB by default, CLI-overridable) + before hashing calibration profile artifacts, matching the existing bounded + resource model for encoder-distortion profiles and auxiliary artifacts. - **Cleared OSV / OpenSSF-Scorecard advisories on the dev-only BEIR benchmark tooling** (introduced with the benchmark harness; none reach the published `ordvec` crate or the `ordvec` PyPI wheel). The `benchmarks/beir/requirements.txt` @@ -41,6 +52,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `ordvec-manifest` v1 support for `.ovfs` are deferred to 0.8.0 (#233, #232); bind `.ovfs` artifacts with caller-owned checksums or attestations when they cross a trust boundary. +- **Caller-owned serial batched/buffered two-stage primitives** (additive): + `SignBitmap::top_m_candidates_batched_serial_csr`, `CandidateBatch`, + `SubsetScratch`, `RankQuant::search_asymmetric_subset_batched_serial`, and + `RankQuant::search_asymmetric_subset_batched_serial_into`. These primitives + never enter rayon; callers partition query batches and drive the serial + `_into` primitive from their own scheduler. The serial CSR candidate generator + is correctness-first in this release; future releases can optimize internals + behind the same signature. +- `avx512vpop_supported()` (`#[doc(hidden)]`) — reports whether the AVX-512 + VPOPCNTDQ scan kernels are active on the current CPU. The scan dispatch reads + only this predicate (no per-dimension gate). ### Performance @@ -58,12 +80,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 residues 0..7 plus 384/512/768/1024/1536 for all six SignBitmap/Bitmap scan kernels. This is stage-1 scan-kernel throughput, not a whole-pipeline figure. -### Added - -- `avx512vpop_supported()` (`#[doc(hidden)]`) — reports whether the AVX-512 - VPOPCNTDQ scan kernels are active on the current CPU. The scan dispatch reads - only this predicate (no per-dimension gate). - ### Changed - **Clarified BEIR benchmark release claims.** The committed README figures use @@ -76,6 +92,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 previously-written `.tvr` / `.tvrq` / `.tvbm` / `.tvsb` file continues to load unchanged; only the file extensions and magic bytes written by `write()` change (#230). +- **Documented the v0.5 `b=8` support boundary.** `b=8` is a stable Rust + in-memory evidence/refinement width: asymmetric scoring and code/projection + generation work at any valid dimension, while symmetric `RankQuant::search` + requires `dim % 256 == 0`. It is not exposed through the Python `RankQuant` + constructor in v0.5.0, cannot be persisted to `.ovrq`, and each prepared + asymmetric query/worker owns a `dim * 256` `f32` LUT (about 64 MiB at the + maximum dimension). - **Release-hardened the caller-owned serial two-stage primitives** (no API change; added in 0.5.0). The trust model is now explicit and tested: - Rejection-path regression tests for the full CSR/query/buffer validation set @@ -85,9 +108,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 input can never reach the SIMD scan. - A counting-allocator test proving `search_asymmetric_subset_batched_serial_into` performs **zero heap allocations** in steady state (warmed `SubsetScratch`, - reused caller buffers) **on the AVX-512/AVX2 rerank path** — the strong form of - the prior capacity-stability proxy. (The scalar fallback, e.g. aarch64, - allocates a per-query scoring LUT; the test skips the strict check there.) + reused caller buffers, including the scalar LUT scratch) across the rerank + paths — the strong form of the prior capacity-stability proxy. - A focused `two_stage_bench` example decomposing stage-1 candidate-gen / single-query rerank loop / batched `_into` / full two-stage at the Harrier-1024 shape, with a committed reference capture @@ -97,6 +119,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- **Made Intel SDE AVX-512 coverage fail closed for release gating.** Pull + requests may emit a visible warning and skip SDE-dependent steps during an + Intel mirror outage, but the push/workflow-dispatch runs used by the release + gate still fail closed; setup must succeed, the AVX-512 CPUID probe must run, + and the SDE-backed test/coverage commands must execute before release. +- **Closed manifest verifier path-reopen drift.** Verification and SQLite + cache-key construction now hash, probe, and validate the canonical path that + was checked and recorded, rather than reopening the pre-canonical joined path. +- **Marked persisted-format metadata enums non-exhaustive before v0.5 ships.** + `IndexKind`, `IndexParams`, `ManifestIndexKind`, and `ManifestIndexParams` + are now future-extensible for later stable formats such as `.ovfs` manifest + support without forcing downstream exhaustive matches. +- **Corrected FastScan dispatch documentation.** `RankQuantFastscan` dispatches + AVX-512 when available and otherwise uses its scalar kernel; the AVX2 path is + part of the exact `RankQuant` asymmetric scorer, not FastScan. - **`ordvec-manifest` crate and wheel now ship license text.** Both declared `MIT OR Apache-2.0` but packaged no `LICENSE-*` files (a pre-0.5.0 defect); added `LICENSE-MIT` + `LICENSE-APACHE-2.0` (copied from the workspace root) to @@ -108,32 +145,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 the sdist) — closing the regression class at the published-bytes layer, not only at `cargo package`. -## 0.5.0 - 2026-06-13 - -### Added - -- **Caller-owned serial batched/buffered two-stage primitives** (additive): - - `SignBitmap::top_m_candidates_batched_serial_csr(&self, queries, m) -> CandidateBatch` - — serial (no rayon) CSR candidate generation; pair with the rerank below to - run a fully caller-scheduled two-stage search. - - `RankQuant::search_asymmetric_subset_batched_serial(..) -> SearchResults` and - `..._serial_into(.., &mut SubsetScratch, &mut out_scores, &mut out_indices)` - — serial batched subset rerank; the `_into` form is allocation-free after - scratch warmup on the AVX-512/AVX2 rerank path (the integration contract for - runtimes that own their own thread pool / GIL release). - - New public types `CandidateBatch` (CSR candidate carrier) and `SubsetScratch` - (reusable rerank scratch). -- These primitives never enter rayon; the caller owns parallelism. No bundled - rayon convenience wrapper ships in this release — partition the query batch and - drive the serial `_into` primitive from your own pool. The existing - internally-parallel `top_m_candidates_batched` and `search_asymmetric*` are - unchanged. - -### Notes - -- The serial CSR candidate-gen is a correctness-first implementation; a future - release optimizes its internals behind the same signature. - ## 0.4.0 - 2026-06-04 ### Added diff --git a/README.md b/README.md index ae16887d..beb923b6 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,10 @@ structure of each vector on its own: known before you see any data (256 B at dim = 1024, 2-bit), with `bits ∈ {1, 2, 4}` the size/recall knob. (`b = 8` is an opt-in evidence/refinement width — asymmetric scoring at any dim, symmetric only - when `dim % 256 == 0` — not a broad retrieval mode.) + when `dim % 256 == 0` — not a broad retrieval mode. In v0.5.0 it is + Rust-only, in-memory, not accepted by the Python `RankQuant` constructor, and + not persistable to `.ovrq`; each prepared asymmetric query owns a + `dim * 256` `f32` LUT, about 64 MiB at the maximum dimension.) - **Two-stage retrieval, built in.** A cheap bitmap / sign-popcount prefilter feeds an exact rerank — the coarse→fine pipeline ships as library primitives. The coarse-scan→exact-rerank pattern, and the @@ -118,7 +121,9 @@ large-scale serving rather than competing with one. - **`Rank`** — full-precision rank vectors (`u16` per coordinate). - **`RankQuant`** — ranks bucketed into `1 << bits` equal-width bins, `bits` bits per coordinate (`dim * bits / 8` bytes/doc). Both a - symmetric (Spearman) and asymmetric (float-query LUT) scorer. + symmetric (Spearman) and asymmetric (float-query LUT) scorer. `bits ∈ + {1, 2, 4}` are the cross-language persisted retrieval widths in v0.5.0; + `b = 8` is Rust-only and in-memory for evidence/refinement. - **`Bitmap`** — a top-bucket bitmap per document (one bit per coordinate); scoring is `popcount(Q AND D)`, a coarsened rank overlap. - **`SignBitmap`** — a sign bitmap per document for sign-cosine @@ -127,8 +132,8 @@ large-scale serving rather than competing with one. Two further paths, for callers who need them: - **`RankQuantFastscan`** — a stable, documented *but specialized* public - type: an optional b=2 FastScan kernel (block-32 nibble/PQ-LUT, AVX-512 → AVX2 - → scalar dispatch) for absolute-minimum stage-1 scan latency, at 2× the + type: an optional b=2 FastScan kernel (block-32 nibble/PQ-LUT, AVX-512 → + scalar dispatch) for absolute-minimum stage-1 scan latency, at 2× the RankQuant b=2 footprint (`dim/2` bytes/doc) and 8-bit LUT scoring noise. It persists to `.ovfs` (magic `OVFS`) through direct `RankQuantFastscan::{write,load}` calls. In v0.5.0, `.ovfs` is not yet part diff --git a/THREAT_MODEL.md b/THREAT_MODEL.md index 0c8a0230..1237abeb 100644 --- a/THREAT_MODEL.md +++ b/THREAT_MODEL.md @@ -119,7 +119,9 @@ candidate generation followed by RankQuant subset reranking). - Per-row **structural** invariants: `Rank` rows must be a true permutation of `[0, dim)` (verified by bound + duplicate checks ⇒ pigeonhole); `RankQuant` rows must satisfy constant composition (uniform per-bucket - histogram); `Bitmap` rows must have exactly `n_top` bits set. + histogram); `Bitmap` rows must have exactly `n_top` bits set; + `RankQuantFastscan` `.ovfs` rows must use valid FastScan nibbles, satisfy + b=2 constant composition, and zero block-tail padding. - No `panic!` on malformed data — all validation returns `io::Error(InvalidData)`. - The raw `rank_io` read/write functions are `pub(crate)`; the only public @@ -205,8 +207,9 @@ introduces `O(span/255)` per-pair approximation error — an intentional trade-off matching FAISS FastScan semantics, documented in the code. The scalar and AVX-512 paths agree on the same quantized inputs (equivalence test), and `TopK` uses `total_cmp` for deterministic tie-breaking across all paths. -This is approximate *scoring*, not a CPU oracle. FastScan is a `#[doc(hidden)]` -pre-ranker; callers needing exact scores use `RankQuant::search_asymmetric`. +This is approximate *scoring*, not a CPU oracle. FastScan is a stable +specialized pre-ranker; callers needing exact scores use +`RankQuant::search_asymmetric`. **THREAT-SIMD-004 (mitigated this cycle): Native sanitizer coverage for unsafe kernels.** `.github/workflows/sanitizers.yml` runs nightly @@ -444,7 +447,9 @@ SignBitmap→RankQuant retrieval path. `search_asymmetric_fastscan_b2` + the scalar/AVX-512 kernel), crossing the 32-doc block boundary so tail-padding blocks are exercised. On non-AVX-512 CI runners it exercises the scalar reference kernel; under Intel SDE -it exercises the AVX-512 kernel. +it exercises the AVX-512 kernel. The `load_fastscan` target also follows every +successful `.ovfs` load with a safe `search()` call so loader-accepted bytes +must survive the public scan path. **THREAT-FUZZ-002 (mitigated this cycle): CI-bound fuzzing for continuous regression.** A `fuzz.yml` workflow now runs a bounded smoke on every pull diff --git a/docs/INDEX_PROVENANCE.md b/docs/INDEX_PROVENANCE.md index 39719ede..38067f40 100644 --- a/docs/INDEX_PROVENANCE.md +++ b/docs/INDEX_PROVENANCE.md @@ -96,7 +96,8 @@ The manifest verifier checks: path/hash integrity for side artifacts, and optional calibration-profile linkage; - optional `calibration` profile references, checking profile identity, - path/hash integrity, encoder identity, and ordinalization compatibility; + path/hash integrity, configured byte ceiling, encoder identity, and + ordinalization compatibility; - attestation **shape** only: predicate type, builder id when present, and at least one subject SHA-256 matching the artifact when attestations are supplied. @@ -104,7 +105,10 @@ The manifest verifier checks: The v1 verifier intentionally does not create or verify `.ovfs` FastScan artifacts yet. If a `RankQuantFastscan` artifact crosses a trust boundary in v0.5.0, bind the bytes with a caller-owned checksum, artifact-store control, or -attestation and load it directly only after that policy check succeeds. +attestation and load it directly only after that policy check succeeds. The +direct `.ovfs` loader still rejects invalid nibbles, non-canonical block-tail +padding, and rows that violate b=2 constant composition; manifest v1 simply +does not bind or probe those bytes yet. Auxiliary artifacts are for application-owned sidecars such as metadata, secondary indexes, or stores that a caller intends to load together with the @@ -135,8 +139,9 @@ manifest's `calibration.profile_id`. When present, `calibration` binds an index artifact to a hashed ordinal profile used to interpret overlap, bucket, sign, or rank evidence under a calibrated -null. The verifier checks profile identity, path/hash integrity, encoder -identity, and ordinalization compatibility; it does not judge whether the null +null. The verifier checks profile identity, path/hash integrity, configured +byte ceiling, encoder identity, and ordinalization compatibility; it does not +judge whether the null model is scientifically adequate and does not compute likelihood ratios or tail probabilities. Calibration profiles must match the encoder identity declared by `embedding`; cross-encoder calibration is rejected by default. The diff --git a/docs/PERSISTED_FORMAT.md b/docs/PERSISTED_FORMAT.md index d41c9317..3f19677e 100644 --- a/docs/PERSISTED_FORMAT.md +++ b/docs/PERSISTED_FORMAT.md @@ -10,7 +10,10 @@ API, but `.ovfs` is intentionally outside this v1 primitive-format, `probe_index_metadata()`, and `ordvec-manifest` contract. Until metadata-probe and manifest support are promoted, callers should treat `.ovfs` as a specialized direct-load artifact and bind it with application-owned checksums or -attestations when it crosses a trust boundary. +attestations when it crosses a trust boundary. The direct `.ovfs` loader still +validates the payload before search: real document bytes must be 4-bit FastScan +codes, every row must satisfy b=2 constant composition, and block-tail padding +must be zero. All integer fields are little-endian. Each format has one fixed header followed by one contiguous payload. The payload must consume the rest of the file @@ -65,7 +68,8 @@ cache in their own manifests: In v0.5.0, `probe_index_metadata(path)` rejects `OVFS` with an unsupported metadata-probe error rather than returning a partial descriptor. Load `.ovfs` only through `RankQuantFastscan::load` unless and until the FastScan metadata -contract is promoted in a later minor release. +contract is promoted in a later minor release; the direct loader rejects +invalid nibbles, non-canonical tail padding, and b=2 composition violations. Example external segment entry: diff --git a/fuzz/fuzz_targets/load_fastscan.rs b/fuzz/fuzz_targets/load_fastscan.rs index 26b08008..92df8f35 100644 --- a/fuzz/fuzz_targets/load_fastscan.rs +++ b/fuzz/fuzz_targets/load_fastscan.rs @@ -8,15 +8,18 @@ //! `RankQuantFastscan::load_from_bytes` — which runs that exact loader (the //! full public in-memory load path). //! -//! Contract: on arbitrary bytes the loader must return `Ok(..)` or `Err(..)` — -//! never panic, abort, or read out of bounds. libFuzzer treats any panic/abort -//! as a crash, so simply letting the result drop is the assertion. +//! Contract: on arbitrary bytes the loader must return `Ok(..)` or `Err(..)`; +//! if it returns `Ok`, at least one safe search must also complete without +//! panic, abort, or read out of bounds. libFuzzer treats any panic/abort as a +//! crash. #![no_main] use libfuzzer_sys::fuzz_target; fuzz_target!(|data: &[u8]| { - // The only thing under test: arbitrary bytes -> Ok | Err, no panic. - let _ = ordvec::RankQuantFastscan::load_from_bytes(data); + if let Ok(index) = ordvec::RankQuantFastscan::load_from_bytes(data) { + let query = vec![0.0f32; index.dim()]; + let _ = index.search(&query, 1); + } }); diff --git a/ordvec-ffi/src/lib.rs b/ordvec-ffi/src/lib.rs index 9e46cada..fbb9fa70 100644 --- a/ordvec-ffi/src/lib.rs +++ b/ordvec-ffi/src/lib.rs @@ -373,6 +373,12 @@ fn info_for_metadata(meta: &IndexMetadata) -> Result { + return Err(FfiError::new( + ORDVEC_STATUS_UNSUPPORTED_FORMAT, + "ABI v1 does not support this index kind", + )) + } }; info.format_version = u32::from(meta.format_version); info.dim = meta.dim as u64; @@ -387,6 +393,12 @@ fn info_for_metadata(meta: &IndexMetadata) -> Result {} + _ => { + return Err(FfiError::new( + ORDVEC_STATUS_UNSUPPORTED_FORMAT, + "ABI v1 does not support these index parameters", + )) + } } info.capabilities = ORDVEC_CAP_FULL_SEARCH | ORDVEC_CAP_SUBSET_SEARCH diff --git a/ordvec-manifest/README.md b/ordvec-manifest/README.md index 6ed4f229..e60e0028 100644 --- a/ordvec-manifest/README.md +++ b/ordvec-manifest/README.md @@ -156,6 +156,8 @@ Stable limit codes are part of the contract: (`auxiliary_artifact_count_limit_exceeded`); - auxiliary artifact bytes per declared file: 64 MiB (`auxiliary_artifact_file_too_large`); +- calibration profile artifact bytes: 64 MiB + (`calibration_profile_too_large`); - encoder distortion profile artifact bytes: 64 MiB (`encoder_distortion_profile_too_large`); - collected report issues: 1,024, after which a @@ -167,6 +169,7 @@ The CLI exposes matching override flags on `inspect`, `verify`, `create`, `--max-row-map-line-bytes`, `--max-row-map-rows`, `--max-row-map-tracked-id-bytes`, `--max-auxiliary-artifacts`, `--max-auxiliary-artifact-bytes`, +`--max-calibration-profile-bytes`, `--max-encoder-distortion-profile-bytes`, `--max-report-issues`, and `--max-cached-report-bytes`. Library callers can override the same ceilings via `VerifyOptions::limits`. @@ -181,6 +184,7 @@ Stable limit codes: | row-identity duplicate-tracking `db_id` bytes | `row_identity_duplicate_tracking_limit_exceeded` | `row_identity_duplicate_tracking_limit_exceeded` | | auxiliary artifact declarations | `auxiliary_artifact_count_limit_exceeded` | n/a | | auxiliary artifact bytes per declared file | `auxiliary_artifact_file_too_large` | n/a | +| calibration profile artifact bytes | `calibration_profile_too_large` | n/a | | encoder distortion profile artifact bytes | `encoder_distortion_profile_too_large` | n/a | | collected verification report issues | `verification_report_issue_limit_exceeded` | n/a | | SQLite cached report JSON bytes | n/a | `sqlite_cached_report_too_large` | @@ -191,10 +195,10 @@ bounded in-memory reader fail before reading with the same stable `max_report_issues` override of `0` suppresses detail issues and returns only the `verification_report_issue_limit_exceeded` sentinel when any issue would otherwise be reported. These limits bound metadata parsing and report/cache -growth; hashing an index or calibration profile is still proportional to the -artifact bytes being verified. SQLite cache-key construction treats an -over-limit encoder distortion profile as non-cacheable and reruns verification -instead of reusing a previously cached report. +growth; hashing the primary index remains proportional to the artifact bytes +being verified. SQLite cache-key construction treats an over-limit calibration +or encoder distortion profile as non-cacheable and reruns verification instead +of reusing a previously cached report. Manifests may declare `auxiliary_artifacts` for caller-owned sidecars that should be integrity-checked with the same path policy as the primary index. diff --git a/ordvec-manifest/src/lib.rs b/ordvec-manifest/src/lib.rs index acdfc5ac..bee68321 100644 --- a/ordvec-manifest/src/lib.rs +++ b/ordvec-manifest/src/lib.rs @@ -37,6 +37,7 @@ pub const DEFAULT_MAX_ROW_IDENTITY_ROWS: usize = 10_000_000; pub const DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES: usize = 64 * 1024 * 1024; pub const DEFAULT_MAX_AUXILIARY_ARTIFACTS: usize = 1024; pub const DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES: u64 = 64 * 1024 * 1024; +pub const DEFAULT_MAX_CALIBRATION_PROFILE_BYTES: u64 = 64 * 1024 * 1024; pub const DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES: u64 = 64 * 1024 * 1024; pub const DEFAULT_MAX_REPORT_ISSUES: usize = 1024; pub const DEFAULT_MAX_CACHED_REPORT_BYTES: u64 = 4 * 1024 * 1024; @@ -252,7 +253,7 @@ fn verify_manifest_with_path_capture( ) { paths.artifact_path = Some(resolved.canonical_path.clone()); report.artifact.canonical_path = Some(path_to_display(&resolved.canonical_path)); - match sha256_file(&resolved.resolved_path) { + match sha256_file(&resolved.canonical_path) { Ok(hash) => { report.artifact.sha256 = Some(hash.sha256.clone()); report.artifact.size_bytes = Some(hash.size_bytes); @@ -281,11 +282,12 @@ fn verify_manifest_with_path_capture( ), } - match probe_index_metadata(&resolved.resolved_path) { + match probe_index_metadata(&resolved.canonical_path) { Ok(metadata) => { - let metadata_report = MetadataReport::from_core(&metadata); + if let Ok(metadata_report) = MetadataReport::try_from_core(&metadata) { + report.artifact.metadata = Some(metadata_report); + } compare_artifact_metadata(&document.manifest.artifact, &metadata, &mut report); - report.artifact.metadata = Some(metadata_report); } Err(err) => report.error( "artifact_probe_failed", @@ -645,25 +647,33 @@ fn compare_artifact_metadata( metadata: &CoreIndexMetadata, report: &mut VerificationReport, ) { - let observed_kind = ManifestIndexKind::from_core(metadata.kind); - if artifact.kind != observed_kind { - report.error( - "artifact_kind_mismatch", - format!( - "artifact kind was {:?}, manifest declares {:?}", - observed_kind, artifact.kind - ), - ); + match ManifestIndexKind::try_from_core(metadata.kind) { + Ok(observed_kind) => { + if artifact.kind != observed_kind { + report.error( + "artifact_kind_mismatch", + format!( + "artifact kind was {:?}, manifest declares {:?}", + observed_kind, artifact.kind + ), + ); + } + } + Err(err) => report.error(err.code(), err.message()), } - let observed_params = ManifestIndexParams::from_core(metadata.params); - if artifact.params != observed_params { - report.error( - "artifact_params_mismatch", - format!( - "artifact params were {:?}, manifest declares {:?}", - observed_params, artifact.params - ), - ); + match ManifestIndexParams::try_from_core(metadata.params) { + Ok(observed_params) => { + if artifact.params != observed_params { + report.error( + "artifact_params_mismatch", + format!( + "artifact params were {:?}, manifest declares {:?}", + observed_params, artifact.params + ), + ); + } + } + Err(err) => report.error(err.code(), err.message()), } if artifact.format_version != metadata.format_version { report.error( @@ -754,7 +764,7 @@ fn verify_row_identity( report.row_identity.canonical_path = Some(path_to_display(&resolved.canonical_path)); match validate_jsonl_rows( - &resolved.resolved_path, + &resolved.canonical_path, options.allow_duplicate_db_ids, &options.limits, Some(*row_count), @@ -1212,7 +1222,7 @@ fn validate_encoder_distortion_profile_artifact( report.encoder_distortion.profile_canonical_path = Some(path_to_display(&resolved.canonical_path)); match sha256_file_bounded( - &resolved.resolved_path, + &resolved.canonical_path, options.limits.max_encoder_distortion_profile_bytes, "encoder_distortion_profile_too_large", "encoder distortion profile", @@ -1657,7 +1667,12 @@ fn validate_calibration_profile( ) { report.calibration.profile_canonical_path = Some(path_to_display(&resolved.canonical_path)); - match sha256_file(&resolved.resolved_path) { + match sha256_file_bounded( + &resolved.canonical_path, + options.limits.max_calibration_profile_bytes, + "calibration_profile_too_large", + "calibration profile", + ) { Ok(hash) => { report.calibration.profile_sha256 = Some(hash.sha256.clone()); report.calibration.profile_size_bytes = Some(hash.size_bytes); @@ -1680,6 +1695,7 @@ fn validate_calibration_profile( ); } } + Err(ManifestError::LimitExceeded { code, message }) => report.error(code, message), Err(err) => report.error( "calibration_profile_hash_failed", format!("failed to hash calibration profile: {err}"), @@ -1921,7 +1937,7 @@ fn verify_auxiliary_artifacts( captured_path = Some(resolved.canonical_path.clone()); entry.canonical_path = Some(path_to_display(&resolved.canonical_path)); match sha256_file_bounded( - &resolved.resolved_path, + &resolved.canonical_path, options.limits.max_auxiliary_artifact_bytes, "auxiliary_artifact_file_too_large", "auxiliary artifact", @@ -2149,10 +2165,7 @@ fn resolve_auxiliary_artifact_path( } } - AuxiliaryPathResolution::Resolved(ResolvedPath { - resolved_path, - canonical_path, - }) + AuxiliaryPathResolution::Resolved(ResolvedPath { canonical_path }) } fn auxiliary_artifact_resolved_path(artifact: &AuxiliaryArtifact, base_dir: &Path) -> PathBuf { @@ -2248,6 +2261,7 @@ pub struct ResourceLimits { pub max_row_identity_tracked_db_id_bytes: usize, pub max_auxiliary_artifacts: usize, pub max_auxiliary_artifact_bytes: u64, + pub max_calibration_profile_bytes: u64, pub max_encoder_distortion_profile_bytes: u64, pub max_report_issues: usize, pub max_cached_report_bytes: u64, @@ -2262,6 +2276,7 @@ impl Default for ResourceLimits { max_row_identity_tracked_db_id_bytes: DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES, max_auxiliary_artifacts: DEFAULT_MAX_AUXILIARY_ARTIFACTS, max_auxiliary_artifact_bytes: DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES, + max_calibration_profile_bytes: DEFAULT_MAX_CALIBRATION_PROFILE_BYTES, max_encoder_distortion_profile_bytes: DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES, max_report_issues: DEFAULT_MAX_REPORT_ISSUES, max_cached_report_bytes: DEFAULT_MAX_CACHED_REPORT_BYTES, @@ -2271,7 +2286,6 @@ impl Default for ResourceLimits { #[derive(Clone, Debug)] struct ResolvedPath { - resolved_path: PathBuf, canonical_path: PathBuf, } @@ -2347,10 +2361,7 @@ fn resolve_existing_path( return None; } - Some(ResolvedPath { - resolved_path, - canonical_path, - }) + Some(ResolvedPath { canonical_path }) } fn has_lexical_escape(path: &Path) -> bool { @@ -2725,6 +2736,7 @@ pub struct RowIdentityDb { #[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] +#[non_exhaustive] pub enum ManifestIndexKind { Rank, RankQuant, @@ -2733,18 +2745,20 @@ pub enum ManifestIndexKind { } impl ManifestIndexKind { - fn from_core(kind: CoreIndexKind) -> Self { + fn try_from_core(kind: CoreIndexKind) -> Result { match kind { - CoreIndexKind::Rank => Self::Rank, - CoreIndexKind::RankQuant => Self::RankQuant, - CoreIndexKind::Bitmap => Self::Bitmap, - CoreIndexKind::SignBitmap => Self::SignBitmap, + CoreIndexKind::Rank => Ok(Self::Rank), + CoreIndexKind::RankQuant => Ok(Self::RankQuant), + CoreIndexKind::Bitmap => Ok(Self::Bitmap), + CoreIndexKind::SignBitmap => Ok(Self::SignBitmap), + other => Err(UnsupportedCoreMetadata::Kind(other)), } } } #[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(tag = "kind", rename_all = "snake_case", deny_unknown_fields)] +#[non_exhaustive] pub enum ManifestIndexParams { Rank, RankQuant { bits: u8 }, @@ -2753,12 +2767,39 @@ pub enum ManifestIndexParams { } impl ManifestIndexParams { - fn from_core(params: CoreIndexParams) -> Self { + fn try_from_core(params: CoreIndexParams) -> Result { match params { - CoreIndexParams::Rank => Self::Rank, - CoreIndexParams::RankQuant { bits } => Self::RankQuant { bits }, - CoreIndexParams::Bitmap { n_top } => Self::Bitmap { n_top }, - CoreIndexParams::SignBitmap => Self::SignBitmap, + CoreIndexParams::Rank => Ok(Self::Rank), + CoreIndexParams::RankQuant { bits } => Ok(Self::RankQuant { bits }), + CoreIndexParams::Bitmap { n_top } => Ok(Self::Bitmap { n_top }), + CoreIndexParams::SignBitmap => Ok(Self::SignBitmap), + other => Err(UnsupportedCoreMetadata::Params(other)), + } + } +} + +#[derive(Copy, Clone, Debug)] +enum UnsupportedCoreMetadata { + Kind(CoreIndexKind), + Params(CoreIndexParams), +} + +impl UnsupportedCoreMetadata { + fn code(self) -> &'static str { + match self { + Self::Kind(_) => "artifact_kind_unsupported", + Self::Params(_) => "artifact_params_unsupported", + } + } + + fn message(self) -> String { + match self { + Self::Kind(kind) => { + format!("artifact metadata kind {kind:?} is not supported by ordvec-manifest v1") + } + Self::Params(params) => format!( + "artifact metadata params {params:?} are not supported by ordvec-manifest v1" + ), } } } @@ -3269,16 +3310,16 @@ pub struct MetadataReport { } impl MetadataReport { - fn from_core(metadata: &CoreIndexMetadata) -> Self { - Self { - kind: ManifestIndexKind::from_core(metadata.kind), + fn try_from_core(metadata: &CoreIndexMetadata) -> Result { + Ok(Self { + kind: ManifestIndexKind::try_from_core(metadata.kind)?, format_version: metadata.format_version, dim: metadata.dim, vector_count: metadata.vector_count, bytes_per_vec: metadata.bytes_per_vec, - params: ManifestIndexParams::from_core(metadata.params), + params: ManifestIndexParams::try_from_core(metadata.params)?, file_size_bytes: metadata.file_size_bytes, - } + }) } } @@ -3438,15 +3479,19 @@ pub fn create_manifest_for_index_with_options( } let metadata = probe_index_metadata(index_path)?; let index_hash = sha256_file(index_path)?; + let kind = ManifestIndexKind::try_from_core(metadata.kind) + .map_err(|err| ManifestError::invalid(err.message()))?; + let params = ManifestIndexParams::try_from_core(metadata.params) + .map_err(|err| ManifestError::invalid(err.message()))?; let artifact = Artifact { path: manifest_path_for_create(index_path, out_base, &options, "artifact")?, sha256: index_hash.sha256, - kind: ManifestIndexKind::from_core(metadata.kind), + kind, format_version: metadata.format_version, dim: metadata.dim, vector_count: metadata.vector_count, bytes_per_vec: metadata.bytes_per_vec, - params: ManifestIndexParams::from_core(metadata.params), + params, file_size_bytes: metadata.file_size_bytes, }; @@ -3834,6 +3879,8 @@ fn is_limit_issue_code(code: &str) -> bool { "row_identity_line_too_large" | "row_identity_row_count_limit_exceeded" | "row_identity_duplicate_tracking_limit_exceeded" + | "calibration_profile_too_large" + | "encoder_distortion_profile_too_large" | "verification_report_issue_limit_exceeded" ) } diff --git a/ordvec-manifest/src/main.rs b/ordvec-manifest/src/main.rs index d26754e1..6236878e 100644 --- a/ordvec-manifest/src/main.rs +++ b/ordvec-manifest/src/main.rs @@ -174,6 +174,8 @@ struct LimitArgs { #[arg(long)] max_auxiliary_artifact_bytes: Option, #[arg(long)] + max_calibration_profile_bytes: Option, + #[arg(long)] max_encoder_distortion_profile_bytes: Option, #[arg(long)] max_report_issues: Option, @@ -202,6 +204,9 @@ impl LimitArgs { if let Some(value) = self.max_auxiliary_artifact_bytes { limits.max_auxiliary_artifact_bytes = value; } + if let Some(value) = self.max_calibration_profile_bytes { + limits.max_calibration_profile_bytes = value; + } if let Some(value) = self.max_encoder_distortion_profile_bytes { limits.max_encoder_distortion_profile_bytes = value; } diff --git a/ordvec-manifest/src/sqlite.rs b/ordvec-manifest/src/sqlite.rs index bd6694cb..6368f9f3 100644 --- a/ordvec-manifest/src/sqlite.rs +++ b/ordvec-manifest/src/sqlite.rs @@ -399,7 +399,7 @@ fn current_cache_key( ) else { return Ok(None); }; - let artifact_sha256 = match sha256_file(&artifact.resolved_path) { + let artifact_sha256 = match sha256_file(&artifact.canonical_path) { Ok(hash) => hash.sha256, Err(_) => return Ok(None), }; @@ -424,7 +424,7 @@ fn current_cache_key( }; let mut row_errors = Vec::new(); let stats = match validate_jsonl_rows( - &row_identity.resolved_path, + &row_identity.canonical_path, options.allow_duplicate_db_ids, &options.limits, Some(*row_count), @@ -616,7 +616,12 @@ fn current_calibration_profile_sha256( ) else { return Ok(None); }; - match sha256_file(&resolved.resolved_path) { + match sha256_file_bounded( + &resolved.canonical_path, + options.limits.max_calibration_profile_bytes, + "calibration_profile_too_large", + "calibration profile", + ) { Ok(hash) => Ok(Some(hash.sha256)), Err(_) => Ok(None), } @@ -646,7 +651,7 @@ fn current_encoder_distortion_profile_sha256( return Ok(None); }; match sha256_file_bounded( - &resolved.resolved_path, + &resolved.canonical_path, options.limits.max_encoder_distortion_profile_bytes, "encoder_distortion_profile_too_large", "encoder distortion profile", diff --git a/ordvec-manifest/tests/manifest.rs b/ordvec-manifest/tests/manifest.rs index dab4dbe1..3a583e27 100644 --- a/ordvec-manifest/tests/manifest.rs +++ b/ordvec-manifest/tests/manifest.rs @@ -1765,6 +1765,19 @@ fn calibration_profile_artifact_checks_are_enforced() { Some(profile_hash.sha256.as_str()) ); + let report = verify_manifest_with_base( + manifest.clone(), + case.path(), + VerifyOptions { + limits: ResourceLimits { + max_calibration_profile_bytes: 16, + ..ResourceLimits::default() + }, + ..VerifyOptions::default() + }, + ); + assert!(error_codes(&report).contains(&"calibration_profile_too_large")); + let mut missing_profile = manifest.clone(); missing_profile.calibration.as_mut().unwrap().profile = None; let report = verify_manifest_with_base(missing_profile, case.path(), VerifyOptions::default()); @@ -3661,6 +3674,26 @@ fn sqlite_cache_key_includes_calibration_profile_bytes() { .unwrap(); assert!(report.ok, "{:?}", report.errors); + let limited = ordvec_manifest::sqlite::verify_with_registry( + &db, + &document, + &manifest_path, + VerifyOptions { + limits: ResourceLimits { + max_calibration_profile_bytes: 16, + ..ResourceLimits::default() + }, + ..VerifyOptions::default() + }, + true, + ) + .unwrap(); + assert!( + error_codes(&limited).contains(&"calibration_profile_too_large"), + "{:?}", + limited.errors + ); + fs::write( &profile_path, vec![1u8; manifest.artifact.dim * std::mem::size_of::()], diff --git a/ordvec-python/README.md b/ordvec-python/README.md index 4d39595e..90b80b9a 100644 --- a/ordvec-python/README.md +++ b/ordvec-python/README.md @@ -24,6 +24,10 @@ scores, ids = q.search_asymmetric(np.random.randn(8, 1024).astype(np.float32), k | `Bitmap` | Constant-weight top-bucket bitmap per document; `popcount(Q AND D)` candidate scoring. | | `SignBitmap` | Sign bitmap for sign-cosine candidate generation; separate from the constant-weight bitmap theorem. | +The Rust crate's `b = 8` RankQuant evidence/refinement width is not exposed +through the v0.5 Python `RankQuant` constructor and cannot be persisted to +`.ovrq`; use `bits` 1, 2, or 4 from Python. + ## Two-stage retrieval (subset rerank) A `Bitmap` / `SignBitmap` probe yields a candidate shortlist that diff --git a/src/fastscan.rs b/src/fastscan.rs index b856da4d..b33f585a 100644 --- a/src/fastscan.rs +++ b/src/fastscan.rs @@ -19,7 +19,9 @@ //! type — not the headline API. The free [`search_asymmetric_fastscan_b2`] //! entry point stays `pub(crate)`: production callers should reach for //! [`RankQuant::search_asymmetric`](crate::RankQuant::search_asymmetric), -//! whose AVX-512 → AVX2 → scalar dispatch is the maintained surface. Prefer +//! whose AVX-512 → AVX2 → scalar dispatch is the maintained exact surface. +//! FastScan itself dispatches AVX-512 when available and otherwise uses its +//! scalar kernel. Prefer //! FastScan only when b=2 scan latency is the binding constraint. //! This latency path is not part of the constant-weight bitmap overlap //! calibration theorem. @@ -654,7 +656,9 @@ impl RankQuantFastscan { /// Persist this index to a `.ovfs` file (magic `OVFS`). /// /// The on-disk form is a 13-byte header (`OVFS` magic, version, `dim`, - /// `n_vectors`) followed by the opaque block-32 packed FastScan payload. + /// `n_vectors`) followed by the block-32 packed FastScan payload. Each + /// real document byte must be a 4-bit code, each row must satisfy b=2 + /// constant composition, and block-tail padding must be zero. /// This is a new ordvec format with no turbovec-era counterpart. Round-trip /// is a type-level guarantee: [`Self::load`] reconstructs the same /// `(dim, n_vectors)` and packed buffer this writes. @@ -669,9 +673,9 @@ impl RankQuantFastscan { /// Load a `.ovfs` FastScan index previously written by [`Self::write`]. /// - /// The loader validates the header and that the payload length is exactly - /// the block-32 size implied by `(dim, n_vectors)` (`dim % 4 == 0`, no - /// trailing bytes), so the returned index is consistent by construction. + /// The loader validates the header, exact payload length, FastScan nibble + /// domain, b=2 constant composition, and zero tail padding, so the returned + /// index is consistent by construction. pub fn load(path: impl AsRef) -> std::io::Result { let (dim, n_vectors, packed_fs) = crate::rank_io::load_fastscan(path)?; Ok(Self { @@ -699,3 +703,76 @@ impl RankQuantFastscan { Self::read_from(std::io::Cursor::new(bytes)) } } + +#[cfg(test)] +mod tests { + use super::*; + + fn scalar_search_reference( + packed_fs: &[u8], + n: usize, + dim: usize, + queries: &[f32], + k: usize, + ) -> SearchResults { + let nq = queries.len() / dim; + let k = k.min(n); + let mut scores = vec![f32::NEG_INFINITY; nq * k]; + let mut indices = vec![-1i64; nq * k]; + if k == 0 { + return SearchResults { + scores: Vec::new(), + indices: Vec::new(), + nq, + k, + }; + } + + let centred_norm = rankquant_norm(dim, 2); + let inv_norm = 1.0_f32 / centred_norm; + for query_index in 0..nq { + let q = &queries[query_index * dim..(query_index + 1) * dim]; + let q_unit = l2_normalise(q); + let (lut_u8, bias_sum, inv_q) = build_fastscan_b2_query(&q_unit, dim); + let mut top = TopK::new(k); + scan_b2_fastscan_scalar( + packed_fs, n, dim, &lut_u8, bias_sum, inv_q, inv_norm, &mut top, + ); + top.finalize_into( + &mut scores[query_index * k..(query_index + 1) * k], + &mut indices[query_index * k..(query_index + 1) * k], + ); + } + + SearchResults { + scores, + indices, + nq, + k, + } + } + + #[test] + fn persisted_fastscan_dispatch_matches_scalar_reference() { + let dim = 64usize; + let n = 65usize; + let k = 9usize; + let docs: Vec = (0..n * dim) + .map(|i| (((i * 37 + 11) % 257) as f32 - 128.0) / 128.0) + .collect(); + let queries: Vec = (0..3 * dim) + .map(|i| (((i * 19 + 5) % 193) as f32 - 96.0) / 96.0) + .collect(); + + let mut idx = RankQuantFastscan::new(dim); + idx.add(&docs); + let mut bytes = Vec::new(); + idx.write_to(&mut bytes).unwrap(); + let loaded = RankQuantFastscan::load_from_bytes(&bytes).unwrap(); + + let scalar = scalar_search_reference(&loaded.packed_fs, loaded.n_vectors, dim, &queries, k); + let dispatched = loaded.search(&queries, k); + assert_eq!(dispatched.indices, scalar.indices); + assert_eq!(dispatched.scores, scalar.scores); + } +} diff --git a/src/lib.rs b/src/lib.rs index 6f7bdf5a..ea854f95 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,8 +28,8 @@ //! candidate generation. //! //! For b=2 specifically, [`RankQuantFastscan`] is a specialized companion to -//! [`RankQuant`] — a block-32 FastScan kernel (nibble LUT; AVX-512 → AVX2 → -//! scalar dispatch) for absolute-minimum stage-1 scan latency, trading 2× the +//! [`RankQuant`] — a block-32 FastScan kernel (nibble LUT; AVX-512 → scalar +//! dispatch) for absolute-minimum stage-1 scan latency, trading 2× the //! b=2 storage and 8-bit LUT scoring noise. Reach for it only when scan latency //! is the binding constraint. //! @@ -171,7 +171,7 @@ pub use const_weight_bitmap::{ }; // `RankQuantFastscan` is a specialized b=2 FastScan scan path (block-32 nibble -// LUT, AVX-512 → AVX2 → scalar dispatch) for absolute-minimum stage-1 scan +// LUT, AVX-512 → scalar dispatch) for absolute-minimum stage-1 scan // latency, at the cost of 2× the `RankQuant` b=2 storage and 8-bit LUT scoring // noise. It is a stable, documented public type, but a *specialized* one — the // headline retrieval surface is still `RankQuant` / `Bitmap` / two-stage; reach diff --git a/src/quant.rs b/src/quant.rs index 4e44e361..a06a07f0 100644 --- a/src/quant.rs +++ b/src/quant.rs @@ -204,7 +204,11 @@ pub enum RankQuantCapability { /// surface — but it is capability-gated: construct an asymmetric-only /// `b=8` index for non-`256`-aligned dims via [`Self::new_asymmetric`] /// and check [`Self::symmetric_supported`] before calling -/// [`Self::search`]. See [`RankQuantCapability`]. +/// [`Self::search`]. The b=8 asymmetric query LUT has `dim * 256` `f32` +/// entries per prepared query/worker, about 64 MiB at the maximum dimension. +/// In v0.5.0 this width is Rust-only and in-memory: the Python `RankQuant` +/// constructor rejects it and `.ovrq` persistence supports only +/// `bits ∈ {1, 2, 4}`. See [`RankQuantCapability`]. pub struct RankQuant { pub(crate) dim: usize, pub(crate) bits: u8, diff --git a/src/rank_io.rs b/src/rank_io.rs index 59a92562..67924556 100644 --- a/src/rank_io.rs +++ b/src/rank_io.rs @@ -37,6 +37,9 @@ //! rejected (v1 formats have no footer or reserved trailing section). //! * Per-index invariants (e.g., `dim % (1 << bits) == 0` for RankQuant) //! are returned as `Err(InvalidData)`, never `assert!`'d. +//! * FastScan `.ovfs` payloads are decoded far enough to reject invalid +//! nibbles, non-canonical tail padding, and rows that violate b=2 constant +//! composition before any search path can observe the bytes. //! //! Any malformed input returns `io::Error` rather than panicking. //! @@ -60,13 +63,11 @@ //! validates its parameters (matching the loaders' `dim` / `n_top` / `bits` / //! divisibility bounds), `add` caps `n_vectors` at [`MAX_VECTORS`], and the //! types emit only loader-valid data — so anything `T::write` produces, -//! `T::load` reloads. The raw `write_*` helpers are trusted serializers: they -//! assume loader-valid inputs (which only the index types construct) and do -//! *not* re-validate `dim` / `n_vectors` / structure / data semantics. The -//! 128 GiB `MAX_PAYLOAD` cap is the one loader bound they also enforce -//! (checked before `File::create`, so a rejected oversized write never -//! truncates an existing file) — defense-in-depth, and belt-and-braces now -//! that the helpers are no longer reachable with arbitrary external input. +//! `T::load` reloads. The raw `write_*` helpers are trusted serializers for +//! the private in-memory buffers; they still enforce the same header, length, +//! and size-cap guards as the loaders (and `.ovfs` also revalidates its public +//! payload bytes) before `File::create`, so a rejected write never truncates an +//! existing file. use std::fs::File; use std::io::{self, BufReader, BufWriter, Read, Seek, SeekFrom, Write}; @@ -91,6 +92,7 @@ const VERSION: u8 = 1; /// Persisted index family identified from an on-disk ordvec index header. #[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[non_exhaustive] pub enum IndexKind { Rank, RankQuant, @@ -100,6 +102,7 @@ pub enum IndexKind { /// Format-specific parameters declared by an on-disk ordvec index header. #[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[non_exhaustive] pub enum IndexParams { Rank, RankQuant { bits: u8 }, @@ -381,14 +384,11 @@ pub fn probe_index_metadata(path: impl AsRef) -> io::Result OVBM_MAGIC | TVBM_MAGIC => probe_bitmap_metadata(&mut f, file_size_bytes), OVSB_MAGIC | TVSB_MAGIC => probe_sign_bitmap_metadata(&mut f, file_size_bytes), // `OVFS` (RankQuantFastscan) is a recognized magic, but metadata probing - // is intentionally NOT wired up here yet: surfacing it would need a new - // `IndexKind` variant, and `IndexKind` is not `#[non_exhaustive]`, so - // adding one is a breaking change — deferred to the 0.8.0 API - // re-architecture (#232). `.ovfs` files still round-trip via - // `RankQuantFastscan::{write,load}`; only this metadata-probe path is - // pending. Return a specific, actionable error rather than letting it - // fall through to the generic unknown-magic case (which would be - // misleading, since the magic *is* known). + // is intentionally NOT wired up here yet. `.ovfs` files still + // round-trip via `RankQuantFastscan::{write,load}`; only this + // metadata-probe path is pending. Return a specific, actionable error + // rather than letting it fall through to the generic unknown-magic case + // (which would be misleading, since the magic *is* known). OVFS_MAGIC => Err(invalid( "OVFS (RankQuantFastscan) metadata probing is not supported in this \ version; load the index with RankQuantFastscan::load (tracked in #232)", @@ -1069,6 +1069,67 @@ fn fastscan_payload_bytes(dim: usize, vector_count: usize) -> io::Result .ok_or_else(|| invalid("OVFS payload size overflows usize")) } +fn validate_fastscan_payload(dim: usize, n_vectors: usize, packed_fs: &[u8]) -> io::Result<()> { + if n_vectors == 0 { + if packed_fs.is_empty() { + return Ok(()); + } + return Err(invalid(format!( + "OVFS payload is {} bytes but empty index implies 0", + packed_fs.len() + ))); + } + + let pairs = dim / 2; + let n_blocks = n_vectors.div_ceil(32); + let bytes_per_block = pairs * 32; + let expected_per_bucket = dim / 4; + + for block in 0..n_blocks { + let doc_base = block * 32; + let docs_in_block = (n_vectors - doc_base).min(32); + let block_offset = block * bytes_per_block; + + for lane in 0..docs_in_block { + let doc = doc_base + lane; + let mut bucket_counts = [0usize; 4]; + for pair in 0..pairs { + let offset = block_offset + pair * 32 + lane; + let byte = packed_fs[offset]; + if byte & 0xf0 != 0 { + return Err(invalid(format!( + "OVFS payload byte at block {block}, pair {pair}, lane {lane} \ + (document {doc}) has invalid FastScan nibble 0x{byte:02x}" + ))); + } + bucket_counts[((byte >> 2) & 0x03) as usize] += 1; + bucket_counts[(byte & 0x03) as usize] += 1; + } + if bucket_counts != [expected_per_bucket; 4] { + return Err(invalid(format!( + "OVFS document {doc} violates b=2 constant composition: \ + counts={bucket_counts:?}, expected {expected_per_bucket} per bucket" + ))); + } + } + + for lane in docs_in_block..32 { + for pair in 0..pairs { + let offset = block_offset + pair * 32 + lane; + let byte = packed_fs[offset]; + if byte != 0 { + return Err(invalid(format!( + "OVFS tail padding byte at block {block}, pair {pair}, lane {lane} \ + must be zero, got 0x{byte:02x}" + ))); + } + } + } + } + + Ok(()) +} + pub(crate) fn write_fastscan( path: impl AsRef, dim: usize, @@ -1110,6 +1171,7 @@ fn check_fastscan_write(dim: usize, n_vectors: usize, packed_fs: &[u8]) -> io::R packed_fs.len() ))); } + validate_fastscan_payload(dim, n_vectors, packed_fs)?; Ok(()) } @@ -1167,11 +1229,9 @@ fn load_fastscan_from_stream( let payload_bytes = fastscan_payload_bytes(dim, n_vectors)?; check_payload_bytes(payload_bytes)?; check_payload_matches_file(&mut f, "OVFS", file_len, payload_bytes)?; - // The packed FastScan payload is opaque pre-encoded nibbles in the block-32 - // transpose: any byte value is valid, so there is no per-row invariant to - // check beyond the exact payload length validated above. let mut packed_fs = try_alloc_zeroed(payload_bytes)?; f.read_exact(&mut packed_fs)?; + validate_fastscan_payload(dim, n_vectors, &packed_fs)?; Ok((dim, n_vectors, packed_fs)) } @@ -1803,7 +1863,13 @@ mod tests { use super::{load_fastscan, write_fastscan}; // dim=8 (multiple of 4), 4 vectors -> ceil(4/32)*(8/2)*32 = 128-byte payload. let (dim, n) = (8usize, 4usize); - let payload = vec![0u8; 128]; + let mut payload = vec![0u8; 128]; + for lane in 0..n { + payload[lane] = 0x00; + payload[32 + lane] = 0x05; + payload[64 + lane] = 0x0a; + payload[96 + lane] = 0x0f; + } let p = temp_index_path("ovfs_ok"); write_fastscan(&p, dim, n, &payload).unwrap(); let (ld, ln, lbytes) = load_fastscan(&p).unwrap(); @@ -1822,6 +1888,19 @@ mod tests { let e = write_fastscan(&p3, dim, n, &payload[..100]).unwrap_err(); assert_eq!(e.kind(), std::io::ErrorKind::InvalidData); assert!(!p3.exists(), "rejected write must not create a file"); + + // A byte that is not a real FastScan nibble is rejected on write, before + // a file can be created for the safe load/search APIs to observe. + let p4 = temp_index_path("ovfs_badnibble"); + let mut invalid_payload = payload.clone(); + invalid_payload[32] = 0x10; + let e = write_fastscan(&p4, dim, n, &invalid_payload).unwrap_err(); + assert_eq!(e.kind(), std::io::ErrorKind::InvalidData); + assert!( + e.to_string().contains("invalid FastScan nibble"), + "unexpected error: {e}" + ); + assert!(!p4.exists(), "rejected write must not create a file"); } // Probing a valid `.ovfs` file returns a specific, actionable error — NOT the @@ -1833,7 +1912,13 @@ mod tests { fn probe_rejects_ovfs_with_specific_unsupported_error() { use super::{probe_index_metadata, write_fastscan}; let (dim, n) = (8usize, 4usize); - let payload = vec![0u8; 128]; + let mut payload = vec![0u8; 128]; + for lane in 0..n { + payload[lane] = 0x00; + payload[32 + lane] = 0x05; + payload[64 + lane] = 0x0a; + payload[96 + lane] = 0x0f; + } let p = temp_index_path("ovfs_probe"); write_fastscan(&p, dim, n, &payload).unwrap(); let err = probe_index_metadata(&p); diff --git a/tests/index/fastscan.rs b/tests/index/fastscan.rs index ae4bca12..8e995526 100644 --- a/tests/index/fastscan.rs +++ b/tests/index/fastscan.rs @@ -269,13 +269,59 @@ fn fastscan_new_rejects_dim_above_u16_max() { // --------------------------------------------------------------------- fn fs_tmp(name: &str) -> std::path::PathBuf { + let nonce = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos(); std::env::temp_dir().join(format!( - "ordvec_fastscan_{}_{}.ovfs", + "ordvec_fastscan_{}_{}_{}.ovfs", name, - std::process::id() + std::process::id(), + nonce )) } +fn forge_ovfs(dim: usize, n_vectors: usize, payload: &[u8]) -> Vec { + let mut bytes = Vec::new(); + bytes.extend_from_slice(b"OVFS"); + bytes.push(1); + bytes.extend_from_slice(&(dim as u32).to_le_bytes()); + bytes.extend_from_slice(&(n_vectors as u32).to_le_bytes()); + bytes.extend_from_slice(payload); + bytes +} + +fn valid_dim4_n1_payload() -> Vec { + let mut payload = vec![0u8; 64]; + // Buckets [0, 1, 2, 3] -> one coordinate in each b=2 bucket. + payload[0] = 0x01; + payload[32] = 0x0b; + payload +} + +fn assert_fastscan_loaders_reject(bytes: &[u8], expected: &str) { + let path = fs_tmp("malformed_payload"); + std::fs::write(&path, bytes).unwrap(); + let path_err = RankQuantFastscan::load(&path).unwrap_err(); + std::fs::remove_file(&path).ok(); + assert!( + path_err.to_string().contains(expected), + "path loader returned unexpected error: {path_err}" + ); + + let reader_err = RankQuantFastscan::read_from(Cursor::new(bytes.to_vec())).unwrap_err(); + assert!( + reader_err.to_string().contains(expected), + "reader loader returned unexpected error: {reader_err}" + ); + + let bytes_err = RankQuantFastscan::load_from_bytes(bytes).unwrap_err(); + assert!( + bytes_err.to_string().contains(expected), + "byte-slice loader returned unexpected error: {bytes_err}" + ); +} + #[test] fn fastscan_write_load_roundtrip_searches_identically() { const FD: usize = 128; @@ -451,3 +497,27 @@ fn fastscan_load_rejects_dim_not_multiple_of_4() { std::fs::remove_file(&path).ok(); assert!(err.to_string().contains("multiple of 4"), "got: {err}"); } + +#[test] +fn fastscan_load_rejects_invalid_payload_nibble_on_all_public_loaders() { + let mut payload = valid_dim4_n1_payload(); + payload[32] = 0x10; + let bytes = forge_ovfs(4, 1, &payload); + assert_fastscan_loaders_reject(&bytes, "invalid FastScan nibble"); +} + +#[test] +fn fastscan_load_rejects_nonzero_tail_padding_on_all_public_loaders() { + let mut payload = valid_dim4_n1_payload(); + payload[1] = 0x01; + let bytes = forge_ovfs(4, 1, &payload); + assert_fastscan_loaders_reject(&bytes, "tail padding byte"); +} + +#[test] +fn fastscan_load_rejects_constant_composition_violation_on_all_public_loaders() { + let mut payload = valid_dim4_n1_payload(); + payload[0] = 0x00; + let bytes = forge_ovfs(4, 1, &payload); + assert_fastscan_loaders_reject(&bytes, "constant composition"); +} diff --git a/tests/release_publish_invariants.py b/tests/release_publish_invariants.py index 0f015523..17c6b5dc 100644 --- a/tests/release_publish_invariants.py +++ b/tests/release_publish_invariants.py @@ -28,6 +28,11 @@ SDE_ACTION_PATH = os.environ.get( "SDE_ACTION_PATH", ".github/actions/setup-intel-sde/action.yml" ) +PR_ONLY_SDE_ALLOW_UNAVAILABLE = "${{ github.event_name == 'pull_request' }}" +SDE_AVAILABLE_IF = "${{ steps.sde.outputs.sde-available == 'true' }}" +PR_SDE_UNAVAILABLE_IF = ( + "${{ github.event_name == 'pull_request' && steps.sde.outputs.sde-available != 'true' }}" +) PYPI_CANONICAL_EXPECTED_ARGS = ( "--expected-wheels 4", "--expected-sdists 1", @@ -105,6 +110,10 @@ def boolish_true(value: Any) -> bool: return value is True or (isinstance(value, str) and value.lower() == "true") +def boolish_false(value: Any) -> bool: + return value is False or (isinstance(value, str) and value.lower() == "false") + + def step_label(index: int, step: dict[str, Any]) -> str: name = step.get("name") if isinstance(name, str) and name: @@ -1808,20 +1817,24 @@ def check_sde_cache_job(workflow: dict[str, Any], path: str, job_name: str) -> N fail(f"{path}: jobs.{job_name} setup-intel-sde must receive env.SDE_VERSION") if setup_with.get("sha256") != "${{ env.SDE_SHA256 }}": fail(f"{path}: jobs.{job_name} setup-intel-sde must receive env.SDE_SHA256") - if not boolish_true(setup_with.get("allow-unavailable")): - fail(f"{path}: jobs.{job_name} must explicitly opt into the temporary SDE outage valve") + if setup_with.get("allow-unavailable") != PR_ONLY_SDE_ALLOW_UNAVAILABLE: + fail( + f"{path}: jobs.{job_name} may soften Intel SDE outages only on pull_request; " + "push and workflow_dispatch runs must fail closed" + ) - available_if = "steps.sde.outputs.sde-available == 'true'" - unavailable_if = "steps.sde.outputs.sde-available != 'true'" outage_notice_steps = [] for index, raw_step in enumerate(steps): step = mapping(raw_step, f"{path}: jobs.{job_name}.steps[{index}]") - if step.get("if") == unavailable_if and contains_text( + if step.get("if") == PR_SDE_UNAVAILABLE_IF and contains_text( step.get("run"), "Intel SDE archive unavailable" ): outage_notice_steps.append(step) if len(outage_notice_steps) != 1: - fail(f"{path}: jobs.{job_name} must emit exactly one notice when Intel SDE is unavailable") + fail( + f"{path}: jobs.{job_name} must emit exactly one PR-only Intel SDE outage notice; " + "release-gated runs must not green-skip AVX-512 coverage" + ) sde_guarded_names = { "Install cargo-llvm-cov (pinned)", @@ -1839,10 +1852,10 @@ def check_sde_cache_job(workflow: dict[str, Any], path: str, job_name: str) -> N or contains_nested_text(step.get("env"), "steps.sde.outputs.sde-path") or contains_text(step.get("run"), "SDE_PATH") ): - if step.get("if") != available_if: + if step.get("if") != SDE_AVAILABLE_IF: fail( - f"{path}: {step_label(index, step)} must be guarded by " - "steps.sde.outputs.sde-available" + f"{path}: {step_label(index, step)} must run after SDE setup succeeds, " + "and may be skipped only when PR-only SDE setup reports unavailable" )