From fe2aa12a3d02bf765a00ff95bc2f3fb1ea3ca4ed Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 21:31:54 -0500 Subject: [PATCH 1/4] feat: rename on-disk magics TV* -> OV* (ordvec format) with full back-compat Files written by the crate now use the ordvec magics OVR1/OVRQ/OVBM/OVSB (extensions .ovr/.ovrq/.ovbm/.ovsb), replacing the turbovec-era TV* magics. The read contract is unchanged: all loaders (rank_io.rs) AND the C ABI accept BOTH the current OV* and the legacy TV* magics, so every file the crate (or turbovec) ever wrote still loads. Only the write path changed. - src/rank_io.rs: OV* magic constants (written) + TV* retained read-only for back-compat; writers emit OV*; loaders + probe_index_metadata accept both. - ordvec-ffi: the C ABI sniff-magic dispatch accepts both OV* and TV*; probe path was already format-agnostic (uses IndexKind). - tests/persistence_compat.rs: forward fixtures now pin OV*; added tests proving legacy TV* files still load for all four index types. - Parity sweep (docs / extensions only, no logic): ordvec-manifest (+ python bindings), ordvec-python docstrings + tests, ordvec-go test, C header, fuzz targets, docs/*, README format line, SECURITY/THREAT_MODEL, CONTRIBUTING stable-surface statement (the read contract is never broken), .gitignore. Gate: fmt + clippy -D warnings (core/ffi/manifest) + full test suites (core exp+default, manifest, ffi) + ordvec-python check + rustdoc -D warnings. 
Signed-off-by: Nelson Spence --- .gitignore | 5 + CONTRIBUTING.md | 12 +- README.md | 3 +- SECURITY.md | 5 +- THREAT_MODEL.md | 6 +- docs/INDEX_PROVENANCE.md | 2 +- docs/PERSISTED_FORMAT.md | 34 ++++-- docs/c-api.md | 6 +- docs/compatibility-policy.md | 12 +- fuzz/fuzz_targets/load_bitmap.rs | 5 +- fuzz/fuzz_targets/load_rank.rs | 5 +- fuzz/fuzz_targets/load_rankquant.rs | 5 +- fuzz/fuzz_targets/load_sign_bitmap.rs | 7 +- fuzz/fuzz_targets/roundtrip_rankquant.rs | 2 +- fuzz/fuzz_targets/scratch.rs | 5 +- ordvec-ffi/include/ordvec.h | 8 +- ordvec-ffi/src/lib.rs | 36 +++--- ordvec-ffi/tests/c_link_smoke.rs | 4 +- ordvec-go/ordvec_test.go | 43 ++++++-- ordvec-manifest-python/README.md | 2 +- .../tests/test_manifest_bindings.py | 8 +- ordvec-manifest/README.md | 14 +-- ordvec-manifest/tests/manifest.rs | 32 +++--- ordvec-python/src/lib.rs | 28 +++-- ordvec-python/tests/test_bitmap.py | 6 +- ordvec-python/tests/test_rank.py | 4 +- ordvec-python/tests/test_rank_quant.py | 4 +- ordvec-python/tests/test_redteam_fuzz.py | 70 ++++++------ ordvec-python/tests/test_sign_bitmap.py | 6 +- src/bitmap.rs | 11 +- src/quant.rs | 23 ++-- src/rank.rs | 11 +- src/rank_io.rs | 53 +++++---- src/sign_bitmap.rs | 13 ++- tests/persistence_compat.rs | 103 +++++++++++++++++- 35 files changed, 391 insertions(+), 202 deletions(-) diff --git a/.gitignore b/.gitignore index d342254..de82992 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,11 @@ *.swp # Local index/serialisation artifacts produced by tests/benches. +# Current `.ov*` magics plus the legacy `.tv*` ones (still loadable, files persist). +*.ovr +*.ovrq +*.ovbm +*.ovsb *.tvr *.tvrq *.tvbm diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a22929f..329eb17 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,10 +19,14 @@ Contributions to the code, the docs, and the paper are all welcome. caveat. 
The Lean bitmap theorem proves a constant-weight overlap admission model under explicit assumptions; it is not a blanket retrieval guarantee. - **MSRV is Rust 1.89.** Don't use newer standard-library or language APIs. -- **Stable surface.** The persistence file magics (`.tvr` / `.tvrq` / - `.tvbm` / `.tvsb`) and the public method names - (`new` / `add` / `search` / `search_asymmetric*` / `top_m_candidates*` / - `write` / `load`) are stable — please don't rename them. +- **Stable surface.** The on-disk formats remain loadable forever: writers emit + the current `.ov*` magics (`.ovr` / `.ovrq` / `.ovbm` / `.ovsb`, renamed from + the turbovec-era `.tv*`), and the loaders accept **both** the current `.ov*` + and the legacy `.tv*` magics — so every file the crate has ever written still + loads. Only the write path changed; the read contract is never broken. The + public method names (`new` / `add` / `search` / `search_asymmetric*` / + `top_m_candidates*` / `write` / `load`) are likewise stable — please don't + rename them. - **Tests are required for new functionality.** As major new functionality is added, tests covering it MUST be added to the automated test suite (`cargo test`, plus `pytest` for the Python bindings). Changes that add diff --git a/README.md b/README.md index f6d7730..c2712f1 100644 --- a/README.md +++ b/README.md @@ -329,7 +329,8 @@ clean-checkout kernel sanity check. ## Security: index-file trust -The on-disk formats (`.tvr` / `.tvrq` / `.tvbm` / `.tvsb`) carry **no built-in +The on-disk formats (`.ovr` / `.ovrq` / `.ovbm` / `.ovsb`; legacy `.tvr` / +`.tvrq` / `.tvbm` / `.tvsb` files still load) carry **no built-in checksum, MAC, or signature — by design.** The loaders validate *structure* (magic, version, bounds, exact-length payload) but not *origin*: a structurally valid file can still be untrusted. 
If an index file crosses a diff --git a/SECURITY.md b/SECURITY.md index c2cba27..166b0a3 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -15,8 +15,9 @@ Use GitHub's private vulnerability reporting: We aim to acknowledge reports within a few business days. -`ordvec` parses serialized index files (`.tvr` / `.tvrq` / `.tvbm` / -`.tvsb`); the loaders are fuzzed (`cargo +nightly fuzz`), so +`ordvec` parses serialized index files (`.ovr` / `.ovrq` / `.ovbm` / +`.ovsb`; the loaders also accept the legacy `.tvr` / `.tvrq` / `.tvbm` / +`.tvsb` magics); the loaders are fuzzed (`cargo +nightly fuzz`), so parsing-robustness reports against the deserialization paths are especially welcome. Reports are also welcome against the `unsafe` SIMD kernels (shape / bounds invariants), the Python FFI contract (buffer handling, GIL discipline), diff --git a/THREAT_MODEL.md b/THREAT_MODEL.md index aa8c086..3ca834d 100644 --- a/THREAT_MODEL.md +++ b/THREAT_MODEL.md @@ -66,7 +66,7 @@ absence of a second maintainer is itself a tracked supply-chain residual | Layer | Components | Trust boundary | |---|---|---| -| **Deserialization** | `rank_io.rs` — `.tvr` / `.tvrq` / `.tvbm` / `.tvsb` loaders | Untrusted filesystem / network byte stream | +| **Deserialization** | `rank_io.rs` — `.ovr` / `.ovrq` / `.ovbm` / `.ovsb` loaders (also accept the legacy `.tvr` / `.tvrq` / `.tvbm` / `.tvsb` magics) | Untrusted filesystem / network byte stream | | **Manifest verification** | `ordvec-manifest` — JSON sidecar verifier | Manifest + index + optional row-map files before load | | **Compute kernels** | `fastscan.rs`, `quant_kernels.rs`, `bitmap.rs`, `sign_bitmap.rs` | Trust established after format validation | | **Index API** | `rank.rs`, `quant.rs`, `bitmap.rs`, `sign_bitmap.rs` | Caller-controlled query embeddings | @@ -221,8 +221,8 @@ those kernels, and layering ASAN onto the existing SDE leg remains a follow-up. 
### 4.1 C ABI defenses (code-verified) -`ordvec-ffi` exposes only loaded `.tvrq` `RankQuant` and `.tvbm` `Bitmap` -indexes through one opaque handle. The ABI checks raw pointer nullness and +`ordvec-ffi` exposes only loaded `.ovrq` `RankQuant` and `.ovbm` `Bitmap` +indexes (legacy `.tvrq` / `.tvbm` files also load) through one opaque handle. The ABI checks raw pointer nullness and caller-supplied lengths before use, requires exact v1 `struct_size` values for input structs, rejects unknown flags and nonzero reserved input fields, validates query dimension and finiteness before entering core search, diff --git a/docs/INDEX_PROVENANCE.md b/docs/INDEX_PROVENANCE.md index d83b6b4..6f1b98e 100644 --- a/docs/INDEX_PROVENANCE.md +++ b/docs/INDEX_PROVENANCE.md @@ -1,6 +1,6 @@ # Index file provenance -`ordvec` persists indexes as `.tvr` / `.tvrq` / `.tvbm` / `.tvsb` files and +`ordvec` persists indexes as `.ovr` / `.ovrq` / `.ovbm` / `.ovsb` files and reloads them through `Rank::load`, `RankQuant::load`, `Bitmap::load`, and `SignBitmap::load`. This note states exactly **what the loaders guarantee and what they do not**, so you can decide whether an index file needs out-of-band diff --git a/docs/PERSISTED_FORMAT.md b/docs/PERSISTED_FORMAT.md index 4c3da8c..1d53282 100644 --- a/docs/PERSISTED_FORMAT.md +++ b/docs/PERSISTED_FORMAT.md @@ -1,8 +1,8 @@ # Persisted Index Format This document is the compatibility contract for ordvec persisted index files. -It covers the primitive index artifacts only: `.tvr`, `.tvrq`, `.tvbm`, and -`.tvsb`. It does not define a database, transaction log, replication protocol, +It covers the primitive index artifacts only: `.ovr`, `.ovrq`, `.ovbm`, and +`.ovsb`. It does not define a database, transaction log, replication protocol, provenance system, checksum manifest, signature, or trust policy. All integer fields are little-endian. 
Each format has one fixed header followed @@ -58,7 +58,7 @@ Example external segment entry: ```json { - "path": "segments/shard-0007/index.tvrq", + "path": "segments/shard-0007/index.ovrq", "sha256": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", "metadata": { "kind": "RankQuant", @@ -92,13 +92,16 @@ persisted row. ## Format Layouts -### Rank (`.tvr`, magic `TVR1`) +### Rank (`.ovr`, magic `OVR1`) + +Current writers emit magic `OVR1`. Loaders also accept the legacy magic `TVR1` +(written by versions before the format rename). Header: | Offset | Bytes | Field | | ---: | ---: | --- | -| 0 | 4 | magic `TVR1` | +| 0 | 4 | magic `OVR1` (or legacy `TVR1`) | | 4 | 1 | format version `1` | | 5 | 4 | `dim` as `u32` little-endian | | 9 | 4 | `n_vectors` as `u32` little-endian | @@ -112,13 +115,16 @@ Probe metadata: - `params = Rank` - `bytes_per_vec = dim * 2` -### RankQuant (`.tvrq`, magic `TVRQ`) +### RankQuant (`.ovrq`, magic `OVRQ`) + +Current writers emit magic `OVRQ`. Loaders also accept the legacy magic `TVRQ` +(written by versions before the format rename). Header: | Offset | Bytes | Field | | ---: | ---: | --- | -| 0 | 4 | magic `TVRQ` | +| 0 | 4 | magic `OVRQ` (or legacy `TVRQ`) | | 4 | 1 | format version `1` | | 5 | 1 | `bits` as `u8`, one of `1`, `2`, or `4` | | 6 | 4 | `dim` as `u32` little-endian | @@ -139,13 +145,16 @@ Probe metadata: - `params = RankQuant { bits }` - `bytes_per_vec = dim * bits / 8` -### Bitmap (`.tvbm`, magic `TVBM`) +### Bitmap (`.ovbm`, magic `OVBM`) + +Current writers emit magic `OVBM`. Loaders also accept the legacy magic `TVBM` +(written by versions before the format rename). 
Header: | Offset | Bytes | Field | | ---: | ---: | --- | -| 0 | 4 | magic `TVBM` | +| 0 | 4 | magic `OVBM` (or legacy `TVBM`) | | 4 | 1 | format version `1` | | 5 | 4 | `dim` as `u32` little-endian | | 9 | 4 | `n_top` as `u32` little-endian | @@ -161,13 +170,16 @@ Probe metadata: - `params = Bitmap { n_top }` - `bytes_per_vec = dim / 8` -### SignBitmap (`.tvsb`, magic `TVSB`) +### SignBitmap (`.ovsb`, magic `OVSB`) + +Current writers emit magic `OVSB`. Loaders also accept the legacy magic `TVSB` +(written by versions before the format rename). Header: | Offset | Bytes | Field | | ---: | ---: | --- | -| 0 | 4 | magic `TVSB` | +| 0 | 4 | magic `OVSB` (or legacy `TVSB`) | | 4 | 1 | format version `1` | | 5 | 4 | `dim` as `u32` little-endian | | 9 | 4 | `n_vectors` as `u32` little-endian | diff --git a/docs/c-api.md b/docs/c-api.md index a2f3fd7..d936ae3 100644 --- a/docs/c-api.md +++ b/docs/c-api.md @@ -1,7 +1,7 @@ # C API -`ordvec-ffi` exposes a small ABI v1 for loading persisted `.tvrq` -`RankQuant` and `.tvbm` `Bitmap` indexes and running synchronous single-query +`ordvec-ffi` exposes a small ABI v1 for loading persisted `.ovrq` +`RankQuant` and `.ovbm` `Bitmap` indexes and running synchronous single-query searches. The public header is [`../ordvec-ffi/include/ordvec.h`](../ordvec-ffi/include/ordvec.h). ## Build and Link @@ -33,7 +33,7 @@ When linking dynamically, make sure your platform's loader can find int main(void) { ordvec_index_t *index = NULL; - ordvec_status_t st = ordvec_index_load("index.tvrq", 0, &index); + ordvec_status_t st = ordvec_index_load("index.ovrq", 0, &index); if (st != ORDVEC_STATUS_OK) { fprintf(stderr, "load failed: %s\n", ordvec_last_error()); return 1; diff --git a/docs/compatibility-policy.md b/docs/compatibility-policy.md index a515cce..d96bfd6 100644 --- a/docs/compatibility-policy.md +++ b/docs/compatibility-policy.md @@ -121,10 +121,14 @@ with documented migration steps. 
The primitive index formats are the files written and loaded by the core index types: -- `.tvr` / `TVR1` for `Rank`; -- `.tvrq` / `TVRQ` for `RankQuant`; -- `.tvbm` / `TVBM` for `Bitmap`; -- `.tvsb` / `TVSB` for `SignBitmap`. +- `.ovr` / `OVR1` for `Rank`; +- `.ovrq` / `OVRQ` for `RankQuant`; +- `.ovbm` / `OVBM` for `Bitmap`; +- `.ovsb` / `OVSB` for `SignBitmap`. + +Legacy files using the old turbovec-era magics (`TVR1`, `TVRQ`, `TVBM`, `TVSB` +and extensions `.tvr`, `.tvrq`, `.tvbm`, `.tvsb`) are still accepted by current +loaders. Writers no longer emit those magics. Patch releases should keep valid files from the same minor series loadable. Loader hardening may reject malformed files, forged sizes, trailing bytes, bad diff --git a/fuzz/fuzz_targets/load_bitmap.rs b/fuzz/fuzz_targets/load_bitmap.rs index 7727788..985aa8d 100644 --- a/fuzz/fuzz_targets/load_bitmap.rs +++ b/fuzz/fuzz_targets/load_bitmap.rs @@ -1,5 +1,6 @@ -//! libFuzzer target for the `.tvbm` / `TVBM` loader, driven through the -//! public `ordvec::Bitmap::load` entry point. +//! libFuzzer target for the `.ovbm` / `OVBM` loader (which also accepts the +//! legacy `.tvbm` / `TVBM` magic), driven through the public +//! `ordvec::Bitmap::load` entry point. //! //! The low-level `rank_io::load_bitmap` parser is crate-internal //! (`pub(crate)`), so the fuzzer exercises it through `Bitmap::load` — which diff --git a/fuzz/fuzz_targets/load_rank.rs b/fuzz/fuzz_targets/load_rank.rs index 1a0dee7..62488b7 100644 --- a/fuzz/fuzz_targets/load_rank.rs +++ b/fuzz/fuzz_targets/load_rank.rs @@ -1,5 +1,6 @@ -//! libFuzzer target for the `.tvr` / `TVR1` loader, driven through the -//! public `ordvec::Rank::load` entry point. +//! libFuzzer target for the `.ovr` / `OVR1` loader (which also accepts the +//! legacy `.tvr` / `TVR1` magic), driven through the public `ordvec::Rank::load` +//! entry point. //! //! The low-level `rank_io::load_rank` parser is crate-internal (`pub(crate)`), //! 
so the fuzzer exercises it through `Rank::load` — which runs that exact diff --git a/fuzz/fuzz_targets/load_rankquant.rs b/fuzz/fuzz_targets/load_rankquant.rs index 1dc8a41..95b329b 100644 --- a/fuzz/fuzz_targets/load_rankquant.rs +++ b/fuzz/fuzz_targets/load_rankquant.rs @@ -1,5 +1,6 @@ -//! libFuzzer target for the `.tvrq` / `TVRQ` loader, driven through the -//! public `ordvec::RankQuant::load` entry point. +//! libFuzzer target for the `.ovrq` / `OVRQ` loader (which also accepts the +//! legacy `.tvrq` / `TVRQ` magic), driven through the public +//! `ordvec::RankQuant::load` entry point. //! //! The low-level `rank_io::load_rankquant` parser is crate-internal //! (`pub(crate)`), so the fuzzer exercises it through `RankQuant::load` — diff --git a/fuzz/fuzz_targets/load_sign_bitmap.rs b/fuzz/fuzz_targets/load_sign_bitmap.rs index 083c2dc..061f986 100644 --- a/fuzz/fuzz_targets/load_sign_bitmap.rs +++ b/fuzz/fuzz_targets/load_sign_bitmap.rs @@ -1,5 +1,6 @@ -//! libFuzzer target for the `.tvsb` / `TVSB` loader, driven through the -//! public `ordvec::SignBitmap::load` entry point. +//! libFuzzer target for the `.ovsb` / `OVSB` loader (which also accepts the +//! legacy `.tvsb` / `TVSB` magic), driven through the public +//! `ordvec::SignBitmap::load` entry point. //! //! The low-level `rank_io::load_sign_bitmap` parser is crate-internal //! (`pub(crate)`), so the fuzzer exercises it through `SignBitmap::load` — @@ -13,7 +14,7 @@ //! Contract: on arbitrary bytes the loader must return `Ok(..)` or //! `Err(..)` — never panic, abort, or read out of bounds. libFuzzer //! treats any panic/abort as a crash, so simply letting the result drop -//! is the assertion. The `.tvsb` dim validation path differs from the +//! is the assertion. The `.ovsb` dim validation path differs from the //! other three (`MAX_SIGN_BITMAP_DIM`, multiple-of-64), so it gets its //! own target rather than riding on `load_bitmap`. 
diff --git a/fuzz/fuzz_targets/roundtrip_rankquant.rs b/fuzz/fuzz_targets/roundtrip_rankquant.rs index 04814b7..d9d64f5 100644 --- a/fuzz/fuzz_targets/roundtrip_rankquant.rs +++ b/fuzz/fuzz_targets/roundtrip_rankquant.rs @@ -46,7 +46,7 @@ fuzz_target!(|data: &[u8]| { Ok(d) => d, Err(_) => return, }; - let path = dir.path().join("roundtrip.tvrq"); + let path = dir.path().join("roundtrip.ovrq"); idx.write(&path).expect("write of a validly-built index must succeed"); let reloaded = RankQuant::load(&path).expect("write output must reload (round-trip)"); assert_eq!(reloaded.dim(), idx.dim()); diff --git a/fuzz/fuzz_targets/scratch.rs b/fuzz/fuzz_targets/scratch.rs index 634c836..24053b4 100644 --- a/fuzz/fuzz_targets/scratch.rs +++ b/fuzz/fuzz_targets/scratch.rs @@ -1,5 +1,6 @@ -//! Shared per-worker scratch temp file for the `.tvr` / `.tvrq` / `.tvbm` / -//! `.tvsb` loader fuzz targets. +//! Shared per-worker scratch temp file for the `.ovr` / `.ovrq` / `.ovbm` / +//! `.ovsb` loader fuzz targets (the loaders also accept the legacy `.tv*` +//! magics). //! //! # Why this exists (issue #6) //! diff --git a/ordvec-ffi/include/ordvec.h b/ordvec-ffi/include/ordvec.h index 6907655..68a0ac5 100644 --- a/ordvec-ffi/include/ordvec.h +++ b/ordvec-ffi/include/ordvec.h @@ -180,7 +180,8 @@ void ordvec_search_params_init(ordvec_search_params_t *params); void ordvec_search_stats_init(ordvec_search_stats_t *stats); /** - * Load a `.tvrq` RankQuant or `.tvbm` Bitmap index. + * Load a `.ovrq` RankQuant or `.ovbm` Bitmap index (legacy `.tvrq` / `.tvbm` + * files are also accepted). * * # Safety * @@ -190,8 +191,9 @@ void ordvec_search_stats_init(ordvec_search_stats_t *stats); ordvec_status_t ordvec_index_load(const char *path, uint64_t flags, ordvec_index_t **out); /** - * Probe on-disk metadata for a `.tvrq` RankQuant or `.tvbm` Bitmap index - * without loading payload rows into an index handle. 
+ * Probe on-disk metadata for a `.ovrq` RankQuant or `.ovbm` Bitmap index + * (legacy `.tvrq` / `.tvbm` also accepted) without loading payload rows into an + * index handle. * * This validates the fixed header, declared dimensions, payload byte count, * and exact file length. Full row-invariant validation remains the job of diff --git a/ordvec-ffi/src/lib.rs b/ordvec-ffi/src/lib.rs index 773e871..8a38580 100644 --- a/ordvec-ffi/src/lib.rs +++ b/ordvec-ffi/src/lib.rs @@ -370,7 +370,7 @@ fn info_for_metadata(meta: &IndexMetadata) -> Result ORDVEC_INDEX_KIND_BITMAP, IndexKind::Rank | IndexKind::SignBitmap => return Err(FfiError::new( ORDVEC_STATUS_UNSUPPORTED_FORMAT, - "ABI v1 supports metadata probes only for TVRQ RankQuant and TVBM Bitmap indexes", + "ABI v1 supports metadata probes only for RankQuant and Bitmap indexes", )), }; info.format_version = u32::from(meta.format_version); @@ -671,7 +671,8 @@ pub unsafe extern "C" fn ordvec_search_stats_init(stats: *mut ordvec_search_stat } #[no_mangle] -/// Load a `.tvrq` RankQuant or `.tvbm` Bitmap index. +/// Load a `.ovrq` RankQuant or `.ovbm` Bitmap index (legacy `.tvrq` / `.tvbm` +/// files are also accepted). /// /// # Safety /// @@ -719,17 +720,19 @@ pub unsafe extern "C" fn ordvec_index_load( .map_err(|err| io_to_ffi(err, "stat index"))? .len(); + // Accept both the current `OV*` magics and the legacy turbovec-era + // `TV*` magics (back-compat) — mirrors the loaders in `rank_io.rs`. 
let index = match &magic { - b"TVRQ" => LoadedIndex::RankQuant( - RankQuant::load(path).map_err(|err| io_to_ffi(err, "load TVRQ index"))?, + b"OVRQ" | b"TVRQ" => LoadedIndex::RankQuant( + RankQuant::load(path).map_err(|err| io_to_ffi(err, "load RankQuant index"))?, ), - b"TVBM" => LoadedIndex::Bitmap( - Bitmap::load(path).map_err(|err| io_to_ffi(err, "load TVBM index"))?, + b"OVBM" | b"TVBM" => LoadedIndex::Bitmap( + Bitmap::load(path).map_err(|err| io_to_ffi(err, "load Bitmap index"))?, ), - b"TVR1" | b"TVSB" => { + b"OVR1" | b"OVSB" | b"TVR1" | b"TVSB" => { return Err(FfiError::new( ORDVEC_STATUS_UNSUPPORTED_FORMAT, - "ABI v1 supports only TVRQ RankQuant and TVBM Bitmap indexes", + "ABI v1 supports only RankQuant and Bitmap indexes", )) } _ => { @@ -753,8 +756,9 @@ pub unsafe extern "C" fn ordvec_index_load( } #[no_mangle] -/// Probe on-disk metadata for a `.tvrq` RankQuant or `.tvbm` Bitmap index -/// without loading payload rows into an index handle. +/// Probe on-disk metadata for a `.ovrq` RankQuant or `.ovbm` Bitmap index +/// (legacy `.tvrq` / `.tvbm` also accepted) without loading payload rows into an +/// index handle. /// /// This validates the fixed header, declared dimensions, payload byte count, /// and exact file length. 
Full row-invariant validation remains the job of @@ -992,7 +996,7 @@ mod tests { } fn make_rankquant_fixture() -> std::path::PathBuf { - let path = temp_path("rankquant", "tvrq"); + let path = temp_path("rankquant", "ovrq"); let mut index = RankQuant::new(16, 2); let doc: Vec = (0..16).map(|x| x as f32).collect(); let mut corpus = Vec::new(); @@ -1005,7 +1009,7 @@ mod tests { } fn make_bitmap_fixture() -> std::path::PathBuf { - let path = temp_path("bitmap", "tvbm"); + let path = temp_path("bitmap", "ovbm"); let mut index = Bitmap::new(64, 4); let mut doc = vec![0.0f32; 64]; for (j, value) in doc.iter_mut().take(4).enumerate() { @@ -1454,20 +1458,20 @@ mod tests { #[test] fn load_maps_unsupported_and_corrupt_formats() { - let rank_path = temp_path("rank", "tvr"); + let rank_path = temp_path("rank", "ovr"); let mut rank = Rank::new(16); rank.add(&[0.0f32; 16]); rank.write(&rank_path).unwrap(); - let sign_path = temp_path("sign", "tvsb"); + let sign_path = temp_path("sign", "ovsb"); let mut sign = SignBitmap::new(64); sign.add(&[0.0f32; 64]); sign.write(&sign_path).unwrap(); - let corrupt_path = temp_path("corrupt", "tvrq"); + let corrupt_path = temp_path("corrupt", "ovrq"); std::fs::File::create(&corrupt_path) .unwrap() - .write_all(b"TVRQ\x01") + .write_all(b"OVRQ\x01") .unwrap(); unsafe { diff --git a/ordvec-ffi/tests/c_link_smoke.rs b/ordvec-ffi/tests/c_link_smoke.rs index 8e93224..408ca98 100644 --- a/ordvec-ffi/tests/c_link_smoke.rs +++ b/ordvec-ffi/tests/c_link_smoke.rs @@ -25,7 +25,7 @@ fn write_file(path: &Path, body: &[u8]) { fn write_rankquant_fixture(path: &Path) { let mut bytes = Vec::new(); - bytes.extend_from_slice(b"TVRQ"); + bytes.extend_from_slice(b"OVRQ"); bytes.push(1); bytes.push(2); bytes.extend_from_slice(&16u32.to_le_bytes()); @@ -98,7 +98,7 @@ fn c_program_links_and_runs_against_static_library() { lib.display() ); - let fixture = temp_path("linked_fixture", "tvrq"); + let fixture = temp_path("linked_fixture", "ovrq"); 
write_rankquant_fixture(&fixture); let src = temp_path("linked_smoke", "c"); diff --git a/ordvec-go/ordvec_test.go b/ordvec-go/ordvec_test.go index c775b06..e18e8d2 100644 --- a/ordvec-go/ordvec_test.go +++ b/ordvec-go/ordvec_test.go @@ -13,11 +13,14 @@ import ( "testing" ) -func writeRankQuantFixture(t *testing.T) string { +// writeRankQuantFixtureMagic builds a RankQuant fixture with the given 4-byte +// magic and file extension. The loader accepts both the current "OVRQ" magic and +// the legacy "TVRQ" magic, so this is parameterised to exercise both. +func writeRankQuantFixtureMagic(t *testing.T, magic, ext string) string { t.Helper() - path := filepath.Join(t.TempDir(), "fixture.tvrq") + path := filepath.Join(t.TempDir(), "fixture."+ext) var b []byte - b = append(b, []byte("TVRQ")...) + b = append(b, []byte(magic)...) b = append(b, 1) // version b = append(b, 2) // bits b = binary.LittleEndian.AppendUint32(b, 16) @@ -32,11 +35,18 @@ func writeRankQuantFixture(t *testing.T) string { return path } +// writeRankQuantFixture builds a RankQuant fixture in the current on-disk format +// ("OVRQ" magic, ".ovrq" extension). +func writeRankQuantFixture(t *testing.T) string { + t.Helper() + return writeRankQuantFixtureMagic(t, "OVRQ", "ovrq") +} + func writeBitmapFixture(t *testing.T) string { t.Helper() - path := filepath.Join(t.TempDir(), "fixture.tvbm") + path := filepath.Join(t.TempDir(), "fixture.ovbm") var b []byte - b = append(b, []byte("TVBM")...) + b = append(b, []byte("OVBM")...) b = append(b, 1) // version b = binary.LittleEndian.AppendUint32(b, 64) b = binary.LittleEndian.AppendUint32(b, 4) @@ -121,6 +131,25 @@ func TestLoadInfoSearchRankQuant(t *testing.T) { } } +// TestLoadsLegacyTVMagic confirms the C ABI still loads files written with the +// pre-rename "TVRQ" magic (legacy turbovec-era on-disk format). New files are +// written with "OVRQ"; the loader accepts both, so old indexes never break. 
+func TestLoadsLegacyTVMagic(t *testing.T) { + idx, err := Load(writeRankQuantFixtureMagic(t, "TVRQ", "tvrq")) + if err != nil { + t.Fatal(err) + } + defer idx.Close() + + info, err := idx.Info() + if err != nil { + t.Fatal(err) + } + if info.Kind != KindRankQuant || info.Dim != 16 || info.BitWidth != 2 || info.VectorCount != 4 { + t.Fatalf("unexpected info from legacy TVRQ fixture: %+v", info) + } +} + func TestProbeRankQuantInfo(t *testing.T) { path := writeRankQuantFixture(t) @@ -253,14 +282,14 @@ func TestTypedStatusErrors(t *testing.T) { t.Fatalf("unexpected status: %v", statusErr.Status) } - _, err = Load(filepath.Join(t.TempDir(), "missing.tvrq")) + _, err = Load(filepath.Join(t.TempDir(), "missing.ovrq")) if !errors.As(err, &statusErr) || statusErr.Status != StatusIO { t.Fatalf("missing file should be IO status, got %T %[1]v", err) } } func TestLoadRejectsNullBytePath(t *testing.T) { - _, err := Load("bad\x00path.tvrq") + _, err := Load("bad\x00path.ovrq") if err == nil || !strings.Contains(err.Error(), "null byte") { t.Fatalf("Load should reject null byte paths, got %v", err) } diff --git a/ordvec-manifest-python/README.md b/ordvec-manifest-python/README.md index 6ca81b0..9fa3b11 100644 --- a/ordvec-manifest-python/README.md +++ b/ordvec-manifest-python/README.md @@ -24,7 +24,7 @@ Create manifests with caller-owned sidecars by passing dictionaries with ```python manifest = ordvec_manifest.create_manifest( - "index.tvrq", + "index.ovrq", "index.manifest.json", "bge-small-en-v1.5", row_id_is_identity=True, diff --git a/ordvec-manifest-python/tests/test_manifest_bindings.py b/ordvec-manifest-python/tests/test_manifest_bindings.py index 0bea7e9..320262d 100644 --- a/ordvec-manifest-python/tests/test_manifest_bindings.py +++ b/ordvec-manifest-python/tests/test_manifest_bindings.py @@ -12,7 +12,7 @@ def write_rankquant_index(path: Path, *, dim: int = 16, rows: int = 2, bits: int = 2): bytes_per_vec = dim * bits // 8 path.write_bytes( - b"TVRQ" + b"OVRQ" + 
bytes([1, bits]) + dim.to_bytes(4, "little") + rows.to_bytes(4, "little") @@ -21,7 +21,7 @@ def write_rankquant_index(path: Path, *, dim: int = 16, rows: int = 2, bits: int def write_unloadable_manifest(tmp_path): - artifact = tmp_path / "index.tvrq" + artifact = tmp_path / "index.ovrq" artifact.write_bytes(b"not an ordvec index") digest = hashlib.sha256(artifact.read_bytes()).hexdigest() manifest = { @@ -92,7 +92,7 @@ def test_verify_for_load_preserves_manifest_io_errors(tmp_path): def test_create_manifest_requires_explicit_row_identity(tmp_path): - index = tmp_path / "index.tvrq" + index = tmp_path / "index.ovrq" index.write_bytes(b"not an ordvec index") with pytest.raises(ValueError, match="row_map or row_id_is_identity"): @@ -100,7 +100,7 @@ def test_create_manifest_requires_explicit_row_identity(tmp_path): def test_create_manifest_accepts_auxiliary_artifacts(tmp_path): - index = tmp_path / "index.tvrq" + index = tmp_path / "index.ovrq" ids = tmp_path / "ids.bin" optional = tmp_path / "optional.json" manifest_path = tmp_path / "manifest.json" diff --git a/ordvec-manifest/README.md b/ordvec-manifest/README.md index 58ddc41..2b64a2c 100644 --- a/ordvec-manifest/README.md +++ b/ordvec-manifest/README.md @@ -18,7 +18,7 @@ library default feature set is empty and does not depend on `clap`. 
```sh ordvec-manifest create \ - --index path/to/index.tvrq \ + --index path/to/index.ovrq \ --row-id-is-identity \ --aux app.ids=path/to/ids.bin \ --embedding-model bge-small-en-v1.5 \ @@ -163,9 +163,9 @@ paths, declared digest/length, and observed digest/length: "checked_at": "2026-06-03T17:20:00Z", "manifest_id": "urn:uuid:11111111-1111-4111-8111-111111111111", "artifact": { - "manifest_path": "index.tvrq", - "observed_path": "index.tvrq", - "canonical_path": "/srv/index/index.tvrq", + "manifest_path": "index.ovrq", + "observed_path": "index.ovrq", + "canonical_path": "/srv/index/index.ovrq", "sha256": "1111111111111111111111111111111111111111111111111111111111111111", "size_bytes": 4096, "metadata": null @@ -222,9 +222,9 @@ read and absent when the file is missing: "checked_at": "2026-06-03T17:21:00Z", "manifest_id": "urn:uuid:11111111-1111-4111-8111-111111111111", "artifact": { - "manifest_path": "index.tvrq", - "observed_path": "index.tvrq", - "canonical_path": "/srv/index/index.tvrq", + "manifest_path": "index.ovrq", + "observed_path": "index.ovrq", + "canonical_path": "/srv/index/index.ovrq", "sha256": "1111111111111111111111111111111111111111111111111111111111111111", "size_bytes": 4096, "metadata": null diff --git a/ordvec-manifest/tests/manifest.rs b/ordvec-manifest/tests/manifest.rs index a555f97..dab4dbe 100644 --- a/ordvec-manifest/tests/manifest.rs +++ b/ordvec-manifest/tests/manifest.rs @@ -19,7 +19,7 @@ use std::path::{Path, PathBuf}; use std::process::Command; fn write_index(dir: &Path) -> PathBuf { - let path = dir.join("index.tvrq"); + let path = dir.join("index.ovrq"); let mut index = RankQuant::new(16, 2); let docs: Vec = (0..32).map(|i| i as f32 - 12.0).collect(); index.add(&docs); @@ -28,7 +28,7 @@ fn write_index(dir: &Path) -> PathBuf { } fn write_rankquant_index(dir: &Path, rows: usize) -> PathBuf { - let path = dir.join("index.tvrq"); + let path = dir.join("index.ovrq"); let mut index = RankQuant::new(16, 2); let docs: Vec = (0..16 * 
rows).map(|i| i as f32 - 12.0).collect(); index.add(&docs); @@ -47,7 +47,7 @@ enum FixtureKind { fn write_index_kind(dir: &Path, kind: FixtureKind) -> PathBuf { match kind { FixtureKind::Rank => { - let path = dir.join("index.tvr"); + let path = dir.join("index.ovr"); let mut index = Rank::new(8); index.add(&[ 1.0, 3.0, 2.0, 4.0, 8.0, 7.0, 6.0, 5.0, 8.0, 6.0, 7.0, 5.0, 1.0, 2.0, 3.0, 4.0, @@ -57,7 +57,7 @@ fn write_index_kind(dir: &Path, kind: FixtureKind) -> PathBuf { } FixtureKind::RankQuant => write_index(dir), FixtureKind::Bitmap => { - let path = dir.join("index.tvbm"); + let path = dir.join("index.ovbm"); let mut index = Bitmap::new(64, 16); let docs: Vec = (0..128).map(|i| ((i * 17) % 31) as f32).collect(); index.add(&docs); @@ -65,7 +65,7 @@ fn write_index_kind(dir: &Path, kind: FixtureKind) -> PathBuf { path } FixtureKind::SignBitmap => { - let path = dir.join("index.tvsb"); + let path = dir.join("index.ovsb"); let mut index = SignBitmap::new(64); let docs: Vec = (0usize..128) .map(|i| if i.is_multiple_of(3) { 1.0 } else { -1.0 }) @@ -1939,7 +1939,7 @@ fn path_policy_rejects_escapes_and_absolute_paths_by_default() { ) .unwrap(); - manifest.artifact.path = "../index.tvrq".to_string(); + manifest.artifact.path = "../index.ovrq".to_string(); let report = verify_manifest_with_base(manifest.clone(), &base, VerifyOptions::default()); assert!(report .errors @@ -1986,7 +1986,7 @@ fn symlink_escape_reports_observed_canonical_path() { fs::create_dir(&base).unwrap(); fs::create_dir(&outside).unwrap(); let index = write_index(&outside); - symlink(&index, base.join("link.tvrq")).unwrap(); + symlink(&index, base.join("link.ovrq")).unwrap(); let manifest_path = base.join("manifest.json"); let mut manifest = create_manifest_for_index_with_options( &index, @@ -1999,7 +1999,7 @@ fn symlink_escape_reports_observed_canonical_path() { }, ) .unwrap(); - manifest.artifact.path = "link.tvrq".to_string(); + manifest.artifact.path = "link.ovrq".to_string(); let report = 
verify_manifest_with_base(manifest.clone(), &base, VerifyOptions::default()); assert!(report @@ -2086,7 +2086,7 @@ fn verify_for_load_uses_explicit_index_override() { &manifest_path, ) .unwrap(); - manifest.artifact.path = "missing.tvrq".to_string(); + manifest.artifact.path = "missing.ovrq".to_string(); fs::write( &manifest_path, serde_json::to_string_pretty(&manifest).unwrap(), @@ -2096,7 +2096,7 @@ fn verify_for_load_uses_explicit_index_override() { let plan = verify_for_load( &manifest_path, VerifyOptions { - index_override: Some(PathBuf::from("index.tvrq")), + index_override: Some(PathBuf::from("index.ovrq")), ..VerifyOptions::default() }, ) @@ -2108,7 +2108,7 @@ fn verify_for_load_uses_explicit_index_override() { ); assert_eq!( plan.report().artifact.observed_path.as_deref(), - Some("index.tvrq") + Some("index.ovrq") ); } @@ -2215,7 +2215,7 @@ fn verify_for_load_fails_closed_with_report_for_default_path_policy() { }, ) .unwrap(); - manifest.artifact.path = "../index.tvrq".to_string(); + manifest.artifact.path = "../index.ovrq".to_string(); fs::write( &manifest_path, serde_json::to_string_pretty(&manifest).unwrap(), @@ -2766,7 +2766,7 @@ fn attestation_shape_requires_matching_subject_sha256() { manifest.attestations.push(json!({ "predicateType": "https://slsa.dev/provenance/v1", "predicate": {"builder": {"id": "builder"}}, - "subject": [{"name": "index.tvrq", "digest": {"sha256": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}}] + "subject": [{"name": "index.ovrq", "digest": {"sha256": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}}] })); let report = verify_manifest_with_base(manifest.clone(), temp.path(), VerifyOptions::default()); @@ -3044,7 +3044,7 @@ fn verify_index_manifest_uses_explicit_index_override() { &manifest_path, ) .unwrap(); - manifest.artifact.path = "missing.tvrq".to_string(); + manifest.artifact.path = "missing.ovrq".to_string(); fs::write( &manifest_path, 
serde_json::to_string_pretty(&manifest).unwrap(), @@ -3052,7 +3052,7 @@ fn verify_index_manifest_uses_explicit_index_override() { .unwrap(); let report = verify_index_manifest( - PathBuf::from("index.tvrq"), + PathBuf::from("index.ovrq"), &manifest_path, VerifyOptions::default(), ) @@ -3704,7 +3704,7 @@ fn sqlite_cache_key_is_scoped_to_manifest_location() { ) .unwrap(); - let index_b = case_b.join("index.tvrq"); + let index_b = case_b.join("index.ovrq"); let manifest_b = case_b.join("manifest.json"); fs::copy(&index_a, &index_b).unwrap(); fs::copy(&manifest_a, &manifest_b).unwrap(); diff --git a/ordvec-python/src/lib.rs b/ordvec-python/src/lib.rs index 6e75661..72825fa 100644 --- a/ordvec-python/src/lib.rs +++ b/ordvec-python/src/lib.rs @@ -512,7 +512,7 @@ impl Rank { Ok((scores, indices)) } - /// Serialise the rank index to a `.tvr` file. + /// Serialise the rank index to a `.ovr` file. /// /// `path` is forwarded to the filesystem unmodified — no `..` / traversal /// sanitisation — so treat it as trusted input (see the module docstring). @@ -522,7 +522,8 @@ impl Rank { .map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string())) } - /// Load a `Rank` index from a `.tvr` file previously written by [`Rank::write`]. + /// Load a `Rank` index from a `.ovr` file previously written by [`Rank::write`] + /// (legacy `.tvr` files are also accepted). /// /// `path` is forwarded to the filesystem unmodified — no `..` / traversal /// sanitisation — so treat it as trusted input (see the module docstring). @@ -695,7 +696,7 @@ impl RankQuant { Ok((scores, indices)) } - /// Serialise the quantised index to a `.tvrq` file. + /// Serialise the quantised index to a `.ovrq` file. /// /// `path` is forwarded to the filesystem unmodified — no `..` / traversal /// sanitisation — so treat it as trusted input (see the module docstring). 
@@ -705,7 +706,8 @@ impl RankQuant { .map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string())) } - /// Load a `RankQuant` index from a `.tvrq` file written by [`RankQuant::write`]. + /// Load a `RankQuant` index from a `.ovrq` file written by [`RankQuant::write`] + /// (legacy `.tvrq` files are also accepted). /// /// `path` is forwarded to the filesystem unmodified — no `..` / traversal /// sanitisation — so treat it as trusted input (see the module docstring). @@ -1154,7 +1156,7 @@ impl Bitmap { self.inner.is_empty() } - /// Serialise the bitmap index to a `.tvbm` file. + /// Serialise the bitmap index to a `.ovbm` file. /// /// `path` is forwarded to the filesystem unmodified — no `..` / traversal /// sanitisation — so treat it as trusted input (see the module docstring). @@ -1164,7 +1166,8 @@ impl Bitmap { .map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string())) } - /// Load a `Bitmap` index from a `.tvbm` file written by [`Bitmap::write`]. + /// Load a `Bitmap` index from a `.ovbm` file written by [`Bitmap::write`] + /// (legacy `.tvbm` files are also accepted). /// /// `path` is forwarded to the filesystem unmodified — no `..` / traversal /// sanitisation — so treat it as trusted input (see the module docstring). @@ -1398,8 +1401,8 @@ impl SignBitmap { self.inner.is_empty() } - /// Persist the sign-bitmap payload to a `.tvsb` file. Format: 13-byte header - /// (`TVSB` magic + version + dim + n_vectors) + LE u64 bitmaps. + /// Persist the sign-bitmap payload to a `.ovsb` file. Format: 13-byte header + /// (`OVSB` magic + version + dim + n_vectors) + LE u64 bitmaps. /// /// `path` is forwarded to the filesystem unmodified — no `..` / traversal /// sanitisation — so treat it as trusted input (see the module docstring). @@ -1409,9 +1412,10 @@ impl SignBitmap { .map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string())) } - /// Load a `SignBitmap` from a `.tvsb` file previously written by - /// [`SignBitmap::write`]. 
Raises `IOError` if the file is missing, malformed, - /// or its payload length disagrees with the header-declared shape. + /// Load a `SignBitmap` from a `.ovsb` file previously written by + /// [`SignBitmap::write`] (legacy `.tvsb` files are also accepted). Raises + /// `IOError` if the file is missing, malformed, or its payload length + /// disagrees with the header-declared shape. /// /// `path` is forwarded to the filesystem unmodified — no `..` / traversal /// sanitisation — so treat it as trusted input (see the module docstring). @@ -1683,7 +1687,7 @@ fn search_asymmetric_byte_lut<'py>( /// /// This rank-transforms and buckets the raw `corpus`/`queries` matrices on the /// fly, so it supports non-byte-aligned widths such as `bits=3` without changing -/// `RankQuant` storage or `.tvrq` persistence. Returns `(scores, indices)` with +/// `RankQuant` storage or `.ovrq` persistence. Returns `(scores, indices)` with /// the same shape contract as `RankQuant.search`. #[pyfunction] fn rankquant_eval_search<'py>( diff --git a/ordvec-python/tests/test_bitmap.py b/ordvec-python/tests/test_bitmap.py index f632b7f..e0be780 100644 --- a/ordvec-python/tests/test_bitmap.py +++ b/ordvec-python/tests/test_bitmap.py @@ -139,7 +139,7 @@ def test_save_load_roundtrip(tmp_path): idx = Bitmap(dim=128, n_top=32) idx.add(vectors) - path = str(tmp_path / "idx.tvbm") + path = str(tmp_path / "idx.ovbm") idx.write(path) loaded = Bitmap.load(path) @@ -157,7 +157,7 @@ def test_save_load_roundtrip(tmp_path): def test_load_rejects_nonexistent_file(): with pytest.raises(IOError): - Bitmap.load("/nonexistent/path/does-not-exist.tvbm") + Bitmap.load("/nonexistent/path/does-not-exist.ovbm") def test_invalid_n_top_rejected(): @@ -236,7 +236,7 @@ def test_add_float64_is_coerced(): def test_dim_above_u16_max_rejected(): # dim = 65536 is a multiple of 64 but exceeds u16::MAX; the binding must # reject it with a clean ValueError (mirrors the core Bitmap::new guard and - # the .tvbm loader cap) rather 
than defer to a Rust panic on add/search. + # the .ovbm loader cap) rather than defer to a Rust panic on add/search. with pytest.raises(ValueError, match="u16 rank invariant"): Bitmap(dim=65_536, n_top=256) diff --git a/ordvec-python/tests/test_rank.py b/ordvec-python/tests/test_rank.py index 0be221e..5531a52 100644 --- a/ordvec-python/tests/test_rank.py +++ b/ordvec-python/tests/test_rank.py @@ -117,7 +117,7 @@ def test_save_load_roundtrip(tmp_path): idx = Rank(dim=128) idx.add(vectors) - path = str(tmp_path / "idx.tvr") + path = str(tmp_path / "idx.ovr") idx.write(path) loaded = Rank.load(path) @@ -133,7 +133,7 @@ def test_save_load_roundtrip(tmp_path): def test_load_rejects_nonexistent_file(): with pytest.raises(IOError): - Rank.load("/nonexistent/path/does-not-exist.tvr") + Rank.load("/nonexistent/path/does-not-exist.ovr") def test_empty_index_search_does_not_panic(): diff --git a/ordvec-python/tests/test_rank_quant.py b/ordvec-python/tests/test_rank_quant.py index 79ef676..cdd0c5f 100644 --- a/ordvec-python/tests/test_rank_quant.py +++ b/ordvec-python/tests/test_rank_quant.py @@ -224,7 +224,7 @@ def test_save_load_roundtrip(tmp_path, bits): idx = RankQuant(dim=128, bits=bits) idx.add(vectors) - path = str(tmp_path / f"idx_b{bits}.tvrq") + path = str(tmp_path / f"idx_b{bits}.ovrq") idx.write(path) loaded = RankQuant.load(path) @@ -242,7 +242,7 @@ def test_save_load_roundtrip(tmp_path, bits): def test_load_rejects_nonexistent_file(): with pytest.raises(IOError): - RankQuant.load("/nonexistent/path/does-not-exist.tvrq") + RankQuant.load("/nonexistent/path/does-not-exist.ovrq") @pytest.mark.parametrize("bits", [1, 2, 4]) diff --git a/ordvec-python/tests/test_redteam_fuzz.py b/ordvec-python/tests/test_redteam_fuzz.py index 21fba01..0d2b168 100644 --- a/ordvec-python/tests/test_redteam_fuzz.py +++ b/ordvec-python/tests/test_redteam_fuzz.py @@ -867,8 +867,8 @@ def _write_real_rank(path: str) -> bytes: def test_rank_load_header_only_truncated_io_error(tmp_path): - 
data = _write_real_rank(str(tmp_path / "real.tvr")) - p = str(tmp_path / "trunc.tvr") + data = _write_real_rank(str(tmp_path / "real.ovr")) + p = str(tmp_path / "trunc.ovr") with open(p, "wb") as f: f.write(data[:13]) # header, zero payload with pytest.raises(IOError): @@ -876,8 +876,8 @@ def test_rank_load_header_only_truncated_io_error(tmp_path): def test_rank_load_mid_payload_truncated_io_error(tmp_path): - data = _write_real_rank(str(tmp_path / "real.tvr")) - p = str(tmp_path / "half.tvr") + data = _write_real_rank(str(tmp_path / "real.ovr")) + p = str(tmp_path / "half.ovr") with open(p, "wb") as f: f.write(data[: len(data) // 2]) with pytest.raises(IOError): @@ -887,8 +887,8 @@ def test_rank_load_mid_payload_truncated_io_error(tmp_path): def test_rank_load_trailing_bytes_io_error(tmp_path): # A structurally-valid file with extra trailing bytes is rejected (v1 has no # footer) — guards against record-smuggling past a smaller declared payload. - data = _write_real_rank(str(tmp_path / "real.tvr")) - p = str(tmp_path / "ext.tvr") + data = _write_real_rank(str(tmp_path / "real.ovr")) + p = str(tmp_path / "ext.ovr") with open(p, "wb") as f: f.write(data + b"\x00" * 64) with pytest.raises(IOError): @@ -899,9 +899,9 @@ def test_rank_load_forged_huge_n_vectors_io_error_no_oom(tmp_path): # Forge n_vectors (bytes 9..13) to ~268M into a tiny file. The DoS-alloc # hypothesis: a naive loader allocates n_vectors*dim*2 up front. The loader # must reject (MAX_VECTORS / payload-mismatch) BEFORE allocating. - data = bytearray(_write_real_rank(str(tmp_path / "real.tvr"))) + data = bytearray(_write_real_rank(str(tmp_path / "real.ovr"))) data[9:13] = struct.pack(") -> std::io::Result<()> { crate::rank_io::write_bitmap(path, self.dim, self.n_top, self.n_vectors, &self.bitmaps) } - /// Load from a `.tvbm` file produced by [`Self::write`]. + /// Load from a `.ovbm` file produced by [`Self::write`]. 
+ /// + /// Legacy `.tvbm` files (magic `TVBM`) written by older versions of this + /// crate are also accepted; newly written files use the `OVBM` magic. /// /// Returns `io::Error::InvalidData` on any constructor-invariant /// violation (`load_bitmap` already validates dim/n_top/n_vectors; @@ -535,14 +538,14 @@ impl Bitmap { let expected = n_vectors.checked_mul(qpv).ok_or_else(|| { std::io::Error::new( std::io::ErrorKind::InvalidData, - "TVBM n_vectors * dim/64 overflows usize", + "OVBM n_vectors * dim/64 overflows usize", ) })?; if bitmaps.len() != expected { return Err(std::io::Error::new( std::io::ErrorKind::InvalidData, format!( - "TVBM payload length {} does not match expected {expected} u64 lanes", + "OVBM payload length {} does not match expected {expected} u64 lanes", bitmaps.len(), ), )); diff --git a/src/quant.rs b/src/quant.rs index 1d022ae..ee9f0db 100644 --- a/src/quant.rs +++ b/src/quant.rs @@ -861,10 +861,10 @@ impl RankQuant { last } - /// Persist to a `.tvrq` file. Format: 14-byte header + packed bytes. + /// Persist to a `.ovrq` file. Format: 14-byte header + packed bytes. /// /// # `b=8` - /// The `.tvrq` on-disk format and its loader currently support only + /// The `.ovrq` on-disk format and its loader currently support only /// `bits ∈ {1, 2, 4}`. `b=8` is an in-memory evidence/refinement surface /// in this phase; persisting it is a follow-up. 
To avoid writing a file /// that [`Self::load`] would then reject (a silent broken round-trip), @@ -874,7 +874,7 @@ impl RankQuant { if self.bits == 8 { return Err(std::io::Error::new( std::io::ErrorKind::Unsupported, - "RankQuant b=8 persistence is not supported yet (the .tvrq loader \ + "RankQuant b=8 persistence is not supported yet (the .ovrq loader \ accepts bits ∈ {1, 2, 4}); b=8 is an in-memory evidence surface \ in this phase", )); @@ -882,7 +882,10 @@ impl RankQuant { crate::rank_io::write_rankquant(path, self.bits, self.dim, self.n_vectors, &self.packed) } - /// Load from a `.tvrq` file produced by [`Self::write`]. + /// Load from a `.ovrq` file produced by [`Self::write`]. + /// + /// Legacy `.tvrq` files (magic `TVRQ`) written by older versions of this + /// crate are also accepted; newly written files use the `OVRQ` magic. /// /// Re-runs the same constructor invariants `RankQuant::new` /// enforces (`bits ∈ {1, 2, 4}`, `dim % (1 << bits) == 0`, @@ -897,7 +900,7 @@ impl RankQuant { return Err(std::io::Error::new( std::io::ErrorKind::InvalidData, format!( - "TVRQ dim {dim} is not a multiple of 2^bits = {n_buckets}; \ + "OVRQ dim {dim} is not a multiple of 2^bits = {n_buckets}; \ constant-composition invariant violated" ), )); @@ -906,7 +909,7 @@ impl RankQuant { if dim % codes_per_byte != 0 { return Err(std::io::Error::new( std::io::ErrorKind::InvalidData, - format!("TVRQ dim {dim} is not a multiple of codes_per_byte = {codes_per_byte}",), + format!("OVRQ dim {dim} is not a multiple of codes_per_byte = {codes_per_byte}",), )); } // `checked_mul` (not `saturating`): on a 32-bit target the byte count @@ -917,7 +920,7 @@ impl RankQuant { let nv_dim = n_vectors.checked_mul(dim).ok_or_else(|| { std::io::Error::new( std::io::ErrorKind::InvalidData, - "TVRQ n_vectors * dim overflows usize", + "OVRQ n_vectors * dim overflows usize", ) })?; let expected_bytes = nv_dim @@ -926,14 +929,14 @@ impl RankQuant { .ok_or_else(|| { std::io::Error::new( 
std::io::ErrorKind::InvalidData, - "TVRQ (n_vectors * dim) * bits overflows usize", + "OVRQ (n_vectors * dim) * bits overflows usize", ) })?; if packed.len() != expected_bytes { return Err(std::io::Error::new( std::io::ErrorKind::InvalidData, format!( - "TVRQ payload length {} does not match expected {expected_bytes}", + "OVRQ payload length {} does not match expected {expected_bytes}", packed.len(), ), )); @@ -1453,7 +1456,7 @@ fn validate_finite(values: &[f32], name: &'static str) -> Result<(), OrdvecError /// Standalone symmetric RankQuant-style eval search for arbitrary bit widths. /// -/// This does **not** use [`RankQuant`] storage and does not change the `.tvrq` +/// This does **not** use [`RankQuant`] storage and does not change the `.ovrq` /// packing contract. It rank-transforms `corpus` and `queries`, buckets each /// rank into `1 << bits` equal-width bins, mean-centres bucket ids, normalises /// by the **empirical** norm for that `(dim, bits)` (the exact L2 norm of the diff --git a/src/rank.rs b/src/rank.rs index c74bba8..10cd1e2 100644 --- a/src/rank.rs +++ b/src/rank.rs @@ -540,12 +540,15 @@ impl Rank { last } - /// Persist to a `.tvr` file. Format: 13-byte header + u16 ranks LE. + /// Persist to a `.ovr` file. Format: 13-byte header + u16 ranks LE. pub fn write(&self, path: impl AsRef) -> std::io::Result<()> { crate::rank_io::write_rank(path, self.dim, self.n_vectors, &self.ranks) } - /// Load from a `.tvr` file produced by [`Self::write`]. + /// Load from a `.ovr` file produced by [`Self::write`]. + /// + /// Legacy `.tvr` files (magic `TVR1`) written by older versions of this + /// crate are also accepted; newly written files use the `OVR1` magic. 
/// /// Returns `io::Error` (kind `InvalidData`) on any structural /// inconsistency between the header and the payload (`load_rank` @@ -560,13 +563,13 @@ impl Rank { let expected = n_vectors.checked_mul(dim).ok_or_else(|| { std::io::Error::new( std::io::ErrorKind::InvalidData, - "TVR1 n_vectors * dim overflows usize", + "OVR1 n_vectors * dim overflows usize", ) })?; if ranks.len() != expected { return Err(std::io::Error::new( std::io::ErrorKind::InvalidData, - "TVR1 payload length does not match dim * n_vectors", + "OVR1 payload length does not match dim * n_vectors", )); } Ok(Self { diff --git a/src/rank_io.rs b/src/rank_io.rs index 8e3be94..3a1e341 100644 --- a/src/rank_io.rs +++ b/src/rank_io.rs @@ -1,10 +1,13 @@ //! Read/write ordinal/sign index files. //! -//! Four formats live here, each self-describing via a 4-byte magic: -//! * `.tvr` — [`Rank`](crate::Rank) — magic `TVR1` -//! * `.tvrq` — [`RankQuant`](crate::RankQuant) — magic `TVRQ` -//! * `.tvbm` — [`Bitmap`](crate::Bitmap) — magic `TVBM` -//! * `.tvsb` — [`SignBitmap`](crate::SignBitmap) — magic `TVSB` +//! Four formats live here, each self-describing via a 4-byte magic. Files +//! written by this crate use the **`.ov*` / `OV*`** magics (the ordvec format); +//! the legacy turbovec-era **`.tv*` / `TV*`** magics are still accepted on load +//! for backward compatibility, but are never written: +//! * `.ovr` (legacy `.tvr`) — [`Rank`](crate::Rank) — magic `OVR1` (also reads `TVR1`) +//! * `.ovrq` (legacy `.tvrq`) — [`RankQuant`](crate::RankQuant) — magic `OVRQ` (also reads `TVRQ`) +//! * `.ovbm` (legacy `.tvbm`) — [`Bitmap`](crate::Bitmap) — magic `OVBM` (also reads `TVBM`) +//! * `.ovsb` (legacy `.tvsb`) — [`SignBitmap`](crate::SignBitmap) — magic `OVSB` (also reads `TVSB`) //! //! All formats are little-endian. Headers are small fixed-size structs //! 
followed by a single contiguous payload (the rank / packed / bitmap @@ -59,6 +62,14 @@ use std::fs::File; use std::io::{self, BufReader, BufWriter, Read, Seek, Write}; use std::path::Path; +// Current ordvec magics — written by this crate going forward. +const OVR_MAGIC: &[u8; 4] = b"OVR1"; +const OVRQ_MAGIC: &[u8; 4] = b"OVRQ"; +const OVBM_MAGIC: &[u8; 4] = b"OVBM"; +const OVSB_MAGIC: &[u8; 4] = b"OVSB"; +// Legacy turbovec-era magics — still accepted on load for backward +// compatibility, never written. Files produced before the ordvec rebrand carry +// these; loaders accept either the `OV*` or the matching `TV*` magic. const TVR_MAGIC: &[u8; 4] = b"TVR1"; const TVRQ_MAGIC: &[u8; 4] = b"TVRQ"; const TVBM_MAGIC: &[u8; 4] = b"TVBM"; @@ -345,10 +356,10 @@ pub fn probe_index_metadata(path: impl AsRef) -> io::Result let mut f = BufReader::new(file); let magic = read_magic(&mut f, "ordvec index")?; match &magic { - TVR_MAGIC => probe_rank_metadata(&mut f, file_size_bytes), - TVRQ_MAGIC => probe_rankquant_metadata(&mut f, file_size_bytes), - TVBM_MAGIC => probe_bitmap_metadata(&mut f, file_size_bytes), - TVSB_MAGIC => probe_sign_bitmap_metadata(&mut f, file_size_bytes), + OVR_MAGIC | TVR_MAGIC => probe_rank_metadata(&mut f, file_size_bytes), + OVRQ_MAGIC | TVRQ_MAGIC => probe_rankquant_metadata(&mut f, file_size_bytes), + OVBM_MAGIC | TVBM_MAGIC => probe_bitmap_metadata(&mut f, file_size_bytes), + OVSB_MAGIC | TVSB_MAGIC => probe_sign_bitmap_metadata(&mut f, file_size_bytes), _ => Err(invalid("unknown ordvec index magic")), } } @@ -494,7 +505,7 @@ pub(crate) fn write_rank( check_payload_bytes(payload_bytes)?; assert_eq!(ranks.len(), payload_bytes / 2); let mut f = BufWriter::new(File::create(path)?); - f.write_all(TVR_MAGIC)?; + f.write_all(OVR_MAGIC)?; f.write_all(&[VERSION])?; f.write_all(&(dim as u32).to_le_bytes())?; f.write_all(&(n_vectors as u32).to_le_bytes())?; @@ -515,8 +526,8 @@ pub(crate) fn load_rank(path: impl AsRef) -> io::Result<(usize, usize, Vec let 
file_len = file.metadata()?.len(); let mut f = BufReader::new(file); let magic = read_magic(&mut f, "TVR1")?; - if &magic != TVR_MAGIC { - return Err(invalid("not a TVR1 file: wrong magic")); + if &magic != OVR_MAGIC && &magic != TVR_MAGIC { + return Err(invalid("not an OVR1/TVR1 (Rank) file: wrong magic")); } read_version(&mut f, "TVR1")?; let dim = read_u32_le(&mut f, "TVR1", "dim")? as usize; @@ -587,7 +598,7 @@ pub(crate) fn write_rankquant( check_payload_bytes(payload_bytes)?; assert_eq!(packed.len(), payload_bytes); let mut f = BufWriter::new(File::create(path)?); - f.write_all(TVRQ_MAGIC)?; + f.write_all(OVRQ_MAGIC)?; f.write_all(&[VERSION])?; f.write_all(&[bits])?; f.write_all(&(dim as u32).to_le_bytes())?; @@ -607,8 +618,8 @@ pub(crate) fn load_rankquant(path: impl AsRef) -> io::Result<(u8, usize, u let file_len = file.metadata()?.len(); let mut f = BufReader::new(file); let magic = read_magic(&mut f, "TVRQ")?; - if &magic != TVRQ_MAGIC { - return Err(invalid("not a TVRQ file: wrong magic")); + if &magic != OVRQ_MAGIC && &magic != TVRQ_MAGIC { + return Err(invalid("not an OVRQ/TVRQ (RankQuant) file: wrong magic")); } read_version(&mut f, "TVRQ")?; let bits = read_u8_field(&mut f, "TVRQ", "bits")?; @@ -697,7 +708,7 @@ pub(crate) fn write_bitmap( check_payload_bytes(payload_bytes)?; assert_eq!(bitmaps.len(), payload_bytes / 8); let mut f = BufWriter::new(File::create(path)?); - f.write_all(TVBM_MAGIC)?; + f.write_all(OVBM_MAGIC)?; f.write_all(&[VERSION])?; f.write_all(&(dim as u32).to_le_bytes())?; f.write_all(&(n_top as u32).to_le_bytes())?; @@ -719,8 +730,8 @@ pub(crate) fn load_bitmap(path: impl AsRef) -> io::Result<(usize, usize, u let file_len = file.metadata()?.len(); let mut f = BufReader::new(file); let magic = read_magic(&mut f, "TVBM")?; - if &magic != TVBM_MAGIC { - return Err(invalid("not a TVBM file: wrong magic")); + if &magic != OVBM_MAGIC && &magic != TVBM_MAGIC { + return Err(invalid("not an OVBM/TVBM (Bitmap) file: wrong magic")); } 
read_version(&mut f, "TVBM")?; let dim = read_u32_le(&mut f, "TVBM", "dim")? as usize; @@ -788,7 +799,7 @@ pub(crate) fn write_sign_bitmap( check_payload_bytes(payload_bytes)?; assert_eq!(bitmaps.len(), payload_bytes / 8); let mut f = BufWriter::new(File::create(path)?); - f.write_all(TVSB_MAGIC)?; + f.write_all(OVSB_MAGIC)?; f.write_all(&[VERSION])?; f.write_all(&(dim as u32).to_le_bytes())?; f.write_all(&(n_vectors as u32).to_le_bytes())?; @@ -824,8 +835,8 @@ pub(crate) fn load_sign_bitmap(path: impl AsRef) -> io::Result<(usize, usi let file_len = file.metadata()?.len(); let mut f = BufReader::new(file); let magic = read_magic(&mut f, "TVSB")?; - if &magic != TVSB_MAGIC { - return Err(invalid("not a TVSB file: wrong magic")); + if &magic != OVSB_MAGIC && &magic != TVSB_MAGIC { + return Err(invalid("not an OVSB/TVSB (SignBitmap) file: wrong magic")); } read_version(&mut f, "TVSB")?; let dim = read_u32_le(&mut f, "TVSB", "dim")? as usize; diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs index 4f1ce09..e27d4a3 100644 --- a/src/sign_bitmap.rs +++ b/src/sign_bitmap.rs @@ -134,7 +134,7 @@ impl SignBitmap { /// can be persisted via [`Self::write`] and reloaded via /// [`Self::load`] — without it, `new` could produce indices the /// loader refuses to round-trip (the issue Codex caught after the - /// first `.tvsb` revision used [`crate::rank_io::MAX_DIM`]'s + /// first `.ovsb` revision used [`crate::rank_io::MAX_DIM`]'s /// rank-storage `u16::MAX` cap, which doesn't apply to sign /// bitmaps). pub fn new(dim: usize) -> Self { @@ -454,12 +454,15 @@ impl SignBitmap { last } - /// Persist to a `.tvsb` file. Format: 13-byte header + LE u64 bitmaps. + /// Persist to a `.ovsb` file. Format: 13-byte header + LE u64 bitmaps. pub fn write(&self, path: impl AsRef) -> std::io::Result<()> { crate::rank_io::write_sign_bitmap(path, self.dim, self.n_vectors, &self.bitmaps) } - /// Load from a `.tvsb` file produced by [`Self::write`]. 
+ /// Load from a `.ovsb` file produced by [`Self::write`]. + /// + /// Legacy `.tvsb` files (magic `TVSB`) written by older versions of this + /// crate are also accepted; newly written files use the `OVSB` magic. /// /// Returns `io::Error::InvalidData` on any constructor-invariant /// violation. `load_sign_bitmap` already validates dim and n_vectors; @@ -474,14 +477,14 @@ impl SignBitmap { let expected = n_vectors.checked_mul(qpv).ok_or_else(|| { std::io::Error::new( std::io::ErrorKind::InvalidData, - "TVSB n_vectors * dim/64 overflows usize", + "OVSB n_vectors * dim/64 overflows usize", ) })?; if bitmaps.len() != expected { return Err(std::io::Error::new( std::io::ErrorKind::InvalidData, format!( - "TVSB payload length {} does not match expected {expected} u64 lanes", + "OVSB payload length {} does not match expected {expected} u64 lanes", bitmaps.len(), ), )); diff --git a/tests/persistence_compat.rs b/tests/persistence_compat.rs index 40684e3..dee669d 100644 --- a/tests/persistence_compat.rs +++ b/tests/persistence_compat.rs @@ -100,7 +100,7 @@ fn assert_rejects_version_and_trailing_bytes( #[test] fn rank_v1_fixture_bytes_are_stable() { let expected = [ - b'T', b'V', b'R', b'1', 1, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, + b'O', b'V', b'R', b'1', 1, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, ]; let path = tmp("rank"); @@ -129,7 +129,7 @@ fn rank_v1_fixture_bytes_are_stable() { #[test] fn rankquant_v1_fixture_bytes_are_stable() { let expected = [ - b'T', b'V', b'R', b'Q', 1, 2, 8, 0, 0, 0, 1, 0, 0, 0, 0x05, 0xaf, + b'O', b'V', b'R', b'Q', 1, 2, 8, 0, 0, 0, 1, 0, 0, 0, 0x05, 0xaf, ]; let path = tmp("rankquant"); @@ -159,7 +159,7 @@ fn rankquant_v1_fixture_bytes_are_stable() { #[test] fn bitmap_v1_fixture_bytes_are_stable() { let expected = [ - b'T', b'V', b'B', b'M', 1, 64, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xc0, + b'O', b'V', b'B', b'M', 1, 64, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xc0, ]; let path = 
tmp("bitmap"); @@ -190,7 +190,7 @@ fn bitmap_v1_fixture_bytes_are_stable() { #[test] fn sign_bitmap_v1_fixture_bytes_are_stable() { let expected = [ - b'T', b'V', b'S', b'B', 1, 64, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0x80, + b'O', b'V', b'S', b'B', 1, 64, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0x80, ]; let path = tmp("sign_bitmap"); @@ -220,3 +220,98 @@ fn sign_bitmap_v1_fixture_bytes_are_stable() { SignBitmap::load(path) }); } + +// Back-compat tests: files carrying the legacy TV* magic still load correctly. +// The fixture body (everything after the 4-byte magic) is identical; only the +// magic prefix differs. These prove the "accept both OV* and TV* on load, +// never write TV*" contract is upheld at the public index-type level. + +#[test] +fn rank_v1_legacy_tv_magic_still_loads() { + // Fixture body from `rank_v1_fixture_bytes_are_stable`, magic swapped TV*. + let ov_fixture: &[u8] = &[ + b'O', b'V', b'R', b'1', 1, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, + ]; + let mut legacy = vec![b'T', b'V', b'R', b'1']; + legacy.extend_from_slice(&ov_fixture[4..]); + + let path = tmp("rank_legacy_tv"); + write_bytes(&path, &legacy); + let loaded = Rank::load(&path).unwrap(); + assert_eq!(loaded.dim(), 4); + assert_eq!(loaded.len(), 1); + + // Also confirm the OV* fixture loads to the same shape (round-trip parity). 
+ let ov_path = tmp("rank_ov"); + write_bytes(&ov_path, ov_fixture); + let ov_loaded = Rank::load(&ov_path).unwrap(); + assert_eq!(loaded.dim(), ov_loaded.dim()); + assert_eq!(loaded.len(), ov_loaded.len()); +} + +#[test] +fn rankquant_v1_legacy_tv_magic_still_loads() { + let ov_fixture: &[u8] = &[ + b'O', b'V', b'R', b'Q', 1, 2, 8, 0, 0, 0, 1, 0, 0, 0, 0x05, 0xaf, + ]; + let mut legacy = vec![b'T', b'V', b'R', b'Q']; + legacy.extend_from_slice(&ov_fixture[4..]); + + let path = tmp("rankquant_legacy_tv"); + write_bytes(&path, &legacy); + let loaded = RankQuant::load(&path).unwrap(); + assert_eq!(loaded.dim(), 8); + assert_eq!(loaded.len(), 1); + assert_eq!(loaded.bits(), 2); + + let ov_path = tmp("rankquant_ov"); + write_bytes(&ov_path, ov_fixture); + let ov_loaded = RankQuant::load(&ov_path).unwrap(); + assert_eq!(loaded.dim(), ov_loaded.dim()); + assert_eq!(loaded.len(), ov_loaded.len()); + assert_eq!(loaded.bits(), ov_loaded.bits()); +} + +#[test] +fn bitmap_v1_legacy_tv_magic_still_loads() { + let ov_fixture: &[u8] = &[ + b'O', b'V', b'B', b'M', 1, 64, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xc0, + ]; + let mut legacy = vec![b'T', b'V', b'B', b'M']; + legacy.extend_from_slice(&ov_fixture[4..]); + + let path = tmp("bitmap_legacy_tv"); + write_bytes(&path, &legacy); + let loaded = Bitmap::load(&path).unwrap(); + assert_eq!(loaded.dim(), 64); + assert_eq!(loaded.len(), 1); + assert_eq!(loaded.n_top(), 2); + + let ov_path = tmp("bitmap_ov"); + write_bytes(&ov_path, ov_fixture); + let ov_loaded = Bitmap::load(&ov_path).unwrap(); + assert_eq!(loaded.dim(), ov_loaded.dim()); + assert_eq!(loaded.len(), ov_loaded.len()); + assert_eq!(loaded.n_top(), ov_loaded.n_top()); +} + +#[test] +fn sign_bitmap_v1_legacy_tv_magic_still_loads() { + let ov_fixture: &[u8] = &[ + b'O', b'V', b'S', b'B', 1, 64, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0x80, + ]; + let mut legacy = vec![b'T', b'V', b'S', b'B']; + legacy.extend_from_slice(&ov_fixture[4..]); + + let path = 
tmp("sign_bitmap_legacy_tv"); + write_bytes(&path, &legacy); + let loaded = SignBitmap::load(&path).unwrap(); + assert_eq!(loaded.dim(), 64); + assert_eq!(loaded.len(), 1); + + let ov_path = tmp("sign_bitmap_ov"); + write_bytes(&ov_path, ov_fixture); + let ov_loaded = SignBitmap::load(&ov_path).unwrap(); + assert_eq!(loaded.dim(), ov_loaded.dim()); + assert_eq!(loaded.len(), ov_loaded.len()); +} From ccf963d5d1840bfbfad1355b4e87e5dacd86922d Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 21:36:28 -0500 Subject: [PATCH 2/4] style: rustfmt ordvec-ffi (match layout after error-string shortening) The shorter UNSUPPORTED_FORMAT message in info_for_metadata let rustfmt collapse `info.kind = match {...}` onto one line. Formatting only, no logic change. Signed-off-by: Nelson Spence --- ordvec-ffi/src/lib.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/ordvec-ffi/src/lib.rs b/ordvec-ffi/src/lib.rs index 8a38580..f4bf463 100644 --- a/ordvec-ffi/src/lib.rs +++ b/ordvec-ffi/src/lib.rs @@ -364,15 +364,16 @@ fn info_for_handle(handle: &IndexHandle) -> ordvec_index_info_t { fn info_for_metadata(meta: &IndexMetadata) -> Result { let mut info = default_info(); - info.kind = - match meta.kind { - IndexKind::RankQuant => ORDVEC_INDEX_KIND_RANK_QUANT, - IndexKind::Bitmap => ORDVEC_INDEX_KIND_BITMAP, - IndexKind::Rank | IndexKind::SignBitmap => return Err(FfiError::new( + info.kind = match meta.kind { + IndexKind::RankQuant => ORDVEC_INDEX_KIND_RANK_QUANT, + IndexKind::Bitmap => ORDVEC_INDEX_KIND_BITMAP, + IndexKind::Rank | IndexKind::SignBitmap => { + return Err(FfiError::new( ORDVEC_STATUS_UNSUPPORTED_FORMAT, "ABI v1 supports metadata probes only for RankQuant and Bitmap indexes", - )), - }; + )) + } + }; info.format_version = u32::from(meta.format_version); info.dim = meta.dim as u64; info.vector_count = meta.vector_count as u64; From 6402aa004d75255b6917f691f5dc503476dd4dd1 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: 
Mon, 15 Jun 2026 11:11:11 -0500 Subject: [PATCH 3/4] fix(rank_io): canonicalize loader error labels to OV* + regen ffi header (qodo) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the loaders/probes were widened to accept both `OV*` and legacy `TV*` magics, the field-validation calls and baked-in error messages still hard-coded `TVR1`/`TVRQ`/`TVBM`/`TVSB` labels — so a malformed `OV*` file emitted errors referencing `TV*`, confusing readers and support tooling keyed off the text (qodo, Observability). Canonicalize every error label/prefix to the new `OV*` names (the format is byte-identical; `OV*` is now primary). The `b"TV*"` magic constants and the legacy-file test fixtures are unchanged — back-compat acceptance is intact — and the magic-mismatch messages keep their explicit `OV*/TV*` wording. Updated the loader_validation assertions to match. Regenerate `ordvec-ffi/include/ordvec.h` with cbindgen 0.29.3 (the committed header had drifted from the loader doc-comment wording). Full suite green. Signed-off-by: Nelson Spence --- ordvec-ffi/include/ordvec.h | 4 +- src/rank_io.rs | 146 +++++++++++++++---------------- tests/index/loader_validation.rs | 32 +++---- 3 files changed, 91 insertions(+), 91 deletions(-) diff --git a/ordvec-ffi/include/ordvec.h b/ordvec-ffi/include/ordvec.h index 68a0ac5..493567f 100644 --- a/ordvec-ffi/include/ordvec.h +++ b/ordvec-ffi/include/ordvec.h @@ -192,8 +192,8 @@ ordvec_status_t ordvec_index_load(const char *path, uint64_t flags, ordvec_index /** * Probe on-disk metadata for a `.ovrq` RankQuant or `.ovbm` Bitmap index - * (legacy `.tvrq` / `.tvbm` also accepted) without loading payload rows into an - * index handle. + * (legacy `.tv*` also accepted) without loading payload rows into an index + * handle. * * This validates the fixed header, declared dimensions, payload byte count, * and exact file length. 
Full row-invariant validation remains the job of diff --git a/src/rank_io.rs b/src/rank_io.rs index 3a1e341..3a4bde3 100644 --- a/src/rank_io.rs +++ b/src/rank_io.rs @@ -232,11 +232,11 @@ fn check_dim(dim: usize) -> io::Result<()> { fn check_sign_bitmap_dim(dim: usize) -> io::Result<()> { if !(64..=MAX_SIGN_BITMAP_DIM).contains(&dim) { return Err(invalid(format!( - "TVSB dim {dim} out of range [64, {MAX_SIGN_BITMAP_DIM}]" + "OVSB dim {dim} out of range [64, {MAX_SIGN_BITMAP_DIM}]" ))); } if !dim.is_multiple_of(64) { - return Err(invalid(format!("TVSB dim {dim} is not a multiple of 64"))); + return Err(invalid(format!("OVSB dim {dim} is not a multiple of 64"))); } Ok(()) } @@ -318,20 +318,20 @@ fn rank_payload_bytes(dim: usize, vector_count: usize) -> io::Result { vector_count .checked_mul(dim) .and_then(|x| x.checked_mul(2)) - .ok_or_else(|| invalid("TVR1 payload size overflows usize")) + .ok_or_else(|| invalid("OVR1 payload size overflows usize")) } fn rankquant_bytes_per_vec(dim: usize, bits: u8) -> io::Result { dim.checked_mul(bits as usize) .map(|x| x / 8) - .ok_or_else(|| invalid("TVRQ bytes_per_vec overflows usize")) + .ok_or_else(|| invalid("OVRQ bytes_per_vec overflows usize")) } fn rankquant_payload_bytes(dim: usize, vector_count: usize, bits: u8) -> io::Result { let bytes_per_vec = rankquant_bytes_per_vec(dim, bits)?; vector_count .checked_mul(bytes_per_vec) - .ok_or_else(|| invalid("TVRQ payload size overflows usize")) + .ok_or_else(|| invalid("OVRQ payload size overflows usize")) } fn bitmap_payload_bytes(dim: usize, vector_count: usize, label: &str) -> io::Result { @@ -368,15 +368,15 @@ fn probe_rank_metadata( reader: &mut R, file_size_bytes: u64, ) -> io::Result { - let format_version = read_version(reader, "TVR1")?; - let dim = read_u32_le(reader, "TVR1", "dim")? as usize; + let format_version = read_version(reader, "OVR1")?; + let dim = read_u32_le(reader, "OVR1", "dim")? 
as usize; check_dim(dim)?; - let vector_count = read_u32_le(reader, "TVR1", "n_vectors")? as usize; + let vector_count = read_u32_le(reader, "OVR1", "n_vectors")? as usize; check_n_vectors(vector_count)?; let bytes_per_vec = rank_payload_bytes(dim, 1)?; let payload_bytes = rank_payload_bytes(dim, vector_count)?; check_payload_bytes(payload_bytes)?; - check_payload_matches_file(reader, "TVR1", file_size_bytes, payload_bytes)?; + check_payload_matches_file(reader, "OVR1", file_size_bytes, payload_bytes)?; Ok(IndexMetadata { kind: IndexKind::Rank, format_version, @@ -392,33 +392,33 @@ fn probe_rankquant_metadata( reader: &mut R, file_size_bytes: u64, ) -> io::Result { - let format_version = read_version(reader, "TVRQ")?; - let bits = read_u8_field(reader, "TVRQ", "bits")?; + let format_version = read_version(reader, "OVRQ")?; + let bits = read_u8_field(reader, "OVRQ", "bits")?; if !matches!(bits, 1 | 2 | 4) { return Err(invalid(format!( "unsupported TVRQ bits: {bits} (expected 1, 2, or 4)" ))); } - let dim = read_u32_le(reader, "TVRQ", "dim")? as usize; + let dim = read_u32_le(reader, "OVRQ", "dim")? as usize; check_dim(dim)?; let n_buckets = 1usize << bits; if !dim.is_multiple_of(n_buckets) { return Err(invalid(format!( - "TVRQ dim {dim} is not a multiple of 2^bits = {n_buckets}; \ + "OVRQ dim {dim} is not a multiple of 2^bits = {n_buckets}; \ constant-composition invariant violated" ))); } let codes_per_byte = (8 / bits) as usize; if !dim.is_multiple_of(codes_per_byte) { return Err(invalid(format!( - "TVRQ dim {dim} is not a multiple of codes_per_byte = {codes_per_byte}" + "OVRQ dim {dim} is not a multiple of codes_per_byte = {codes_per_byte}" ))); } - let vector_count = read_u32_le(reader, "TVRQ", "n_vectors")? as usize; + let vector_count = read_u32_le(reader, "OVRQ", "n_vectors")? 
as usize; check_n_vectors(vector_count)?; let payload_bytes = rankquant_payload_bytes(dim, vector_count, bits)?; check_payload_bytes(payload_bytes)?; - check_payload_matches_file(reader, "TVRQ", file_size_bytes, payload_bytes)?; + check_payload_matches_file(reader, "OVRQ", file_size_bytes, payload_bytes)?; let bytes_per_vec = rankquant_bytes_per_vec(dim, bits)?; Ok(IndexMetadata { kind: IndexKind::RankQuant, @@ -435,23 +435,23 @@ fn probe_bitmap_metadata( reader: &mut R, file_size_bytes: u64, ) -> io::Result { - let format_version = read_version(reader, "TVBM")?; - let dim = read_u32_le(reader, "TVBM", "dim")? as usize; + let format_version = read_version(reader, "OVBM")?; + let dim = read_u32_le(reader, "OVBM", "dim")? as usize; check_dim(dim)?; if !dim.is_multiple_of(64) { - return Err(invalid(format!("TVBM dim {dim} is not a multiple of 64"))); + return Err(invalid(format!("OVBM dim {dim} is not a multiple of 64"))); } - let n_top = read_u32_le(reader, "TVBM", "n_top")? as usize; + let n_top = read_u32_le(reader, "OVBM", "n_top")? as usize; if n_top == 0 || n_top >= dim { return Err(invalid(format!( - "TVBM n_top {n_top} must satisfy 0 < n_top < dim ({dim})" + "OVBM n_top {n_top} must satisfy 0 < n_top < dim ({dim})" ))); } - let vector_count = read_u32_le(reader, "TVBM", "n_vectors")? as usize; + let vector_count = read_u32_le(reader, "OVBM", "n_vectors")? 
as usize; check_n_vectors(vector_count)?; - let payload_bytes = bitmap_payload_bytes(dim, vector_count, "TVBM")?; + let payload_bytes = bitmap_payload_bytes(dim, vector_count, "OVBM")?; check_payload_bytes(payload_bytes)?; - check_payload_matches_file(reader, "TVBM", file_size_bytes, payload_bytes)?; + check_payload_matches_file(reader, "OVBM", file_size_bytes, payload_bytes)?; Ok(IndexMetadata { kind: IndexKind::Bitmap, format_version, @@ -467,14 +467,14 @@ fn probe_sign_bitmap_metadata( reader: &mut R, file_size_bytes: u64, ) -> io::Result { - let format_version = read_version(reader, "TVSB")?; - let dim = read_u32_le(reader, "TVSB", "dim")? as usize; + let format_version = read_version(reader, "OVSB")?; + let dim = read_u32_le(reader, "OVSB", "dim")? as usize; check_sign_bitmap_dim(dim)?; - let vector_count = read_u32_le(reader, "TVSB", "n_vectors")? as usize; + let vector_count = read_u32_le(reader, "OVSB", "n_vectors")? as usize; check_n_vectors(vector_count)?; - let payload_bytes = bitmap_payload_bytes(dim, vector_count, "TVSB")?; + let payload_bytes = bitmap_payload_bytes(dim, vector_count, "OVSB")?; check_payload_bytes(payload_bytes)?; - check_payload_matches_file(reader, "TVSB", file_size_bytes, payload_bytes)?; + check_payload_matches_file(reader, "OVSB", file_size_bytes, payload_bytes)?; Ok(IndexMetadata { kind: IndexKind::SignBitmap, format_version, @@ -525,18 +525,18 @@ pub(crate) fn load_rank(path: impl AsRef) -> io::Result<(usize, usize, Vec // the trailing-byte check. Both are wrong on a metadata race (NFS/procfs). let file_len = file.metadata()?.len(); let mut f = BufReader::new(file); - let magic = read_magic(&mut f, "TVR1")?; + let magic = read_magic(&mut f, "OVR1")?; if &magic != OVR_MAGIC && &magic != TVR_MAGIC { return Err(invalid("not an OVR1/TVR1 (Rank) file: wrong magic")); } - read_version(&mut f, "TVR1")?; - let dim = read_u32_le(&mut f, "TVR1", "dim")? 
as usize; + read_version(&mut f, "OVR1")?; + let dim = read_u32_le(&mut f, "OVR1", "dim")? as usize; check_dim(dim)?; - let n_vectors = read_u32_le(&mut f, "TVR1", "n_vectors")? as usize; + let n_vectors = read_u32_le(&mut f, "OVR1", "n_vectors")? as usize; check_n_vectors(n_vectors)?; let payload_bytes = rank_payload_bytes(dim, n_vectors)?; check_payload_bytes(payload_bytes)?; - check_payload_matches_file(&mut f, "TVR1", file_len, payload_bytes)?; + check_payload_matches_file(&mut f, "OVR1", file_len, payload_bytes)?; // `payload_bytes == n_vectors * dim * 2`, so the u16 element count is // `payload_bytes / 2`. Read directly into a fallibly reserved Vec // instead of allocating a byte buffer and `.collect()`-ing it — the old @@ -564,12 +564,12 @@ pub(crate) fn load_rank(path: impl AsRef) -> io::Result<(usize, usize, Vec let ri = r as usize; if ri >= dim { return Err(invalid(format!( - "TVR1 rank value {r} >= dim ({dim}); ranks must be a permutation of [0, dim)" + "OVR1 rank value {r} >= dim ({dim}); ranks must be a permutation of [0, dim)" ))); } if seen[ri] == stamp { return Err(invalid(format!( - "TVR1 row {row_idx} is not a permutation of [0, dim): value {r} repeats" + "OVR1 row {row_idx} is not a permutation of [0, dim): value {r} repeats" ))); } seen[ri] = stamp; @@ -617,18 +617,18 @@ pub(crate) fn load_rankquant(path: impl AsRef) -> io::Result<(u8, usize, u // the trailing-byte check. Both are wrong on a metadata race (NFS/procfs). 
let file_len = file.metadata()?.len(); let mut f = BufReader::new(file); - let magic = read_magic(&mut f, "TVRQ")?; + let magic = read_magic(&mut f, "OVRQ")?; if &magic != OVRQ_MAGIC && &magic != TVRQ_MAGIC { return Err(invalid("not an OVRQ/TVRQ (RankQuant) file: wrong magic")); } - read_version(&mut f, "TVRQ")?; - let bits = read_u8_field(&mut f, "TVRQ", "bits")?; + read_version(&mut f, "OVRQ")?; + let bits = read_u8_field(&mut f, "OVRQ", "bits")?; if !matches!(bits, 1 | 2 | 4) { return Err(invalid(format!( "unsupported TVRQ bits: {bits} (expected 1, 2, or 4)" ))); } - let dim = read_u32_le(&mut f, "TVRQ", "dim")? as usize; + let dim = read_u32_le(&mut f, "OVRQ", "dim")? as usize; check_dim(dim)?; // Constant-composition invariants (documented at module level and // enforced by `RankQuant::new`): `dim` must be a multiple of @@ -640,21 +640,21 @@ pub(crate) fn load_rankquant(path: impl AsRef) -> io::Result<(u8, usize, u let n_buckets = 1usize << bits; if !dim.is_multiple_of(n_buckets) { return Err(invalid(format!( - "TVRQ dim {dim} is not a multiple of 2^bits = {n_buckets}; \ + "OVRQ dim {dim} is not a multiple of 2^bits = {n_buckets}; \ constant-composition invariant violated" ))); } let codes_per_byte = (8 / bits) as usize; if !dim.is_multiple_of(codes_per_byte) { return Err(invalid(format!( - "TVRQ dim {dim} is not a multiple of codes_per_byte = {codes_per_byte}" + "OVRQ dim {dim} is not a multiple of codes_per_byte = {codes_per_byte}" ))); } - let n_vectors = read_u32_le(&mut f, "TVRQ", "n_vectors")? as usize; + let n_vectors = read_u32_le(&mut f, "OVRQ", "n_vectors")? 
as usize; check_n_vectors(n_vectors)?; let payload_bytes = rankquant_payload_bytes(dim, n_vectors, bits)?; check_payload_bytes(payload_bytes)?; - check_payload_matches_file(&mut f, "TVRQ", file_len, payload_bytes)?; + check_payload_matches_file(&mut f, "OVRQ", file_len, payload_bytes)?; let mut packed = try_alloc_zeroed(payload_bytes)?; f.read_exact(&mut packed)?; // Constant-composition invariant: every document must place exactly @@ -679,7 +679,7 @@ pub(crate) fn load_rankquant(path: impl AsRef) -> io::Result<(u8, usize, u for (bucket, &count) in hist[..n_buckets].iter().enumerate() { if count != expected_per_bucket { return Err(invalid(format!( - "TVRQ row {row_idx} violates constant composition: bucket {bucket} \ + "OVRQ row {row_idx} violates constant composition: bucket {bucket} \ has {count} codes, expected {expected_per_bucket} (= dim / 2^bits)" ))); } @@ -704,7 +704,7 @@ pub(crate) fn write_bitmap( // Enforce the loaders' MAX_PAYLOAD cap *before* File::create (defense-in- // depth; a rejected write must not truncate an existing file). Mirrors // load_bitmap. - let payload_bytes = bitmap_payload_bytes(dim, n_vectors, "TVBM")?; + let payload_bytes = bitmap_payload_bytes(dim, n_vectors, "OVBM")?; check_payload_bytes(payload_bytes)?; assert_eq!(bitmaps.len(), payload_bytes / 8); let mut f = BufWriter::new(File::create(path)?); @@ -729,28 +729,28 @@ pub(crate) fn load_bitmap(path: impl AsRef) -> io::Result<(usize, usize, u // the trailing-byte check. Both are wrong on a metadata race (NFS/procfs). let file_len = file.metadata()?.len(); let mut f = BufReader::new(file); - let magic = read_magic(&mut f, "TVBM")?; + let magic = read_magic(&mut f, "OVBM")?; if &magic != OVBM_MAGIC && &magic != TVBM_MAGIC { return Err(invalid("not an OVBM/TVBM (Bitmap) file: wrong magic")); } - read_version(&mut f, "TVBM")?; - let dim = read_u32_le(&mut f, "TVBM", "dim")? as usize; + read_version(&mut f, "OVBM")?; + let dim = read_u32_le(&mut f, "OVBM", "dim")? 
as usize; check_dim(dim)?; if !dim.is_multiple_of(64) { - return Err(invalid(format!("TVBM dim {dim} is not a multiple of 64"))); + return Err(invalid(format!("OVBM dim {dim} is not a multiple of 64"))); } - let n_top = read_u32_le(&mut f, "TVBM", "n_top")? as usize; + let n_top = read_u32_le(&mut f, "OVBM", "n_top")? as usize; if n_top == 0 || n_top >= dim { return Err(invalid(format!( - "TVBM n_top {n_top} must satisfy 0 < n_top < dim ({dim})" + "OVBM n_top {n_top} must satisfy 0 < n_top < dim ({dim})" ))); } - let n_vectors = read_u32_le(&mut f, "TVBM", "n_vectors")? as usize; + let n_vectors = read_u32_le(&mut f, "OVBM", "n_vectors")? as usize; check_n_vectors(n_vectors)?; let qpv = dim / 64; - let payload_bytes = bitmap_payload_bytes(dim, n_vectors, "TVBM")?; + let payload_bytes = bitmap_payload_bytes(dim, n_vectors, "OVBM")?; check_payload_bytes(payload_bytes)?; - check_payload_matches_file(&mut f, "TVBM", file_len, payload_bytes)?; + check_payload_matches_file(&mut f, "OVBM", file_len, payload_bytes)?; // `payload_bytes == n_vectors * qpv * 8`, so the u64 element count is // `payload_bytes / 8`. Read directly into a fallibly reserved Vec // rather than allocating a byte buffer and `.collect()`-ing it. @@ -765,7 +765,7 @@ pub(crate) fn load_bitmap(path: impl AsRef) -> io::Result<(usize, usize, u let pop: u32 = row.iter().map(|w| w.count_ones()).sum(); if pop as usize != n_top { return Err(invalid(format!( - "TVBM row {row_idx} has {pop} bits set, expected n_top = {n_top}" + "OVBM row {row_idx} has {pop} bits set, expected n_top = {n_top}" ))); } } @@ -795,7 +795,7 @@ pub(crate) fn write_sign_bitmap( // Enforce the loaders' MAX_PAYLOAD cap *before* File::create (defense-in- // depth; a rejected write must not truncate an existing file). Mirrors // load_sign_bitmap. 
- let payload_bytes = bitmap_payload_bytes(dim, n_vectors, "TVSB")?; + let payload_bytes = bitmap_payload_bytes(dim, n_vectors, "OVSB")?; check_payload_bytes(payload_bytes)?; assert_eq!(bitmaps.len(), payload_bytes / 8); let mut f = BufWriter::new(File::create(path)?); @@ -834,18 +834,18 @@ pub(crate) fn load_sign_bitmap(path: impl AsRef) -> io::Result<(usize, usi // the trailing-byte check. Both are wrong on a metadata race (NFS/procfs). let file_len = file.metadata()?.len(); let mut f = BufReader::new(file); - let magic = read_magic(&mut f, "TVSB")?; + let magic = read_magic(&mut f, "OVSB")?; if &magic != OVSB_MAGIC && &magic != TVSB_MAGIC { return Err(invalid("not an OVSB/TVSB (SignBitmap) file: wrong magic")); } - read_version(&mut f, "TVSB")?; - let dim = read_u32_le(&mut f, "TVSB", "dim")? as usize; + read_version(&mut f, "OVSB")?; + let dim = read_u32_le(&mut f, "OVSB", "dim")? as usize; check_sign_bitmap_dim(dim)?; - let n_vectors = read_u32_le(&mut f, "TVSB", "n_vectors")? as usize; + let n_vectors = read_u32_le(&mut f, "OVSB", "n_vectors")? as usize; check_n_vectors(n_vectors)?; - let payload_bytes = bitmap_payload_bytes(dim, n_vectors, "TVSB")?; + let payload_bytes = bitmap_payload_bytes(dim, n_vectors, "OVSB")?; check_payload_bytes(payload_bytes)?; - check_payload_matches_file(&mut f, "TVSB", file_len, payload_bytes)?; + check_payload_matches_file(&mut f, "OVSB", file_len, payload_bytes)?; // `payload_bytes == n_vectors * qpv * 8`, so the u64 element count is // `payload_bytes / 8`. Read directly into a fallibly reserved Vec // rather than allocating a byte buffer and `.collect()`-ing it. 
@@ -1064,7 +1064,7 @@ mod tests { assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof); assert!( err.to_string() - .contains("TVR1 header truncated while reading dim"), + .contains("OVR1 header truncated while reading dim"), "unexpected error: {err}" ); std::fs::remove_file(&truncated).ok(); @@ -1072,7 +1072,7 @@ mod tests { let length_mismatch = forge("length_mismatch", &rank_header(8, 1)); assert_err_contains( probe_index_metadata(&length_mismatch), - "TVR1 payload truncated", + "OVR1 payload truncated", ); std::fs::remove_file(&length_mismatch).ok(); @@ -1102,12 +1102,12 @@ mod tests { ( "rank_version", b"TVR1".to_vec(), - "TVR1 header truncated while reading version", + "OVR1 header truncated while reading version", ), ( "rankquant_bits", b"TVRQ\x01".to_vec(), - "TVRQ header truncated while reading bits", + "OVRQ header truncated while reading bits", ), ( "bitmap_n_top", @@ -1118,7 +1118,7 @@ mod tests { v.extend_from_slice(&64u32.to_le_bytes()); v }, - "TVBM header truncated while reading n_top", + "OVBM header truncated while reading n_top", ), ( "sign_n_vectors", @@ -1129,7 +1129,7 @@ mod tests { v.extend_from_slice(&64u32.to_le_bytes()); v }, - "TVSB header truncated while reading n_vectors", + "OVSB header truncated while reading n_vectors", ), ]; for (suffix, bytes, expected) in cases { @@ -1142,24 +1142,24 @@ mod tests { #[test] fn probe_reports_distinct_payload_truncation_and_trailing_bytes_for_all_formats() { let cases: [(&str, Vec, Vec, &str); 4] = [ - ("rank", rank_header(8, 1), rank_header(8, 0), "TVR1"), + ("rank", rank_header(8, 1), rank_header(8, 0), "OVR1"), ( "rankquant", rankquant_header(2, 8, 1), rankquant_header(2, 8, 0), - "TVRQ", + "OVRQ", ), ( "bitmap", bitmap_header(64, 16, 1), bitmap_header(64, 16, 0), - "TVBM", + "OVBM", ), ( "sign_bitmap", sign_bitmap_header(64, 1), sign_bitmap_header(64, 0), - "TVSB", + "OVSB", ), ]; diff --git a/tests/index/loader_validation.rs b/tests/index/loader_validation.rs index a454d5c..bc6fae4 100644 
--- a/tests/index/loader_validation.rs +++ b/tests/index/loader_validation.rs @@ -98,7 +98,7 @@ fn load_rank_rejects_non_permutation_row() { // Positive control: the valid file round-trips. assert!(Rank::load(&p).is_ok(), "valid Rank file must load"); - // TVR1 header is 13 bytes; payload is u16 LE ranks. Force ranks[1] == + // OVR1 header is 13 bytes; payload is u16 LE ranks. Force ranks[1] == // ranks[0] in row 0, turning the row into a non-permutation (a repeat). let mut bytes = read_bytes(&p); let (a, b) = (13usize, 15usize); // byte offsets of the first two u16 ranks @@ -126,7 +126,7 @@ fn load_rankquant_rejects_skewed_composition() { "valid RankQuant file must load" ); - // TVRQ header is 14 bytes. Zero the entire packed payload so every + // OVRQ header is 14 bytes. Zero the entire packed payload so every // coordinate decodes to bucket 0 — a maximally skewed composition that // violates the dim/2^bits-per-bucket invariant on the very first row. let mut bytes = read_bytes(&p); @@ -153,7 +153,7 @@ fn load_bitmap_rejects_wrong_popcount_row() { idx.write(&p).unwrap(); assert!(Bitmap::load(&p).is_ok(), "valid Bitmap file must load"); - // TVBM header is 17 bytes; payload is u64 LE words, qpv = dim/64 per doc. + // OVBM header is 17 bytes; payload is u64 LE words, qpv = dim/64 per doc. // Zero the first document's whole row so its popcount becomes 0 != n_top. let qpv = D / 64; let mut bytes = read_bytes(&p); @@ -183,7 +183,7 @@ fn load_sign_bitmap_accepts_any_bit_pattern() { let p = tmp("sb_any"); idx.write(&p).unwrap(); - // TVSB header is 13 bytes. Flip bits across the payload; the result is + // OVSB header is 13 bytes. Flip bits across the payload; the result is // still a structurally valid sign bitmap of the same shape. 
let mut bytes = read_bytes(&p); for byte in bytes.iter_mut().skip(13) { @@ -208,29 +208,29 @@ fn public_loaders_report_stable_malformed_payload_context() { let bitmap = bitmap_payload_cases(64, 16); let sign_bitmap = sign_bitmap_payload_cases(64); let cases: [(&str, Vec, Vec, &str); 4] = [ - ("rank", rank.0, rank.1, "TVR1"), - ("rankquant", rankquant.0, rankquant.1, "TVRQ"), - ("bitmap", bitmap.0, bitmap.1, "TVBM"), - ("sign_bitmap", sign_bitmap.0, sign_bitmap.1, "TVSB"), + ("rank", rank.0, rank.1, "OVR1"), + ("rankquant", rankquant.0, rankquant.1, "OVRQ"), + ("bitmap", bitmap.0, bitmap.1, "OVBM"), + ("sign_bitmap", sign_bitmap.0, sign_bitmap.1, "OVSB"), ]; for (suffix, truncated_header, mut trailing_bytes, label) in cases { let truncated = tmp(&format!("{suffix}_truncated_context")); write_bytes(&truncated, &truncated_header); match label { - "TVR1" => assert_load_err_contains( + "OVR1" => assert_load_err_contains( Rank::load(&truncated), &format!("{label} payload truncated"), ), - "TVRQ" => assert_load_err_contains( + "OVRQ" => assert_load_err_contains( RankQuant::load(&truncated), &format!("{label} payload truncated"), ), - "TVBM" => assert_load_err_contains( + "OVBM" => assert_load_err_contains( Bitmap::load(&truncated), &format!("{label} payload truncated"), ), - "TVSB" => assert_load_err_contains( + "OVSB" => assert_load_err_contains( SignBitmap::load(&truncated), &format!("{label} payload truncated"), ), @@ -242,19 +242,19 @@ fn public_loaders_report_stable_malformed_payload_context() { let trailing = tmp(&format!("{suffix}_trailing_context")); write_bytes(&trailing, &trailing_bytes); match label { - "TVR1" => assert_load_err_contains( + "OVR1" => assert_load_err_contains( Rank::load(&trailing), &format!("{label} payload has trailing bytes"), ), - "TVRQ" => assert_load_err_contains( + "OVRQ" => assert_load_err_contains( RankQuant::load(&trailing), &format!("{label} payload has trailing bytes"), ), - "TVBM" => assert_load_err_contains( + "OVBM" => 
assert_load_err_contains( Bitmap::load(&trailing), &format!("{label} payload has trailing bytes"), ), - "TVSB" => assert_load_err_contains( + "OVSB" => assert_load_err_contains( SignBitmap::load(&trailing), &format!("{label} payload has trailing bytes"), ), From c0991a0c1aa2134d9b75aaff17cd1eba6f278895 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Mon, 15 Jun 2026 11:19:40 -0500 Subject: [PATCH 4/4] fix(rank_io): canonicalize remaining TVRQ invalid-bits error + SignBitmap doc magic (codex) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two stale `TV*` labels slipped past the first canonicalization pass because they were not in the `"TV* ` quote-prefix form: - The RankQuant probe + loader emitted `"unsupported TVRQ bits: {bits} ..."` (TVRQ mid-string, after `unsupported `) even though the `bits` field is read with the `OVRQ` label — now `OVRQ`. - The SignBitmap on-disk-format doc table showed `magic = TVSB` / "shorter than TVBM"; the written magic is `OVSB`/`OVBM` — updated for accuracy. Remaining `TV*` references are intentional: the legacy-format module docs ("also reads TVR1"), the dual-magic acceptance code + mismatch messages ("OV*/TV*"), and the back-compat test fixtures/expectations that forge and load legacy `TV*` files. Full suite green; ffi header unaffected. Signed-off-by: Nelson Spence --- src/rank_io.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/rank_io.rs b/src/rank_io.rs index 3a4bde3..33376c5 100644 --- a/src/rank_io.rs +++ b/src/rank_io.rs @@ -396,7 +396,7 @@ fn probe_rankquant_metadata( let bits = read_u8_field(reader, "OVRQ", "bits")?; if !matches!(bits, 1 | 2 | 4) { return Err(invalid(format!( - "unsupported TVRQ bits: {bits} (expected 1, 2, or 4)" + "unsupported OVRQ bits: {bits} (expected 1, 2, or 4)" ))); } let dim = read_u32_le(reader, "OVRQ", "dim")?
as usize; @@ -625,7 +625,7 @@ pub(crate) fn load_rankquant(path: impl AsRef) -> io::Result<(u8, usize, u let bits = read_u8_field(&mut f, "OVRQ", "bits")?; if !matches!(bits, 1 | 2 | 4) { return Err(invalid(format!( - "unsupported TVRQ bits: {bits} (expected 1, 2, or 4)" + "unsupported OVRQ bits: {bits} (expected 1, 2, or 4)" ))); } let dim = read_u32_le(&mut f, "OVRQ", "dim")? as usize; @@ -778,13 +778,13 @@ pub(crate) fn load_bitmap(path: impl AsRef) -> io::Result<(usize, usize, u /// /// | offset | bytes | field | /// |-------:|:-----:|-----------------------------| -/// | 0 | 4 | magic = `TVSB` | +/// | 0 | 4 | magic = `OVSB` | /// | 4 | 1 | version = 1 | /// | 5 | 4 | `dim` (u32) | /// | 9 | 4 | `n_vectors` (u32) | /// | 13 | … | `n_vectors * dim/64` u64s | /// -/// 13-byte header — one u32 shorter than `TVBM` because SignBitmap +/// 13-byte header — one u32 shorter than `OVBM` because SignBitmap /// has no `n_top` parameter (the threshold is fixed at zero). pub(crate) fn write_sign_bitmap( path: impl AsRef,