From a1bc2be3b8dbe86c28e3aa3c1f756eadf9289c79 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Tue, 2 Jun 2026 16:14:36 -0500 Subject: [PATCH 1/3] Document persisted format compatibility Signed-off-by: Nelson Spence --- README.md | 3 +- docs/INDEX_PROVENANCE.md | 4 +- docs/PERSISTED_FORMAT.md | 199 +++++++++++++++++++++++++++++++++++ tests/persistence_compat.rs | 202 ++++++++++++++++++++++++++++++++++++ 4 files changed, 406 insertions(+), 2 deletions(-) create mode 100644 docs/PERSISTED_FORMAT.md create mode 100644 tests/persistence_compat.rs diff --git a/README.md b/README.md index e920160..b4e9846 100644 --- a/README.md +++ b/README.md @@ -270,7 +270,8 @@ The full GitHub checkout includes a publish=false sidecar CLI, header metadata, row identity, and attestation shape checks. It does not sign artifacts, manage keys, or decide deployment trust policy. No in-format crypto is shipped because it would add key management the library can't own. See -[`docs/INDEX_PROVENANCE.md`](https://github.com/Fieldnote-Echo/ordvec/blob/main/docs/INDEX_PROVENANCE.md) +[`docs/PERSISTED_FORMAT.md`](https://github.com/Fieldnote-Echo/ordvec/blob/main/docs/PERSISTED_FORMAT.md), +[`docs/INDEX_PROVENANCE.md`](https://github.com/Fieldnote-Echo/ordvec/blob/main/docs/INDEX_PROVENANCE.md), and [`THREAT_MODEL.md`](https://github.com/Fieldnote-Echo/ordvec/blob/main/THREAT_MODEL.md) in the full repository. diff --git a/docs/INDEX_PROVENANCE.md b/docs/INDEX_PROVENANCE.md index be3ab2b..0b74eb8 100644 --- a/docs/INDEX_PROVENANCE.md +++ b/docs/INDEX_PROVENANCE.md @@ -4,7 +4,9 @@ reloads them through `Rank::load`, `RankQuant::load`, `Bitmap::load`, and `SignBitmap::load`. This note states exactly **what the loaders guarantee and what they do not**, so you can decide whether an index file needs out-of-band -verification before you load it. +verification before you load it. 
For the byte layout, versioning, and +compatibility policy of the persisted formats themselves, see +[`PERSISTED_FORMAT.md`](PERSISTED_FORMAT.md). ## What the loaders validate diff --git a/docs/PERSISTED_FORMAT.md b/docs/PERSISTED_FORMAT.md new file mode 100644 index 0000000..b5306b1 --- /dev/null +++ b/docs/PERSISTED_FORMAT.md @@ -0,0 +1,199 @@ +# Persisted Index Format + +This document is the compatibility contract for ordvec persisted index files. +It covers the primitive index artifacts only: `.tvr`, `.tvrq`, `.tvbm`, and +`.tvsb`. It does not define a database, transaction log, replication protocol, +provenance system, checksum manifest, signature, or trust policy. + +All integer fields are little-endian. Each format has one fixed header followed +by one contiguous payload. The payload must consume the rest of the file +exactly; v1 files have no footer, reserved trailing bytes, or extension block. + +## Compatibility Policy + +The current on-disk format version is `1` for every persisted index family. +Within the v1 contract: + +- loaders and `probe_index_metadata()` reject unknown magic, unsupported + versions, malformed header fields, impossible dimensions, impossible row + counts, payload-size overflow, short payloads, and trailing bytes; +- writers emit only v1 files matching the layouts below; +- `probe_index_metadata()` is the allocation-resistant preflight path for host + stores and sidecar manifests; +- full loaders additionally validate payload row invariants before search or + SIMD paths can observe the state. + +A breaking persisted-format change requires one of: + +- a new magic value; +- a format-version bump with documented rejection or migration behavior; +- a clearly documented migration tool that rewrites old bytes into the new + layout. 
+ +Examples of breaking changes include changing endianness, changing fixed header +order or width, adding a trailing section, changing RankQuant packing order, +changing row-invariant interpretation, changing the primitive score assigned to +stored bytes, or assigning new semantics to an existing magic/version pair. +Strengthening rejection of malformed files is not a format break when valid v1 +writer output still loads. + +Rust API and release SemVer policies are tracked separately from this +byte-format contract. + +## Metadata + +`probe_index_metadata(path)` returns the segment descriptor host systems should +cache in their own manifests: + +- `kind`: `Rank`, `RankQuant`, `Bitmap`, or `SignBitmap`; +- `format_version`: currently `1`; +- `dim`: vector dimension declared by the file; +- `vector_count`: number of stored documents; +- `bytes_per_vec`: payload bytes per stored document; +- `params`: format-specific parameters such as RankQuant `bits` or Bitmap + `n_top`; +- `file_size_bytes`: total observed file size. + +Example external segment entry: + +```json +{ + "path": "segments/shard-0007/index.tvrq", + "sha256": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "metadata": { + "kind": "RankQuant", + "format_version": 1, + "dim": 1024, + "vector_count": 1250000, + "bytes_per_vec": 256, + "params": { "bits": 2 }, + "file_size_bytes": 320000014 + } +} +``` + +The metadata describes byte shape, not trust. If an artifact crosses a trust +boundary, bind the file bytes to a checksum, signature, attestation, or +application-owned manifest before loading. + +## Score Semantics + +The `format_version` is also the primitive score-semantics version for the +bytes under that magic. A valid v1 artifact must keep the same interpretation +of stored rank, bucket, bitmap, or sign bytes when computing per-row primitive +scores. 
A future change that makes identical persisted bytes produce different +primitive scores requires a new magic, a version bump with documented rejection +or migration behavior, or a documented migration tool. + +This contract does not freeze composed retrieval policy. Backend choice, +candidate-count selection, and ordering among equal scores are tracked outside +the byte-format contract unless they change the primitive score assigned to a +persisted row. + +## Format Layouts + +### Rank (`.tvr`, magic `TVR1`) + +Header: + +| Offset | Bytes | Field | +| ---: | ---: | --- | +| 0 | 4 | magic `TVR1` | +| 4 | 1 | format version `1` | +| 5 | 4 | `dim` as `u32` little-endian | +| 9 | 4 | `n_vectors` as `u32` little-endian | + +Payload: `n_vectors * dim` `u16` values, each little-endian. Each row must be a +permutation of `[0, dim)`. + +Probe metadata: + +- `kind = Rank` +- `params = Rank` +- `bytes_per_vec = dim * 2` + +### RankQuant (`.tvrq`, magic `TVRQ`) + +Header: + +| Offset | Bytes | Field | +| ---: | ---: | --- | +| 0 | 4 | magic `TVRQ` | +| 4 | 1 | format version `1` | +| 5 | 1 | `bits` as `u8`, one of `1`, `2`, or `4` | +| 6 | 4 | `dim` as `u32` little-endian | +| 10 | 4 | `n_vectors` as `u32` little-endian | + +Payload: `n_vectors * dim * bits / 8` packed bytes. Bucket codes are packed +MSB-first within each byte. For `bits = 2`, the first coordinate occupies bits +7..6 of the byte, the second coordinate bits 5..4, the third bits 3..2, and +the fourth bits 1..0. + +`dim` must be divisible by both `1 << bits` and `8 / bits`. Each row must have +constant composition: exactly `dim / (1 << bits)` coordinates in every bucket. 
+ +Probe metadata: + +- `kind = RankQuant` +- `params = RankQuant { bits }` +- `bytes_per_vec = dim * bits / 8` + +### Bitmap (`.tvbm`, magic `TVBM`) + +Header: + +| Offset | Bytes | Field | +| ---: | ---: | --- | +| 0 | 4 | magic `TVBM` | +| 4 | 1 | format version `1` | +| 5 | 4 | `dim` as `u32` little-endian | +| 9 | 4 | `n_top` as `u32` little-endian | +| 13 | 4 | `n_vectors` as `u32` little-endian | + +Payload: `n_vectors * dim / 64` `u64` bitmap words, each little-endian. `dim` +must be a multiple of 64. Each row must have exactly `n_top` bits set. + +Probe metadata: + +- `kind = Bitmap` +- `params = Bitmap { n_top }` +- `bytes_per_vec = dim / 8` + +### SignBitmap (`.tvsb`, magic `TVSB`) + +Header: + +| Offset | Bytes | Field | +| ---: | ---: | --- | +| 0 | 4 | magic `TVSB` | +| 4 | 1 | format version `1` | +| 5 | 4 | `dim` as `u32` little-endian | +| 9 | 4 | `n_vectors` as `u32` little-endian | + +Payload: `n_vectors * dim / 64` `u64` bitmap words, each little-endian. `dim` +must be a multiple of 64 and within `MAX_SIGN_BITMAP_DIM`. Any bit pattern is a +valid sign-bitmap row; there is no per-row popcount invariant. + +Probe metadata: + +- `kind = SignBitmap` +- `params = SignBitmap` +- `bytes_per_vec = dim / 8` + +## Probe Versus Load + +`probe_index_metadata()` validates fixed headers, parameter domains, checked +payload byte counts, and exact file length without reading payload rows. Use it +when a host system wants to inspect an artifact before allocation or before +choosing a loader. + +The full loaders validate everything the probe validates and then inspect row +payload invariants: + +- `Rank::load`: each row is a permutation of `[0, dim)`; +- `RankQuant::load`: each row has the required constant bucket composition; +- `Bitmap::load`: each row has exactly `n_top` bits set; +- `SignBitmap::load`: no additional row invariant exists. + +Loader success is the primitive binary-safety boundary. It is not a provenance +or deployment-policy decision. 
diff --git a/tests/persistence_compat.rs b/tests/persistence_compat.rs new file mode 100644 index 0000000..fbad967 --- /dev/null +++ b/tests/persistence_compat.rs @@ -0,0 +1,202 @@ +//! Persisted byte-format compatibility fixtures. +//! +//! These are deliberately tiny committed byte expectations, not round trips +//! that only prove the current writer can feed the current loader. + +use std::io::Write; +use std::path::{Path, PathBuf}; + +use ordvec::{probe_index_metadata, Bitmap, IndexKind, IndexParams, Rank, RankQuant, SignBitmap}; + +fn tmp(name: &str) -> PathBuf { + let nonce = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!( + "ordvec_persistence_compat_{}_{}_{}.bin", + name, + std::process::id(), + nonce + )) +} + +fn write_bytes(path: &Path, bytes: &[u8]) { + std::fs::File::create(path) + .unwrap() + .write_all(bytes) + .unwrap(); +} + +fn assert_metadata( + path: &Path, + kind: IndexKind, + dim: usize, + vector_count: usize, + bytes_per_vec: usize, + params: IndexParams, + file_size_bytes: u64, +) { + let meta = probe_index_metadata(path).unwrap(); + assert_eq!(meta.kind, kind); + assert_eq!(meta.format_version, 1); + assert_eq!(meta.dim, dim); + assert_eq!(meta.vector_count, vector_count); + assert_eq!(meta.bytes_per_vec, bytes_per_vec); + assert_eq!(meta.params, params); + assert_eq!(meta.file_size_bytes, file_size_bytes); +} + +fn assert_rejects_version_and_trailing_bytes( + name: &str, + expected: &[u8], + load: impl Fn(&Path) -> std::io::Result, +) { + let path = tmp(name); + + let mut unsupported_version = expected.to_vec(); + unsupported_version[4] = 2; + write_bytes(&path, &unsupported_version); + assert!(probe_index_metadata(&path).is_err()); + assert!(load(&path).is_err()); + std::fs::remove_file(&path).ok(); + + let mut trailing = expected.to_vec(); + trailing.push(0); + write_bytes(&path, &trailing); + assert!(probe_index_metadata(&path).is_err()); + 
assert!(load(&path).is_err()); + std::fs::remove_file(&path).ok(); +} + +#[test] +fn rank_v1_fixture_bytes_are_stable() { + let expected = [ + b'T', b'V', b'R', b'1', 1, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, + ]; + let path = tmp("rank"); + + let mut index = Rank::new(4); + index.add(&[0.0, 1.0, 2.0, 3.0]); + index.write(&path).unwrap(); + assert_eq!(std::fs::read(&path).unwrap(), expected); + assert_metadata( + &path, + IndexKind::Rank, + 4, + 1, + 8, + IndexParams::Rank, + expected.len() as u64, + ); + std::fs::remove_file(&path).ok(); + + write_bytes(&path, &expected); + let loaded = Rank::load(&path).unwrap(); + assert_eq!(loaded.dim(), 4); + assert_eq!(loaded.len(), 1); + std::fs::remove_file(&path).ok(); + + assert_rejects_version_and_trailing_bytes("rank", &expected, |path| Rank::load(path)); +} + +#[test] +fn rankquant_v1_fixture_bytes_are_stable() { + let expected = [ + b'T', b'V', b'R', b'Q', 1, 2, 8, 0, 0, 0, 1, 0, 0, 0, 0x05, 0xaf, + ]; + let path = tmp("rankquant"); + + let mut index = RankQuant::new(8, 2); + index.add(&[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]); + index.write(&path).unwrap(); + assert_eq!(std::fs::read(&path).unwrap(), expected); + assert_metadata( + &path, + IndexKind::RankQuant, + 8, + 1, + 2, + IndexParams::RankQuant { bits: 2 }, + expected.len() as u64, + ); + std::fs::remove_file(&path).ok(); + + write_bytes(&path, &expected); + let loaded = RankQuant::load(&path).unwrap(); + assert_eq!(loaded.dim(), 8); + assert_eq!(loaded.len(), 1); + assert_eq!(loaded.bits(), 2); + std::fs::remove_file(&path).ok(); + + assert_rejects_version_and_trailing_bytes("rankquant", &expected, |path| RankQuant::load(path)); +} + +#[test] +fn bitmap_v1_fixture_bytes_are_stable() { + let expected = [ + b'T', b'V', b'B', b'M', 1, 64, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xc0, + ]; + let path = tmp("bitmap"); + + let mut index = Bitmap::new(64, 2); + let vector: Vec = (0..64).map(|value| value as f32).collect(); + 
index.add(&vector); + index.write(&path).unwrap(); + assert_eq!(std::fs::read(&path).unwrap(), expected); + assert_metadata( + &path, + IndexKind::Bitmap, + 64, + 1, + 8, + IndexParams::Bitmap { n_top: 2 }, + expected.len() as u64, + ); + std::fs::remove_file(&path).ok(); + + write_bytes(&path, &expected); + let loaded = Bitmap::load(&path).unwrap(); + assert_eq!(loaded.dim(), 64); + assert_eq!(loaded.len(), 1); + assert_eq!(loaded.n_top(), 2); + std::fs::remove_file(&path).ok(); + + assert_rejects_version_and_trailing_bytes("bitmap", &expected, |path| Bitmap::load(path)); +} + +#[test] +fn sign_bitmap_v1_fixture_bytes_are_stable() { + let expected = [ + b'T', b'V', b'S', b'B', 1, 64, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0x80, + ]; + let path = tmp("sign_bitmap"); + + let mut index = SignBitmap::new(64); + let mut vector = vec![-1.0; 64]; + vector[0] = 1.0; + vector[63] = 1.0; + index.add(&vector); + index.write(&path).unwrap(); + assert_eq!(std::fs::read(&path).unwrap(), expected); + assert_metadata( + &path, + IndexKind::SignBitmap, + 64, + 1, + 8, + IndexParams::SignBitmap, + expected.len() as u64, + ); + std::fs::remove_file(&path).ok(); + + write_bytes(&path, &expected); + let loaded = SignBitmap::load(&path).unwrap(); + assert_eq!(loaded.dim(), 64); + assert_eq!(loaded.len(), 1); + std::fs::remove_file(&path).ok(); + + assert_rejects_version_and_trailing_bytes("sign_bitmap", &expected, |path| { + SignBitmap::load(path) + }); +} From a683938251f2b2468f4873f4fa5663b7bb90e86e Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Wed, 3 Jun 2026 09:30:32 -0500 Subject: [PATCH 2/3] Address persisted format review findings Signed-off-by: Nelson Spence --- docs/PERSISTED_FORMAT.md | 10 ++++---- tests/persistence_compat.rs | 48 ++++++++++++++++++++++++++----------- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/docs/PERSISTED_FORMAT.md b/docs/PERSISTED_FORMAT.md index b5306b1..4c3da8c 100644 --- a/docs/PERSISTED_FORMAT.md +++ 
b/docs/PERSISTED_FORMAT.md @@ -104,7 +104,7 @@ Header: | 9 | 4 | `n_vectors` as `u32` little-endian | Payload: `n_vectors * dim` `u16` values, each little-endian. Each row must be a -permutation of `[0, dim)`. +permutation of `[0, dim)`. `dim` must be in `[2, 65535]`. Probe metadata: @@ -129,8 +129,9 @@ MSB-first within each byte. For `bits = 2`, the first coordinate occupies bits 7..6 of the byte, the second coordinate bits 5..4, the third bits 3..2, and the fourth bits 1..0. -`dim` must be divisible by both `1 << bits` and `8 / bits`. Each row must have -constant composition: exactly `dim / (1 << bits)` coordinates in every bucket. +`dim` must be in `[2, 65535]` and divisible by both `1 << bits` and +`8 / bits`. Each row must have constant composition: exactly +`dim / (1 << bits)` coordinates in every bucket. Probe metadata: @@ -151,7 +152,8 @@ Header: | 13 | 4 | `n_vectors` as `u32` little-endian | Payload: `n_vectors * dim / 64` `u64` bitmap words, each little-endian. `dim` -must be a multiple of 64. Each row must have exactly `n_top` bits set. +must be in `[2, 65535]` and a multiple of 64. Each row must have exactly +`n_top` bits set. Probe metadata: diff --git a/tests/persistence_compat.rs b/tests/persistence_compat.rs index fbad967..34cf6ad 100644 --- a/tests/persistence_compat.rs +++ b/tests/persistence_compat.rs @@ -4,21 +4,47 @@ //! that only prove the current writer can feed the current loader. 
use std::io::Write; +use std::ops::Deref; use std::path::{Path, PathBuf}; use ordvec::{probe_index_metadata, Bitmap, IndexKind, IndexParams, Rank, RankQuant, SignBitmap}; -fn tmp(name: &str) -> PathBuf { +struct TempFile { + path: PathBuf, +} + +impl AsRef for TempFile { + fn as_ref(&self) -> &Path { + &self.path + } +} + +impl Deref for TempFile { + type Target = Path; + + fn deref(&self) -> &Self::Target { + &self.path + } +} + +impl Drop for TempFile { + fn drop(&mut self) { + let _ = std::fs::remove_file(&self.path); + } +} + +fn tmp(name: &str) -> TempFile { let nonce = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) - .unwrap() + .unwrap_or_default() .as_nanos(); - std::env::temp_dir().join(format!( + let path = std::env::temp_dir().join(format!( "ordvec_persistence_compat_{}_{}_{}.bin", name, std::process::id(), nonce - )) + )); + TempFile { path } } fn write_bytes(path: &Path, bytes: &[u8]) { @@ -52,6 +78,10 @@ fn assert_rejects_version_and_trailing_bytes( expected: &[u8], load: impl Fn(&Path) -> std::io::Result, ) { + assert!( + expected.len() > 4, + "expected fixture must include a version byte at index 4" + ); let path = tmp(name); let mut unsupported_version = expected.to_vec(); @@ -59,14 +89,12 @@ fn assert_rejects_version_and_trailing_bytes( write_bytes(&path, &unsupported_version); assert!(probe_index_metadata(&path).is_err()); assert!(load(&path).is_err()); - std::fs::remove_file(&path).ok(); let mut trailing = expected.to_vec(); trailing.push(0); write_bytes(&path, &trailing); assert!(probe_index_metadata(&path).is_err()); assert!(load(&path).is_err()); - std::fs::remove_file(&path).ok(); } #[test] @@ -89,13 +117,11 @@ fn rank_v1_fixture_bytes_are_stable() { IndexParams::Rank, expected.len() as u64, ); - std::fs::remove_file(&path).ok(); write_bytes(&path, &expected); let loaded = Rank::load(&path).unwrap(); assert_eq!(loaded.dim(), 4); assert_eq!(loaded.len(), 1); - std::fs::remove_file(&path).ok(); 
assert_rejects_version_and_trailing_bytes("rank", &expected, |path| Rank::load(path)); } @@ -120,14 +146,12 @@ fn rankquant_v1_fixture_bytes_are_stable() { IndexParams::RankQuant { bits: 2 }, expected.len() as u64, ); - std::fs::remove_file(&path).ok(); write_bytes(&path, &expected); let loaded = RankQuant::load(&path).unwrap(); assert_eq!(loaded.dim(), 8); assert_eq!(loaded.len(), 1); assert_eq!(loaded.bits(), 2); - std::fs::remove_file(&path).ok(); assert_rejects_version_and_trailing_bytes("rankquant", &expected, |path| RankQuant::load(path)); } @@ -153,14 +177,12 @@ fn bitmap_v1_fixture_bytes_are_stable() { IndexParams::Bitmap { n_top: 2 }, expected.len() as u64, ); - std::fs::remove_file(&path).ok(); write_bytes(&path, &expected); let loaded = Bitmap::load(&path).unwrap(); assert_eq!(loaded.dim(), 64); assert_eq!(loaded.len(), 1); assert_eq!(loaded.n_top(), 2); - std::fs::remove_file(&path).ok(); assert_rejects_version_and_trailing_bytes("bitmap", &expected, |path| Bitmap::load(path)); } @@ -188,13 +210,11 @@ fn sign_bitmap_v1_fixture_bytes_are_stable() { IndexParams::SignBitmap, expected.len() as u64, ); - std::fs::remove_file(&path).ok(); write_bytes(&path, &expected); let loaded = SignBitmap::load(&path).unwrap(); assert_eq!(loaded.dim(), 64); assert_eq!(loaded.len(), 1); - std::fs::remove_file(&path).ok(); assert_rejects_version_and_trailing_bytes("sign_bitmap", &expected, |path| { SignBitmap::load(path) From aa7969dfbb9a069d909bd961724401c1278aa1ca Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Wed, 3 Jun 2026 11:08:50 -0500 Subject: [PATCH 3/3] Avoid clock-based persistence test temp names Signed-off-by: Nelson Spence --- tests/persistence_compat.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/persistence_compat.rs b/tests/persistence_compat.rs index 34cf6ad..40684e3 100644 --- a/tests/persistence_compat.rs +++ b/tests/persistence_compat.rs @@ -6,9 +6,12 @@ use std::io::Write; use std::ops::Deref; use 
std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicU64, Ordering}; use ordvec::{probe_index_metadata, Bitmap, IndexKind, IndexParams, Rank, RankQuant, SignBitmap}; +static NEXT_TMP_ID: AtomicU64 = AtomicU64::new(0); + struct TempFile { path: PathBuf, } @@ -34,10 +37,7 @@ impl Drop for TempFile { } fn tmp(name: &str) -> TempFile { - let nonce = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_default() - .as_nanos(); + let nonce = NEXT_TMP_ID.fetch_add(1, Ordering::Relaxed); let path = std::env::temp_dir().join(format!( "ordvec_persistence_compat_{}_{}_{}.bin", name,