diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a93f723..2c61864 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -84,7 +84,7 @@ jobs: - name: install cbindgen env: RUSTFLAGS: "" - run: cargo install cbindgen --version 0.29.2 --locked + run: cargo install cbindgen --version 0.29.3 --locked - name: cbindgen --verify run: cbindgen ordvec-ffi --config ordvec-ffi/cbindgen.toml --output ordvec-ffi/include/ordvec.h --verify - name: cargo build -p ordvec-ffi diff --git a/docs/c-api.md b/docs/c-api.md index 568f0d4..a2f3fd7 100644 --- a/docs/c-api.md +++ b/docs/c-api.md @@ -116,6 +116,17 @@ Search is synchronous. Caller pointers are borrowed only for the duration of `ordvec_index_search`; no query, candidate, hit, stats, or path pointer is retained after the function returns. +`ordvec_index_load` takes a non-null, NUL-terminated, valid UTF-8 path string. +Invalid UTF-8 paths return `ORDVEC_STATUS_BAD_ARGUMENT` in ABI v1. + +`ordvec_index_probe` is the metadata-only inspection path for C and Go callers. +It takes the same UTF-8 path contract as `ordvec_index_load` and fills +`ordvec_index_info_t` without returning an index handle or allocating payload +rows. The probe validates the fixed header, declared dimensions, payload byte +count, and exact file length. It does not validate row payload invariants; +call `ordvec_index_load` when the caller needs a searchable handle and full +loader validation. + Rows are internal row ordinals. ABI v1 has no external ID map: `ordvec_hit_t.id` is always equal to `ordvec_hit_t.row_id` widened to `uint64_t`. diff --git a/ordvec-ffi/include/ordvec.h b/ordvec-ffi/include/ordvec.h index 9627727..87ab16a 100644 --- a/ordvec-ffi/include/ordvec.h +++ b/ordvec-ffi/include/ordvec.h @@ -3,7 +3,7 @@ #pragma once -/* Generated with cbindgen:0.29.2 */ +/* Generated with cbindgen:0.29.3 */ /* Generated by cbindgen. Do not edit by hand. 
*/ @@ -184,11 +184,27 @@ void ordvec_search_stats_init(ordvec_search_stats_t *stats); * * # Safety * - * `path` must be a non-null, NUL-terminated C string. `out` must be non-null - * and point to writable memory for one `ordvec_index_t *`. + * `path` must be a non-null, NUL-terminated, valid UTF-8 C string. `out` + * must be non-null and point to writable memory for one `ordvec_index_t *`. */ ordvec_status_t ordvec_index_load(const char *path, uint64_t flags, ordvec_index_t **out); +/** + * Probe on-disk metadata for a `.tvrq` RankQuant or `.tvbm` Bitmap index + * without loading payload rows into an index handle. + * + * This validates the fixed header, declared dimensions, payload byte count, + * and exact file length. Full row-invariant validation remains the job of + * `ordvec_index_load`. + * + * # Safety + * + * `path` must be a non-null, NUL-terminated, valid UTF-8 C string. `info_out` + * must be non-null, initialized with `ordvec_index_info_init`, and point to + * writable memory for `ordvec_index_info_t`. + */ +ordvec_status_t ordvec_index_probe(const char *path, uint64_t flags, ordvec_index_info_t *info_out); + /** * Copy metadata from a loaded index into `info_out`. 
* diff --git a/ordvec-ffi/src/lib.rs b/ordvec-ffi/src/lib.rs index 1e832f8..98eab78 100644 --- a/ordvec-ffi/src/lib.rs +++ b/ordvec-ffi/src/lib.rs @@ -10,7 +10,7 @@ use std::path::Path; use std::ptr; use std::time::Instant; -use ordvec::{Bitmap, RankQuant}; +use ordvec::{probe_index_metadata, Bitmap, IndexKind, IndexMetadata, IndexParams, RankQuant}; pub type ordvec_status_t = u32; pub type ordvec_index_kind_t = u32; @@ -362,6 +362,38 @@ fn info_for_handle(handle: &IndexHandle) -> ordvec_index_info_t { info } +fn info_for_metadata(meta: &IndexMetadata) -> Result<ordvec_index_info_t, FfiError> { + let mut info = default_info(); + info.kind = + match meta.kind { + IndexKind::RankQuant => ORDVEC_INDEX_KIND_RANK_QUANT, + IndexKind::Bitmap => ORDVEC_INDEX_KIND_BITMAP, + IndexKind::Rank | IndexKind::SignBitmap => return Err(FfiError::new( + ORDVEC_STATUS_UNSUPPORTED_FORMAT, + "ABI v1 supports metadata probes only for TVRQ RankQuant and TVBM Bitmap indexes", + )), + }; + info.format_version = u32::from(meta.format_version); + info.dim = meta.dim as u64; + info.vector_count = meta.vector_count as u64; + info.bytes_per_vec = meta.bytes_per_vec as u64; + info.source_file_size_bytes = meta.file_size_bytes; + match meta.params { + IndexParams::RankQuant { bits } => { + info.bit_width = u32::from(bits); + } + IndexParams::Bitmap { n_top } => { + info.n_top = n_top as u32; + } + IndexParams::Rank | IndexParams::SignBitmap => {} + } + info.capabilities = ORDVEC_CAP_FULL_SEARCH + | ORDVEC_CAP_SUBSET_SEARCH + | ORDVEC_CAP_STATS + | ORDVEC_CAP_ID_EQUALS_ROW_ID; + Ok(info) +} + fn copy_hits(scores: &[f32], indices: &[i64], hits_out: *mut ordvec_hit_t) { debug_assert_eq!(scores.len(), indices.len()); for (slot, (&score, &row)) in scores.iter().zip(indices).enumerate() { @@ -643,8 +675,8 @@ pub unsafe extern "C" fn ordvec_search_stats_init(stats: *mut ordvec_search_stat /// /// # Safety /// -/// `path` must be a non-null, NUL-terminated C string.
`out` must be non-null -/// and point to writable memory for one `ordvec_index_t *`. +/// `path` must be a non-null, NUL-terminated, valid UTF-8 C string. `out` +/// must be non-null and point to writable memory for one `ordvec_index_t *`. pub unsafe extern "C" fn ordvec_index_load( path: *const c_char, flags: u64, @@ -720,6 +752,70 @@ pub unsafe extern "C" fn ordvec_index_load( }) } +#[no_mangle] +/// Probe on-disk metadata for a `.tvrq` RankQuant or `.tvbm` Bitmap index +/// without loading payload rows into an index handle. +/// +/// This validates the fixed header, declared dimensions, payload byte count, +/// and exact file length. Full row-invariant validation remains the job of +/// `ordvec_index_load`. +/// +/// # Safety +/// +/// `path` must be a non-null, NUL-terminated, valid UTF-8 C string. `info_out` +/// must be non-null, initialized with `ordvec_index_info_init`, and point to +/// writable memory for `ordvec_index_info_t`. +pub unsafe extern "C" fn ordvec_index_probe( + path: *const c_char, + flags: u64, + info_out: *mut ordvec_index_info_t, +) -> ordvec_status_t { + ffi_boundary(|| { + if path.is_null() { + return Err(FfiError::new( + ORDVEC_STATUS_NULL_POINTER, + "path pointer is NULL", + )); + } + if info_out.is_null() { + return Err(FfiError::new( + ORDVEC_STATUS_NULL_POINTER, + "info_out pointer is NULL", + )); + } + if flags != 0 { + return Err(FfiError::new( + ORDVEC_STATUS_BAD_ARGUMENT, + format!("unknown probe flags: {flags}"), + )); + } + // SAFETY: info_out is non-null; read only the leading struct_size + // field before overwriting the full output struct. + let info_size = unsafe { ptr::addr_of!((*info_out).struct_size).read() }; + check_exact_size( + info_size, + std::mem::size_of::<ordvec_index_info_t>(), + "ordvec_index_info_t", + )?; + // SAFETY: path is a non-null NUL-terminated C string by caller contract.
+ let path = unsafe { CStr::from_ptr(path) }; + let path = path.to_str().map_err(|_| { + FfiError::new( + ORDVEC_STATUS_BAD_ARGUMENT, + "path must be valid UTF-8 in ABI v1", + ) + })?; + let meta = + probe_index_metadata(path).map_err(|err| io_to_ffi(err, "probe index metadata"))?; + let info = info_for_metadata(&meta)?; + // SAFETY: info_out is non-null and points to writable output storage. + unsafe { + ptr::write(info_out, info); + } + Ok(()) + }) +} + #[no_mangle] /// Copy metadata from a loaded index into `info_out`. /// @@ -958,6 +1054,32 @@ mod tests { std::fs::remove_file(path).ok(); } + #[test] + fn probe_rankquant_metadata_without_loading() { + let path = make_rankquant_fixture(); + let cpath = CString::new(path.to_str().unwrap()).unwrap(); + unsafe { + let mut info = default_info(); + assert_eq!( + ordvec_index_probe(cpath.as_ptr(), 0, &mut info), + ORDVEC_STATUS_OK + ); + assert_eq!(info.kind, ORDVEC_INDEX_KIND_RANK_QUANT); + assert_eq!(info.format_version, 1); + assert_eq!(info.dim, 16); + assert_eq!(info.bit_width, 2); + assert_eq!(info.n_top, 0); + assert_eq!(info.vector_count, 4); + assert_eq!(info.bytes_per_vec, 4); + assert!(info.source_file_size_bytes > 0); + assert_eq!( + info.capabilities & ORDVEC_CAP_SUBSET_SEARCH, + ORDVEC_CAP_SUBSET_SEARCH + ); + } + std::fs::remove_file(path).ok(); + } + #[test] fn full_and_subset_search_rankquant() { let path = make_rankquant_fixture(); diff --git a/ordvec-ffi/tests/c_link_smoke.rs b/ordvec-ffi/tests/c_link_smoke.rs index 5ed967c..8e93224 100644 --- a/ordvec-ffi/tests/c_link_smoke.rs +++ b/ordvec-ffi/tests/c_link_smoke.rs @@ -108,19 +108,27 @@ fn c_program_links_and_runs_against_static_library() { #include "ordvec.h" int main(void) {{ - ordvec_index_t *idx = 0; - ordvec_status_t st = ordvec_index_load({fixture}, 0, &idx); + ordvec_index_info_t probed; + ordvec_index_info_init(&probed); + ordvec_status_t st = ordvec_index_probe({fixture}, 0, &probed); if (st != ORDVEC_STATUS_OK) return 1; + if (probed.kind 
!= ORDVEC_INDEX_KIND_RANK_QUANT || probed.dim != 16 || probed.vector_count != 4) {{ + return 2; + }} + + ordvec_index_t *idx = 0; + st = ordvec_index_load({fixture}, 0, &idx); + if (st != ORDVEC_STATUS_OK) return 3; ordvec_index_info_t info; ordvec_index_info_init(&info); if (ordvec_index_info(idx, &info) != ORDVEC_STATUS_OK) {{ ordvec_index_free(idx); - return 2; + return 4; }} if (info.kind != ORDVEC_INDEX_KIND_RANK_QUANT || info.dim != 16 || info.vector_count != 4) {{ ordvec_index_free(idx); - return 3; + return 5; }} float q[16] = {{0}}; @@ -137,9 +145,9 @@ int main(void) {{ st = ordvec_index_search(idx, &p, hits, 2, &returned, &stats); ordvec_index_free(idx); - if (st != ORDVEC_STATUS_OK) return 4; - if (returned > 2) return 5; - if (stats.returned_count != returned) return 6; + if (st != ORDVEC_STATUS_OK) return 6; + if (returned > 2) return 7; + if (stats.returned_count != returned) return 8; return 0; }} "#, diff --git a/ordvec-go/ordvec.go b/ordvec-go/ordvec.go index 1c4f080..c5bd89b 100644 --- a/ordvec-go/ordvec.go +++ b/ordvec-go/ordvec.go @@ -70,6 +70,13 @@ const ( KindBitmap Kind = C.ORDVEC_INDEX_KIND_BITMAP ) +const ( + CapFullSearch uint64 = C.ORDVEC_CAP_FULL_SEARCH + CapSubsetSearch uint64 = C.ORDVEC_CAP_SUBSET_SEARCH + CapStats uint64 = C.ORDVEC_CAP_STATS + CapIDEqualsRowID uint64 = C.ORDVEC_CAP_ID_EQUALS_ROW_ID +) + var ErrClosed = errors.New("ordvec: index closed") type StatusError struct { @@ -172,6 +179,24 @@ func callStatus(fn func() C.ordvec_status_t) error { return statusError(st) } +func Probe(path string) (Info, error) { + if strings.IndexByte(path, 0) >= 0 { + return Info{}, errors.New("ordvec: path contains null byte") + } + cpath := C.CString(path) + defer C.free(unsafe.Pointer(cpath)) + + var ci C.ordvec_index_info_t + C.ordvec_index_info_init(&ci) + err := callStatus(func() C.ordvec_status_t { + return C.ordvec_index_probe(cpath, 0, &ci) + }) + if err != nil { + return Info{}, err + } + return infoFromC(ci), nil +} + func Load(path 
string) (*Index, error) { if strings.IndexByte(path, 0) >= 0 { return nil, errors.New("ordvec: path contains null byte") @@ -232,6 +257,10 @@ func (idx *Index) infoLocked() (Info, error) { if err != nil { return Info{}, err } + return infoFromC(ci), nil +} + +func infoFromC(ci C.ordvec_index_info_t) Info { return Info{ Kind: Kind(ci.kind), FormatVersion: uint32(ci.format_version), @@ -242,7 +271,7 @@ func (idx *Index) infoLocked() (Info, error) { BytesPerVec: uint64(ci.bytes_per_vec), SourceFileSizeBytes: uint64(ci.source_file_size_bytes), Capabilities: uint64(ci.capabilities), - }, nil + } } func (idx *Index) Search(query []float32, k uint64, opts *SearchOptions) ([]Hit, Stats, error) { diff --git a/ordvec-go/ordvec_test.go b/ordvec-go/ordvec_test.go index f579d3f..c775b06 100644 --- a/ordvec-go/ordvec_test.go +++ b/ordvec-go/ordvec_test.go @@ -101,6 +101,10 @@ func TestLoadInfoSearchRankQuant(t *testing.T) { if info.Kind != KindRankQuant || info.Dim != 16 || info.BitWidth != 2 || info.VectorCount != 4 { t.Fatalf("unexpected info: %+v", info) } + wantCaps := CapFullSearch | CapSubsetSearch | CapStats | CapIDEqualsRowID + if info.Capabilities&wantCaps != wantCaps { + t.Fatalf("missing capabilities: got %#x want all %#x", info.Capabilities, wantCaps) + } hits, stats, err := idx.Search(query16(), 2, &SearchOptions{UserTag: 99}) if err != nil { @@ -117,6 +121,34 @@ func TestLoadInfoSearchRankQuant(t *testing.T) { } } +func TestProbeRankQuantInfo(t *testing.T) { + path := writeRankQuantFixture(t) + + probed, err := Probe(path) + if err != nil { + t.Fatal(err) + } + if probed.Kind != KindRankQuant || probed.Dim != 16 || probed.BitWidth != 2 || probed.VectorCount != 4 { + t.Fatalf("unexpected probed info: %+v", probed) + } + if probed.BytesPerVec != 4 || probed.SourceFileSizeBytes == 0 { + t.Fatalf("unexpected probed byte metadata: %+v", probed) + } + + idx, err := Load(path) + if err != nil { + t.Fatal(err) + } + defer idx.Close() + loaded, err := idx.Info() + if err != 
nil { + t.Fatal(err) + } + if probed != loaded { + t.Fatalf("probe/load metadata mismatch: probe=%+v load=%+v", probed, loaded) + } +} + func TestRankQuantSubsetSearchOrdersByRowID(t *testing.T) { idx, err := Load(writeRankQuantFixture(t)) if err != nil { diff --git a/ordvec-python/src/lib.rs b/ordvec-python/src/lib.rs index d86cb38..82a129e 100644 --- a/ordvec-python/src/lib.rs +++ b/ordvec-python/src/lib.rs @@ -906,6 +906,33 @@ impl Bitmap { Ok((scores, indices)) } + /// Search one query against a caller-supplied subset of document IDs. + /// + /// `doc_ids` are global row ordinals. They may be unsorted and may contain + /// duplicates; each entry is scored independently, so duplicate IDs can + /// produce duplicate hits. Results are ordered by bitmap-overlap descending, + /// then row ID ascending, matching the Rust core tie policy. + fn search_subset<'py>( + &self, + py: Python<'py>, + query: &Bound<'py, PyAny>, + doc_ids: &Bound<'py, PyAny>, + k: usize, + ) -> PyResult> { + let query = as_f32_1d(query, Some(self.inner.dim()))?; + let q = query.as_array(); + let q_slice = q.as_slice().ok_or_else(|| { + pyo3::exceptions::PyValueError::new_err( + "array must be C-contiguous; call np.ascontiguousarray() first", + ) + })?; + let ids = as_u32_ids_1d(doc_ids, "doc id")?; + let ids_slice = ids.as_slice()?; + check_ids_in_range(ids_slice, self.inner.len(), "doc id")?; + let (scores, out_ids) = py.detach(|| self.inner.search_subset(q_slice, ids_slice, k)); + Ok((scores.into_pyarray(py), out_ids.into_pyarray(py))) + } + /// Return top-`m` candidate doc IDs for a single query as a 1-D `uint32` /// array. Used as the candidate generator for two-stage retrieval (bitmap /// probe → exact RankQuant rerank). 
This is a fixed-budget shortlist over diff --git a/ordvec-python/tests/test_bitmap.py b/ordvec-python/tests/test_bitmap.py index 0aee255..f632b7f 100644 --- a/ordvec-python/tests/test_bitmap.py +++ b/ordvec-python/tests/test_bitmap.py @@ -46,6 +46,67 @@ def test_search_shape(): assert indices.shape == (4, 10) +def test_search_subset_matches_full_when_candidates_eq_all(): + vectors = unit_vectors(40, 128, seed=0) + idx = Bitmap(dim=128, n_top=32) + idx.add(vectors) + + query = unit_vectors(1, 128, seed=99)[0] + candidates = np.arange(40, dtype=np.uint32) + subset_scores, subset_ids = idx.search_subset(query, candidates, k=10) + + full_scores, full_ids = idx.search(query[None, :], k=10) + np.testing.assert_array_equal(subset_ids, full_ids[0]) + np.testing.assert_array_equal(subset_scores, full_scores[0]) + + +def test_search_subset_allows_unsorted_duplicates_and_ties_by_row_id(): + vectors = np.ones((12, 64), dtype=np.float32) + idx = Bitmap(dim=64, n_top=16) + idx.add(vectors) + + scores, ids = idx.search_subset( + np.zeros(64, dtype=np.float32), + np.array([9, 3, 3, 1], dtype=np.uint32), + k=3, + ) + + np.testing.assert_array_equal(ids, np.array([1, 3, 3], dtype=np.int64)) + assert scores.dtype == np.float32 + np.testing.assert_array_equal(scores, np.full(3, scores[0], dtype=np.float32)) + + +def test_search_subset_validates_doc_ids(): + idx = Bitmap(dim=128, n_top=32) + idx.add(unit_vectors(10, 128)) + q = unit_vectors(1, 128, seed=1)[0] + + with pytest.raises(IndexError): + idx.search_subset(q, np.array([0, 99], dtype=np.uint32), k=2) + with pytest.raises(ValueError, match="out of range"): + idx.search_subset(q, np.array([-1], dtype=np.int64), k=1) + with pytest.raises(TypeError, match="integer"): + idx.search_subset(q, np.array([0.0], dtype=np.float32), k=1) + with pytest.raises(ValueError, match="finite"): + idx.search_subset(np.full(128, np.nan, dtype=np.float32), np.array([0]), k=1) + with pytest.raises(ValueError, match="dim"): + 
idx.search_subset(np.zeros(64, dtype=np.float32), np.array([0]), k=1) + + +def test_search_subset_accepts_strided_int64_doc_ids_and_caps_k(): + vectors = unit_vectors(10, 128, seed=2) + idx = Bitmap(dim=128, n_top=32) + idx.add(vectors) + q = unit_vectors(1, 128, seed=3)[0] + + doc_ids = np.arange(10, dtype=np.int64)[::2] + scores, ids = idx.search_subset(q, doc_ids, k=99) + + assert scores.shape == (5,) + assert ids.shape == (5,) + assert set(ids.tolist()).issubset(set(doc_ids.tolist())) + + def test_top_m_candidates_shape_and_dtype(): idx = Bitmap(dim=128, n_top=32) idx.add(unit_vectors(50, 128))