diff --git a/Cargo.lock b/Cargo.lock index 89bb24dd5..118d965ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -822,6 +822,24 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "experimental-multi-vector-bench" +version = "0.45.0" +dependencies = [ + "anyhow", + "diskann-benchmark-runner", + "diskann-linalg", + "diskann-quantization", + "diskann-utils", + "diskann-vector", + "diskann-wide", + "rand 0.9.2", + "serde", + "serde_json", + "tempfile", + "thiserror 2.0.17", +] + [[package]] name = "faer" version = "0.23.2" diff --git a/Cargo.toml b/Cargo.toml index e71dff557..24d6428e4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,8 @@ members = [ "diskann-benchmark-simd", "diskann-benchmark", "diskann-tools", + # Experimental + "experimental-multi-vector-bench", ] default-members = [ @@ -62,6 +64,8 @@ diskann-label-filter = { path = "diskann-label-filter", version = "0.45.0" } diskann-benchmark-runner = { path = "diskann-benchmark-runner", version = "0.45.0" } diskann-benchmark-core = { path = "diskann-benchmark-core", version = "0.45.0" } diskann-tools = { path = "diskann-tools", version = "0.45.0" } +# Experimental +experimental-multi-vector-bench = { path = "experimental-multi-vector-bench", version = "0.45.0" } # External dependencies (shared versions) anyhow = "1.0.98" diff --git a/diskann-quantization/src/multi_vector/matrix.rs b/diskann-quantization/src/multi_vector/matrix.rs index d834784f2..201a4f4e2 100644 --- a/diskann-quantization/src/multi_vector/matrix.rs +++ b/diskann-quantization/src/multi_vector/matrix.rs @@ -495,7 +495,13 @@ impl Mat { } } - pub(crate) unsafe fn get_row_unchecked(&self, i: usize) -> T::Row<'_> { + /// Returns the i-th row without bounds checking. + /// + /// # Safety + /// + /// `i` must be less than `self.num_vectors()`. + #[inline] + pub unsafe fn get_row_unchecked(&self, i: usize) -> T::Row<'_> { // SAFETY: Caller must ensure i < self.num_vectors(). 
The constructors for this type // ensure that `ptr` is compatible with `T`. unsafe { self.repr.get_row(self.ptr, i) } @@ -581,6 +587,17 @@ impl<T: Copy> Mat<Standard<T>> { pub fn vector_dim(&self) -> usize { self.repr.ncols() } + + /// Returns the underlying data as a contiguous slice. + /// + /// The data is stored in row-major order: `[row0_col0, row0_col1, ..., row0_colN, row1_col0, ...]`. + #[inline] + pub fn as_slice(&self) -> &[T] { + let len = self.repr.nrows() * self.repr.ncols(); + // SAFETY: Standard representation guarantees contiguous row-major layout. + // The Mat was constructed with valid data of the correct length. + unsafe { std::slice::from_raw_parts(self.ptr.as_ptr().cast::<T>(), len) } + } } //////////// @@ -651,7 +668,7 @@ impl<'a, T: Repr> MatRef<'a, T> { /// /// `i` must be less than `self.num_vectors()`. #[inline] - pub(crate) unsafe fn get_row_unchecked(&self, i: usize) -> T::Row<'_> { + pub unsafe fn get_row_unchecked(&self, i: usize) -> T::Row<'_> { // SAFETY: Caller must ensure i < self.num_vectors(). unsafe { self.repr.get_row(self.ptr, i) } } @@ -683,6 +700,17 @@ impl<'a, T: Copy> MatRef<'a, Standard<T>> { pub fn vector_dim(&self) -> usize { self.repr.ncols() } + + /// Returns the underlying data as a contiguous slice. + /// + /// The data is stored in row-major order: `[row0_col0, row0_col1, ..., row0_colN, row1_col0, ...]`. + #[inline] + pub fn as_slice(&self) -> &[T] { + let len = self.repr.nrows() * self.repr.ncols(); + // SAFETY: Standard representation guarantees contiguous row-major layout. + // The MatRef was constructed with valid data of the correct length. + unsafe { std::slice::from_raw_parts(self.ptr.as_ptr().cast::<T>(), len) } + } } // Reborrow: Mat -> MatRef @@ -784,7 +812,7 @@ impl<'a, T: ReprMut> MatMut<'a, T> { /// /// `i` must be less than `self.num_vectors()`. 
#[inline] - pub(crate) unsafe fn get_row_unchecked(&self, i: usize) -> T::Row<'_> { + pub unsafe fn get_row_unchecked(&self, i: usize) -> T::Row<'_> { // SAFETY: Caller must ensure i < self.num_vectors(). unsafe { self.repr.get_row(self.ptr, i) } } diff --git a/experimental-multi-vector-bench/Cargo.toml b/experimental-multi-vector-bench/Cargo.toml new file mode 100644 index 000000000..e4a234f15 --- /dev/null +++ b/experimental-multi-vector-bench/Cargo.toml @@ -0,0 +1,35 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. +[package] +name = "experimental-multi-vector-bench" +edition.workspace = true +version.workspace = true +authors.workspace = true +description = "Experimental multi-vector benchmarking support for DiskANN" +documentation.workspace = true +license.workspace = true + +[[bin]] +name = "multivec-bench" +path = "src/bin/multivec_bench.rs" + +[dependencies] +diskann-linalg.workspace = true +diskann-utils.workspace = true +diskann-quantization.workspace = true +diskann-vector.workspace = true +diskann-wide.workspace = true + +# Benchmark dependencies +anyhow.workspace = true +diskann-benchmark-runner.workspace = true +rand.workspace = true +serde = { workspace = true, features = ["derive"] } +serde_json.workspace = true +thiserror.workspace = true + +[dev-dependencies] +tempfile.workspace = true + +[lints] +workspace = true diff --git a/experimental-multi-vector-bench/README.md b/experimental-multi-vector-bench/README.md new file mode 100644 index 000000000..a5785008e --- /dev/null +++ b/experimental-multi-vector-bench/README.md @@ -0,0 +1,135 @@ +# experimental-multi-vector-bench + +Experimental multi-vector benchmarking support for DiskANN, enabling late interaction retrieval with token-level embeddings. + +## Scope & Goals + +This crate is an **experimental workspace** focused on: + +1. 
**Fast Chamfer distance implementation for `f32`** - Develop and benchmark high-performance implementations of the Chamfer distance function for multi-vector representations using 32-bit floating point values. + +2. **Multiple computation approaches** - Compare naive scalar, SIMD-accelerated, transposed, tiling, and SGEMM implementations to quantify performance gains. + +3. **Benchmarking infrastructure** - Provide tooling to measure and compare different implementation strategies. + +## Current Status + +- ✅ `MultiVector` type alias for `Mat<Standard<f32>>` (row-major storage from diskann-quantization) +- ✅ `TransposedMultiVector` type for block-transposed storage (SIMD-optimized) +- ✅ `Chamfer<A>` - Generic distance calculator using Inner Product similarity +- ✅ `Chamfer<NaiveApproach>` - Scalar baseline implementation +- ✅ `Chamfer<SimdApproach>` - SIMD-accelerated implementation +- ✅ `Chamfer<TransposedApproach>` - Block-transposed SIMD with transposed documents +- ✅ `Chamfer<TransposedWithTilingApproach>` - Block-transposed SIMD with query pair tiling +- ✅ `Chamfer<QueryTransposedWithTilingApproach>` - Transposed query with doc pair tiling +- ✅ `Chamfer<SgemmApproach>` - BLAS SGEMM + SIMD row-max +- ✅ Implements `diskann_vector::DistanceFunction` trait for ecosystem compatibility +- ✅ Benchmark utility integrated with diskann-benchmark-runner + +## Usage + +```rust +use experimental_multi_vector_bench::{ + Chamfer, SimdApproach, TransposedWithTilingApproach, QueryTransposedWithTilingApproach, + MultiVector, TransposedMultiVector, Standard, +}; +use diskann_vector::DistanceFunction; + +// Create a multi-vector (3 vectors of dimension 4) +let mv = MultiVector::new(Standard::new(3, 4), 0.0f32).unwrap(); + +// Basic usage with row-major vectors (NaiveApproach or SimdApproach) +let chamfer = Chamfer::<SimdApproach>::new(); +let distance = chamfer.evaluate_similarity(&query, &document); + +// Optimized for few query tokens (≤8): transpose documents +let chamfer = Chamfer::<TransposedWithTilingApproach>::new(); +let transposed_doc = TransposedMultiVector::from(&document); +let distance = chamfer.evaluate_similarity(&query, &transposed_doc); + +// 
Optimized for many query tokens (≥16): transpose query instead +let chamfer = Chamfer::<QueryTransposedWithTilingApproach>::new(); +let transposed_query = TransposedMultiVector::from(&query); +let distance = chamfer.evaluate_similarity(&transposed_query, &document); + +// For large Q×D: use SGEMM +use experimental_multi_vector_bench::{SgemmApproach, SgemmScratch}; +let chamfer = Chamfer::<SgemmApproach>::new(); +let mut scratch = SgemmScratch::new(); +let distance = chamfer.evaluate_similarity_with_scratch(&query, &document, &mut scratch); +``` + +## Type Aliases + +This crate uses shared types from `diskann-quantization` for multi-vector representation: + +```rust +// Row-major owning matrix +pub type MultiVector = Mat<Standard<f32>>; + +// Immutable view +pub type MultiVectorRef<'a> = MatRef<'a, Standard<f32>>; +``` + +The `Standard` representation provides: + +- Contiguous row-major storage +- Direct `as_slice()` access for BLAS operations +- Zero-copy views via `MatRef` + +## Future Work + +- [ ] Add RFC based on findings for DiskANN integration +- [ ] Additional similarity measures (Cosine, SquaredL2) +- [ ] Support for additional element types (`f16`, `u8` quantized, etc.) + +## Running Benchmarks + +```bash +# Run benchmarks with example configuration +cargo run --release -p experimental-multi-vector-bench --bin multivec-bench -- run \ + --input-file experimental-multi-vector-bench/examples/bench.json \ + --output-file results.json + +# Verify correctness (all approaches should produce same checksum) +cargo run --release -p experimental-multi-vector-bench --bin multivec-bench -- run \ + --input-file experimental-multi-vector-bench/examples/verify.json \ + --output-file verify_results.json +``` + +See [examples/bench.json](examples/bench.json) for benchmark configuration format. 
+ +### Benchmark Configuration + +The benchmark supports six approaches via the `approach` field: + +- `"naive"` - Scalar baseline +- `"simd"` - SIMD-accelerated +- `"transposed_simd"` - Block-transposed SIMD +- `"transposed_with_tiling"` - Block-transposed SIMD with query pair tiling +- `"query_transposed_with_tiling"` - Transposed query with doc pair tiling +- `"sgemm"` - BLAS SGEMM + SIMD row-max + +## Module Structure + +```text +src/ +├── lib.rs # Crate root with re-exports and type aliases +├── multi_vector.rs # TransposedMultiVector type (block-transposed storage) +├── distance/ +│ ├── mod.rs # Chamfer generic struct +│ ├── naive.rs # Scalar implementation (NaiveApproach) +│ ├── simd.rs # SIMD-accelerated (SimdApproach) +│ ├── transposed.rs # Transposed docs (TransposedApproach) +│ ├── transposed_tiling.rs # Transposed docs + query tiling (TransposedWithTilingApproach) +│ ├── query_transposed_tiling.rs # Transposed query + doc tiling (QueryTransposedWithTilingApproach) +│ └── sgemm.rs # BLAS SGEMM + row-max (SgemmApproach) +└── bench/ + ├── mod.rs # Benchmark registration and dispatch + ├── input.rs # Benchmark input types + └── runner.rs # Benchmark execution logic +``` + +## Contributing + +This work is experimental and will be submitted as separate PRs. 
diff --git a/experimental-multi-vector-bench/examples/bench.json b/experimental-multi-vector-bench/examples/bench.json new file mode 100644 index 000000000..bb8dd2fda --- /dev/null +++ b/experimental-multi-vector-bench/examples/bench.json @@ -0,0 +1,95 @@ +{ + "search_directories": [], + "jobs": [ + { + "type": "multivec-op", + "content": { + "approach": "simd", + "runs": [ + { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 }, + { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 }, + { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 16 }, + { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 }, + { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 }, + { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 } + ] + } + }, + { + "type": "multivec-op", + "content": { + "approach": "transposed_simd", + "runs": [ + { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 }, + { "dim": 128, "num_points": 100, 
"loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 }, + { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 16 }, + { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 }, + { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 }, + { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 } + ] + } + }, + { + "type": "multivec-op", + "content": { + "approach": "transposed_with_tiling", + "runs": [ + { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 }, + { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 }, + { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 }, + { "dim": 256, "num_points": 100, 
"loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 16 }, + { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 }, + { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 }, + { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 } + ] + } + }, + { + "type": "multivec-op", + "content": { + "approach": "query_transposed_with_tiling", + "runs": [ + { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 }, + { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 }, + { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 16 }, + { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 }, + { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 }, + { "dim": 384, "num_points": 100, 
"loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 } + ] + } + }, + { + "type": "multivec-op", + "content": { + "approach": "sgemm", + "runs": [ + { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 }, + { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 }, + { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 }, + { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 16 }, + { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 }, + { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 }, + { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 } + ] + } + } + ] +} diff --git a/experimental-multi-vector-bench/examples/results.json b/experimental-multi-vector-bench/examples/results.json new file mode 100644 index 000000000..7a294f1e9 --- /dev/null +++ b/experimental-multi-vector-bench/examples/results.json @@ -0,0 +1,1267 @@ +[ + { + "input": { + "content": { + "approach": "simd", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 
100, + "num_query_token": 8 + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 16, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + }, + { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + }, + { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + }, + { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + ], + "verify": false + }, + "type": "multivec-op" + }, + "results": [ + { + "percentiles": { + "mean": 2619.02, + "median": 2236.5, + "p90": 3932, + "p99": 5626 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + } + }, + { + "percentiles": { + "mean": 9659.88, + "median": 9224.0, + "p90": 11661, + "p99": 16490 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + } + }, + { + "percentiles": { + "mean": 49105.44, + 
"median": 47882.0, + "p90": 57101, + "p99": 59022 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + }, + { + "percentiles": { + "mean": 5260.0, + "median": 4653.5, + "p90": 7481, + "p99": 10543 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + } + }, + { + "percentiles": { + "mean": 28620.08, + "median": 25809.0, + "p90": 36768, + "p99": 53266 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + } + }, + { + "percentiles": { + "mean": 104133.9, + "median": 101093.0, + "p90": 110701, + "p99": 139688 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + }, + { + "percentiles": { + "mean": 8787.02, + "median": 8239.0, + "p90": 9846, + "p99": 19058 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 16, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + }, + { + "percentiles": { + "mean": 8881.02, + "median": 8411.5, + "p90": 10288, + "p99": 16025 + }, + "run": { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + } + }, + { + "percentiles": { + "mean": 38982.54, + "median": 38161.5, + "p90": 41242, + "p99": 51669 + }, + "run": { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + } + }, + { + "percentiles": { + "mean": 176903.96, + "median": 171430.5, + "p90": 213961, + "p99": 235763 + }, + "run": { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + 
"num_query_token": 32 + } + } + ] + }, + { + "input": { + "content": { + "approach": "transposed_simd", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 16, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + }, + { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + }, + { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + }, + { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + ], + "verify": false + }, + "type": "multivec-op" + }, + "results": [ + { + "percentiles": { + "mean": 2583.94, + "median": 1668.0, + "p90": 5592, + "p99": 8338 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + } + }, + { + "percentiles": { + "mean": 6965.02, + "median": 6473.5, + "p90": 8112, + 
"p99": 13699 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + } + }, + { + "percentiles": { + "mean": 40871.14, + "median": 36205.5, + "p90": 65956, + "p99": 95103 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + }, + { + "percentiles": { + "mean": 3999.1, + "median": 3695.0, + "p90": 4938, + "p99": 9665 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + } + }, + { + "percentiles": { + "mean": 17661.14, + "median": 16528.5, + "p90": 21241, + "p99": 33413 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + } + }, + { + "percentiles": { + "mean": 73454.08, + "median": 71689.5, + "p90": 80504, + "p99": 92498 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + }, + { + "percentiles": { + "mean": 7163.36, + "median": 6726.0, + "p90": 8500, + "p99": 12427 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 16, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + }, + { + "percentiles": { + "mean": 6511.42, + "median": 5956.0, + "p90": 7538, + "p99": 13942 + }, + "run": { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + } + }, + { + "percentiles": { + "mean": 31328.02, + "median": 29352.0, + "p90": 36458, + "p99": 53238 + }, + "run": { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + } + }, + { + "percentiles": 
{ + "mean": 115811.66, + "median": 112375.5, + "p90": 133449, + "p99": 165565 + }, + "run": { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + } + ] + }, + { + "input": { + "content": { + "approach": "transposed_with_tiling", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 16, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + }, + { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + }, + { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + }, + { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + ], + "verify": false + }, + "type": "multivec-op" + }, + "results": [ + { + "percentiles": { + "mean": 2050.04, + "median": 1281.5, + "p90": 4716, + "p99": 5114 + }, + "run": { + 
"dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + } + }, + { + "percentiles": { + "mean": 5369.06, + "median": 4458.5, + "p90": 7185, + "p99": 20556 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + } + }, + { + "percentiles": { + "mean": 29915.1, + "median": 25710.5, + "p90": 47347, + "p99": 69910 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + }, + { + "percentiles": { + "mean": 3458.14, + "median": 2758.0, + "p90": 5144, + "p99": 10795 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + } + }, + { + "percentiles": { + "mean": 15840.54, + "median": 13289.0, + "p90": 26007, + "p99": 37759 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + } + }, + { + "percentiles": { + "mean": 65014.14, + "median": 59231.5, + "p90": 92056, + "p99": 105748 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + }, + { + "percentiles": { + "mean": 5269.28, + "median": 4641.5, + "p90": 7368, + "p99": 11010 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 16, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + }, + { + "percentiles": { + "mean": 5924.42, + "median": 5094.5, + "p90": 8496, + "p99": 14387 + }, + "run": { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + } + }, + { + "percentiles": { + "mean": 27112.44, + 
"median": 25971.5, + "p90": 32414, + "p99": 57985 + }, + "run": { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + } + }, + { + "percentiles": { + "mean": 93114.02, + "median": 88488.0, + "p90": 114377, + "p99": 126912 + }, + "run": { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + } + ] + }, + { + "input": { + "content": { + "approach": "query_transposed_with_tiling", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 16, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + }, + { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + }, + { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + }, + { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 
50, + "num_points": 100, + "num_query_token": 32 + } + ], + "verify": false + }, + "type": "multivec-op" + }, + "results": [ + { + "percentiles": { + "mean": 2737.42, + "median": 2125.0, + "p90": 3948, + "p99": 8440 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + } + }, + { + "percentiles": { + "mean": 4736.12, + "median": 3923.0, + "p90": 6618, + "p99": 14193 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + } + }, + { + "percentiles": { + "mean": 20463.8, + "median": 18123.5, + "p90": 34167, + "p99": 38949 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + }, + { + "percentiles": { + "mean": 5325.46, + "median": 4119.5, + "p90": 10821, + "p99": 15835 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + } + }, + { + "percentiles": { + "mean": 12167.86, + "median": 10770.0, + "p90": 18693, + "p99": 24051 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + } + }, + { + "percentiles": { + "mean": 38737.54, + "median": 37825.5, + "p90": 44695, + "p99": 60780 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + }, + { + "percentiles": { + "mean": 4805.74, + "median": 4073.0, + "p90": 7665, + "p99": 11767 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 16, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + }, + { + "percentiles": { + "mean": 7505.82, + "median": 6493.0, + "p90": 
9394, + "p99": 20680 + }, + "run": { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + } + }, + { + "percentiles": { + "mean": 23943.58, + "median": 22457.5, + "p90": 32867, + "p99": 36549 + }, + "run": { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + } + }, + { + "percentiles": { + "mean": 85142.66, + "median": 84216.5, + "p90": 99213, + "p99": 115430 + }, + "run": { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + } + ] + }, + { + "input": { + "content": { + "approach": "sgemm", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + }, + { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 16, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + }, + { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + }, + { + "dim": 384, + 
"loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + }, + { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + ], + "verify": false + }, + "type": "multivec-op" + }, + "results": [ + { + "percentiles": { + "mean": 2816.1, + "median": 1974.5, + "p90": 5313, + "p99": 10544 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + } + }, + { + "percentiles": { + "mean": 6950.22, + "median": 6217.0, + "p90": 8113, + "p99": 16349 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + } + }, + { + "percentiles": { + "mean": 28288.28, + "median": 25519.0, + "p90": 40668, + "p99": 71398 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + }, + { + "percentiles": { + "mean": 5934.08, + "median": 4850.5, + "p90": 12171, + "p99": 23287 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + } + }, + { + "percentiles": { + "mean": 14509.7, + "median": 13783.0, + "p90": 16174, + "p99": 26689 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + } + }, + { + "percentiles": { + "mean": 53404.38, + "median": 51651.0, + "p90": 60145, + "p99": 68631 + }, + "run": { + "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + }, + { + "percentiles": { + "mean": 5853.86, + "median": 5245.5, + "p90": 7933, + "p99": 9907 + }, + "run": { 
+ "dim": 256, + "loops_per_measurement": 10, + "num_doc_token": 16, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + }, + { + "percentiles": { + "mean": 7287.5, + "median": 6784.5, + "p90": 9495, + "p99": 13360 + }, + "run": { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 32, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 8 + } + }, + { + "percentiles": { + "mean": 23220.62, + "median": 23043.0, + "p90": 26293, + "p99": 34596 + }, + "run": { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 64, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 16 + } + }, + { + "percentiles": { + "mean": 83098.88, + "median": 79397.5, + "p90": 96292, + "p99": 141549 + }, + "run": { + "dim": 384, + "loops_per_measurement": 10, + "num_doc_token": 128, + "num_measurements": 50, + "num_points": 100, + "num_query_token": 32 + } + } + ] + } +] \ No newline at end of file diff --git a/experimental-multi-vector-bench/examples/verify.json b/experimental-multi-vector-bench/examples/verify.json new file mode 100644 index 000000000..72a4b754f --- /dev/null +++ b/experimental-multi-vector-bench/examples/verify.json @@ -0,0 +1,65 @@ +{ + "search_directories": [], + "jobs": [ + { + "type": "multivec-op", + "content": { + "approach": "naive", + "verify": true, + "runs": [ + { "dim": 128, "num_points": 10, "loops_per_measurement": 1, "num_measurements": 1, "num_query_token": 8, "num_doc_token": 32 } + ] + } + }, + { + "type": "multivec-op", + "content": { + "approach": "simd", + "verify": true, + "runs": [ + { "dim": 128, "num_points": 10, "loops_per_measurement": 1, "num_measurements": 1, "num_query_token": 8, "num_doc_token": 32 } + ] + } + }, + { + "type": "multivec-op", + "content": { + "approach": "transposed_simd", + "verify": true, + "runs": [ + { "dim": 128, "num_points": 10, "loops_per_measurement": 1, "num_measurements": 1, "num_query_token": 8, "num_doc_token": 32 } + ] + } + }, + { + 
"type": "multivec-op", + "content": { + "approach": "transposed_with_tiling", + "verify": true, + "runs": [ + { "dim": 128, "num_points": 10, "loops_per_measurement": 1, "num_measurements": 1, "num_query_token": 8, "num_doc_token": 32 } + ] + } + }, + { + "type": "multivec-op", + "content": { + "approach": "query_transposed_with_tiling", + "verify": true, + "runs": [ + { "dim": 128, "num_points": 10, "loops_per_measurement": 1, "num_measurements": 1, "num_query_token": 8, "num_doc_token": 32 } + ] + } + }, + { + "type": "multivec-op", + "content": { + "approach": "sgemm", + "verify": true, + "runs": [ + { "dim": 128, "num_points": 10, "loops_per_measurement": 1, "num_measurements": 1, "num_query_token": 8, "num_doc_token": 32 } + ] + } + } + ] +} diff --git a/experimental-multi-vector-bench/examples/verify_results.json b/experimental-multi-vector-bench/examples/verify_results.json new file mode 100644 index 000000000..f6f53131e --- /dev/null +++ b/experimental-multi-vector-bench/examples/verify_results.json @@ -0,0 +1,230 @@ +[ + { + "input": { + "content": { + "approach": "naive", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 1, + "num_doc_token": 32, + "num_measurements": 1, + "num_points": 10, + "num_query_token": 8 + } + ], + "verify": true + }, + "type": "multivec-op" + }, + "results": [ + { + "percentiles": { + "mean": 384.0, + "median": 384.0, + "p90": 384, + "p99": 384 + }, + "run": { + "dim": 128, + "loops_per_measurement": 1, + "num_doc_token": 32, + "num_measurements": 1, + "num_points": 10, + "num_query_token": 8 + }, + "sample_distance_checksum": -2764.606689453125 + } + ] + }, + { + "input": { + "content": { + "approach": "simd", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 1, + "num_doc_token": 32, + "num_measurements": 1, + "num_points": 10, + "num_query_token": 8 + } + ], + "verify": true + }, + "type": "multivec-op" + }, + "results": [ + { + "percentiles": { + "mean": 114.0, + "median": 114.0, + "p90": 114, + "p99": 114 + }, 
+ "run": { + "dim": 128, + "loops_per_measurement": 1, + "num_doc_token": 32, + "num_measurements": 1, + "num_points": 10, + "num_query_token": 8 + }, + "sample_distance_checksum": -2764.606689453125 + } + ] + }, + { + "input": { + "content": { + "approach": "transposed_simd", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 1, + "num_doc_token": 32, + "num_measurements": 1, + "num_points": 10, + "num_query_token": 8 + } + ], + "verify": true + }, + "type": "multivec-op" + }, + "results": [ + { + "percentiles": { + "mean": 47.0, + "median": 47.0, + "p90": 47, + "p99": 47 + }, + "run": { + "dim": 128, + "loops_per_measurement": 1, + "num_doc_token": 32, + "num_measurements": 1, + "num_points": 10, + "num_query_token": 8 + }, + "sample_distance_checksum": -2764.606689453125 + } + ] + }, + { + "input": { + "content": { + "approach": "transposed_with_tiling", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 1, + "num_doc_token": 32, + "num_measurements": 1, + "num_points": 10, + "num_query_token": 8 + } + ], + "verify": true + }, + "type": "multivec-op" + }, + "results": [ + { + "percentiles": { + "mean": 75.0, + "median": 75.0, + "p90": 75, + "p99": 75 + }, + "run": { + "dim": 128, + "loops_per_measurement": 1, + "num_doc_token": 32, + "num_measurements": 1, + "num_points": 10, + "num_query_token": 8 + }, + "sample_distance_checksum": -2764.606689453125 + } + ] + }, + { + "input": { + "content": { + "approach": "query_transposed_with_tiling", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 1, + "num_doc_token": 32, + "num_measurements": 1, + "num_points": 10, + "num_query_token": 8 + } + ], + "verify": true + }, + "type": "multivec-op" + }, + "results": [ + { + "percentiles": { + "mean": 93.0, + "median": 93.0, + "p90": 93, + "p99": 93 + }, + "run": { + "dim": 128, + "loops_per_measurement": 1, + "num_doc_token": 32, + "num_measurements": 1, + "num_points": 10, + "num_query_token": 8 + }, + "sample_distance_checksum": -2764.606689453125 + } + 
] + }, + { + "input": { + "content": { + "approach": "sgemm", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 1, + "num_doc_token": 32, + "num_measurements": 1, + "num_points": 10, + "num_query_token": 8 + } + ], + "verify": true + }, + "type": "multivec-op" + }, + "results": [ + { + "percentiles": { + "mean": 235.0, + "median": 235.0, + "p90": 235, + "p99": 235 + }, + "run": { + "dim": 128, + "loops_per_measurement": 1, + "num_doc_token": 32, + "num_measurements": 1, + "num_points": 10, + "num_query_token": 8 + }, + "sample_distance_checksum": -2764.606689453125 + } + ] + } +] \ No newline at end of file diff --git a/experimental-multi-vector-bench/src/bench/input.rs b/experimental-multi-vector-bench/src/bench/input.rs new file mode 100644 index 000000000..1b4b4ffee --- /dev/null +++ b/experimental-multi-vector-bench/src/bench/input.rs @@ -0,0 +1,143 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! Input types for multi-vector benchmarks. + +use serde::{Deserialize, Serialize}; +use std::num::NonZeroUsize; + +use diskann_benchmark_runner::{Any, CheckDeserialization, Checker}; + +/// Approach to use for multi-vector distance computation. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum Approach { + /// Naive scalar approach. + Naive, + /// SIMD-accelerated approach. + Simd, + /// Transposed SIMD approach using block-transposed data layout. + TransposedSimd, + /// Transposed SIMD approach with tiling optimization. + TransposedWithTiling, + /// Query-transposed SIMD approach with tiling optimization. + QueryTransposedWithTiling, + /// SGEMM-based approach using BLAS matrix multiplication (baseline). 
+ Sgemm, +} + +impl std::fmt::Display for Approach { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Naive => write!(f, "naive"), + Self::Simd => write!(f, "simd"), + Self::TransposedSimd => write!(f, "transposed_simd"), + Self::TransposedWithTiling => write!(f, "transposed_with_tiling"), + Self::QueryTransposedWithTiling => write!(f, "query_transposed_with_tiling"), + Self::Sgemm => write!(f, "sgemm"), + } + } +} + +/// A single benchmark run configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Run { + /// Dimensionality of each token embedding. + pub dim: NonZeroUsize, + /// Number of document multi-vectors to compare against. + pub num_points: NonZeroUsize, + /// Number of loops per measurement (for timing stability). + pub loops_per_measurement: NonZeroUsize, + /// Number of measurements to collect. + pub num_measurements: NonZeroUsize, + /// Number of tokens in the query multi-vector. + pub num_query_token: NonZeroUsize, + /// Number of tokens in each document multi-vector. + pub num_doc_token: NonZeroUsize, +} + +/// Input specification for multi-vector benchmarks. +#[derive(Debug, Serialize, Deserialize)] +pub struct MultiVectorOp { + /// Approach to use: naive or simd. + pub approach: Approach, + /// List of benchmark runs to execute. + pub runs: Vec, + /// Whether to compute and output distance checksum for verification. + #[serde(default)] + pub verify: bool, +} + +impl CheckDeserialization for MultiVectorOp { + fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { + Ok(()) + } +} + +macro_rules! 
write_field { + ($f:ident, $field:tt, $($expr:tt)*) => { + writeln!($f, "{:>24}: {}", $field, $($expr)*) + } +} + +impl MultiVectorOp { + pub(crate) const fn tag() -> &'static str { + "multivec-op" + } + + fn summarize_fields(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write_field!(f, "approach", self.approach)?; + write_field!(f, "number of runs", self.runs.len())?; + Ok(()) + } +} + +impl std::fmt::Display for MultiVectorOp { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Multi-Vector Operation\n")?; + write_field!(f, "tag", Self::tag())?; + self.summarize_fields(f) + } +} + +/// Input parser for multi-vector benchmarks. +#[derive(Debug)] +pub struct MultiVectorInput; + +impl diskann_benchmark_runner::Input for MultiVectorInput { + fn tag(&self) -> &'static str { + MultiVectorOp::tag() + } + + fn try_deserialize( + &self, + serialized: &serde_json::Value, + checker: &mut Checker, + ) -> anyhow::Result { + checker.any(MultiVectorOp::deserialize(serialized)?) + } + + fn example(&self) -> anyhow::Result { + const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap(); + const NUM_POINTS: NonZeroUsize = NonZeroUsize::new(100).unwrap(); + const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(5).unwrap(); + const NUM_MEASUREMENTS: NonZeroUsize = NonZeroUsize::new(10).unwrap(); + const NUM_QUERY_TOKEN: NonZeroUsize = NonZeroUsize::new(32).unwrap(); + const NUM_DOC_TOKEN: NonZeroUsize = NonZeroUsize::new(64).unwrap(); + + let runs = vec![Run { + dim: DIM, + num_points: NUM_POINTS, + loops_per_measurement: LOOPS_PER_MEASUREMENT, + num_measurements: NUM_MEASUREMENTS, + num_query_token: NUM_QUERY_TOKEN, + num_doc_token: NUM_DOC_TOKEN, + }]; + + Ok(serde_json::to_value(&MultiVectorOp { + approach: Approach::Simd, + runs, + verify: false, + })?) 
+ } +} diff --git a/experimental-multi-vector-bench/src/bench/mod.rs b/experimental-multi-vector-bench/src/bench/mod.rs new file mode 100644 index 000000000..05bfe4fe6 --- /dev/null +++ b/experimental-multi-vector-bench/src/bench/mod.rs @@ -0,0 +1,166 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! Benchmarking utilities for multi-vector distance computations. +//! +//! This module provides a diskann-benchmark-runner based benchmark framework for measuring +//! the performance of multi-vector similarity computations using both naive +//! and SIMD-accelerated approaches. + +mod input; +mod runner; + +use std::io::Write; + +use diskann_benchmark_runner::{ + describeln, + dispatcher::{self, DispatchRule, FailureScore, MatchScore}, + Any, +}; + +use crate::distance::{ + NaiveApproach, QueryTransposedWithTilingApproach, SgemmApproach, SimdApproach, + TransposedApproach, TransposedWithTilingApproach, +}; + +pub use input::{Approach, MultiVectorInput, MultiVectorOp, Run}; +use runner::{ + run_benchmark_with_approach, run_benchmark_with_query_transposed_approach, + run_benchmark_with_sgemm_approach, run_benchmark_with_transposed_approach, DisplayWrapper, +}; + +//////////////// +// Public API // +//////////////// + +/// Register multi-vector benchmarks. +pub fn register(dispatcher: &mut diskann_benchmark_runner::registry::Benchmarks) { + dispatcher.register::>("multivec-op", run_benchmark); +} + +////////////// +// Dispatch // +////////////// + +/// Kernel for multi-vector benchmarks. 
+struct MultiVectorKernel<'a> { + input: &'a MultiVectorOp, +} + +impl<'a> MultiVectorKernel<'a> { + fn new(input: &'a MultiVectorOp) -> Self { + Self { input } + } +} + +impl dispatcher::Map for MultiVectorKernel<'static> { + type Type<'a> = MultiVectorKernel<'a>; +} + +impl<'a> DispatchRule<&'a MultiVectorOp> for MultiVectorKernel<'a> { + type Error = std::convert::Infallible; + + fn try_match(_from: &&'a MultiVectorOp) -> Result { + Ok(MatchScore(0)) + } + + fn convert(from: &'a MultiVectorOp) -> Result { + Ok(Self::new(from)) + } + + fn description( + f: &mut std::fmt::Formatter<'_>, + from: Option<&&'a MultiVectorOp>, + ) -> std::fmt::Result { + match from { + None => describeln!(f, "- Multi-vector benchmark (naive or simd)"), + Some(input) => describeln!(f, "- Approach: {}", input.approach), + } + } +} + +impl<'a> DispatchRule<&'a diskann_benchmark_runner::Any> for MultiVectorKernel<'a> { + type Error = anyhow::Error; + + fn try_match(from: &&'a diskann_benchmark_runner::Any) -> Result { + from.try_match::() + } + + fn convert(from: &'a diskann_benchmark_runner::Any) -> Result { + from.convert::() + } + + fn description( + f: &mut std::fmt::Formatter<'_>, + from: Option<&&'a diskann_benchmark_runner::Any>, + ) -> std::fmt::Result { + Any::description::(f, from, MultiVectorOp::tag()) + } +} + +/////////////// +// Benchmark // +/////////////// + +fn run_benchmark( + kernel: MultiVectorKernel<'_>, + _: diskann_benchmark_runner::Checkpoint<'_>, + mut output: &mut dyn diskann_benchmark_runner::Output, +) -> Result { + writeln!(output, "{}", kernel.input)?; + + let results = match kernel.input.approach { + Approach::Naive => { + writeln!(output, "Running with Naive (scalar) approach...\n")?; + run_benchmark_with_approach::(kernel.input, kernel.input.verify, output)? + } + Approach::Simd => { + writeln!(output, "Running with SIMD (vectorized) approach...\n")?; + run_benchmark_with_approach::(kernel.input, kernel.input.verify, output)? 
+ } + Approach::TransposedSimd => { + writeln!(output, "Running with Transposed SIMD approach...\n")?; + run_benchmark_with_transposed_approach::( + kernel.input, + kernel.input.verify, + output, + )? + } + Approach::TransposedWithTiling => { + writeln!( + output, + "Running with Transposed SIMD + Tiling approach...\n" + )?; + run_benchmark_with_transposed_approach::( + kernel.input, + kernel.input.verify, + output, + )? + } + Approach::QueryTransposedWithTiling => { + writeln!( + output, + "Running with Query-Transposed SIMD + Tiling approach...\n" + )?; + run_benchmark_with_query_transposed_approach::( + kernel.input, + kernel.input.verify, + output, + )? + } + Approach::Sgemm => { + writeln!( + output, + "Running with SGEMM (BLAS matrix multiplication) approach...\n" + )?; + run_benchmark_with_sgemm_approach::( + kernel.input, + kernel.input.verify, + output, + )? + } + }; + + writeln!(output, "\n{}", DisplayWrapper(&*results))?; + Ok(serde_json::to_value(results)?) +} diff --git a/experimental-multi-vector-bench/src/bench/runner.rs b/experimental-multi-vector-bench/src/bench/runner.rs new file mode 100644 index 000000000..e193d01a4 --- /dev/null +++ b/experimental-multi-vector-bench/src/bench/runner.rs @@ -0,0 +1,393 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! Benchmark execution logic for multi-vector distance computations. + +use std::io::Write; + +use rand::{distr::StandardUniform, rngs::StdRng, SeedableRng}; +use serde::Serialize; + +use diskann_benchmark_runner::utils::{percentiles, MicroSeconds}; +use diskann_vector::DistanceFunction; + +use crate::distance::{Chamfer, SgemmScratch}; +use crate::{MultiVector, Standard, TransposedMultiVector}; + +use super::input::{MultiVectorOp, Run}; + +/// Result of a single benchmark run. +#[derive(Debug, Serialize)] +pub(super) struct RunResult { + /// The run configuration. + pub run: Run, + /// Latency percentiles. 
+ pub percentiles: percentiles::Percentiles, + /// Checksum of the first 10 distance values for verification. + /// Only populated when verify is enabled in the input. + #[serde(skip_serializing_if = "Option::is_none")] + pub sample_distance_checksum: Option, +} + +/// Display wrapper for formatting results. +pub(super) struct DisplayWrapper<'a, T: ?Sized>(pub &'a T); + +impl std::ops::Deref for DisplayWrapper<'_, T> { + type Target = T; + fn deref(&self) -> &T { + self.0 + } +} + +impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.is_empty() { + return Ok(()); + } + + let header = [ + "Dim", + "QueryTok", + "DocTok", + "Points", + "Mean(µs)", + "P90(µs)", + "P99(µs)", + ]; + + let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.len()); + + self.iter().enumerate().for_each(|(row, r)| { + let mut row = table.row(row); + + row.insert(r.run.dim, 0); + row.insert(r.run.num_query_token, 1); + row.insert(r.run.num_doc_token, 2); + row.insert(r.run.num_points, 3); + row.insert(format!("{:.2}", r.percentiles.mean), 4); + row.insert(format!("{:.2}", r.percentiles.p90.as_f64()), 5); + row.insert(format!("{:.2}", r.percentiles.p99.as_f64()), 6); + }); + + table.fmt(f) + } +} + +/// Generate a random multi-vector with the given number of tokens and dimension. +fn generate_random_multivector(rng: &mut StdRng, num_tokens: usize, dim: usize) -> MultiVector { + use rand::distr::Distribution; + + let mut mat = MultiVector::new(Standard::new(num_tokens, dim), 0.0f32).unwrap(); + for i in 0..num_tokens { + if let Some(row) = mat.get_row_mut(i) { + for val in row.iter_mut() { + *val = StandardUniform.sample(rng); + } + } + } + mat +} + +/// Run benchmark with the specified approach using the `DistanceFunction` trait. 
+pub(super) fn run_benchmark_with_approach( + input: &MultiVectorOp, + verify: bool, + mut output: &mut dyn diskann_benchmark_runner::Output, +) -> Result, anyhow::Error> +where + Chamfer: for<'a> DistanceFunction<&'a MultiVector, &'a MultiVector>, +{ + // For MultiVector docs, we need to generate directly (no conversion) + let mut results = Vec::new(); + + for run in input.runs.iter() { + let mut rng = StdRng::seed_from_u64(0x12345); + + // Generate query multi-vector (always row-major) + let query = generate_random_multivector(&mut rng, run.num_query_token.get(), run.dim.get()); + + // Generate document multi-vectors + let docs: Vec = (0..run.num_points.get()) + .map(|_| generate_random_multivector(&mut rng, run.num_doc_token.get(), run.dim.get())) + .collect(); + + let chamfer = Chamfer::::new(); + + // Collect latencies + let mut latencies = Vec::with_capacity(run.num_measurements.get()); + let mut distances = vec![0.0f32; docs.len()]; + + for _ in 0..run.num_measurements.get() { + let start = std::time::Instant::now(); + for _ in 0..run.loops_per_measurement.get() { + for (i, doc) in docs.iter().enumerate() { + distances[i] = chamfer.evaluate_similarity(&query, doc); + } + std::hint::black_box(&mut distances); + } + latencies.push(start.elapsed().into()); + } + + let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap(); + + // Compute checksum of first 10 distances for verification + let sample_distance_checksum = if verify { + Some(distances.iter().take(10).sum::()) + } else { + None + }; + + results.push(RunResult { + run: run.clone(), + percentiles, + sample_distance_checksum, + }); + + writeln!( + output, + " Completed run: dim={}, points={}", + run.dim, run.num_points + )?; + } + + Ok(results) +} + +/// Run benchmark with the transposed approach using the `DistanceFunction` trait. +/// +/// This variant uses `MultiVector` for the query and `TransposedMultiVector` for documents. 
+pub(super) fn run_benchmark_with_transposed_approach( + input: &MultiVectorOp, + verify: bool, + mut output: &mut dyn diskann_benchmark_runner::Output, +) -> Result, anyhow::Error> +where + Chamfer: for<'a> DistanceFunction<&'a MultiVector, &'a TransposedMultiVector>, +{ + let mut results = Vec::new(); + + for run in input.runs.iter() { + let mut rng = StdRng::seed_from_u64(0x12345); + + // Generate query multi-vector (always row-major) + let query = generate_random_multivector(&mut rng, run.num_query_token.get(), run.dim.get()); + + // Generate document multi-vectors and transpose them + let docs: Vec = (0..run.num_points.get()) + .map(|_| { + let mv = + generate_random_multivector(&mut rng, run.num_doc_token.get(), run.dim.get()); + TransposedMultiVector::from(&mv) + }) + .collect(); + + let chamfer = Chamfer::::new(); + + // Collect latencies + let mut latencies = Vec::with_capacity(run.num_measurements.get()); + let mut distances = vec![0.0f32; docs.len()]; + + for _ in 0..run.num_measurements.get() { + let start = std::time::Instant::now(); + for _ in 0..run.loops_per_measurement.get() { + for (i, doc) in docs.iter().enumerate() { + distances[i] = chamfer.evaluate_similarity(&query, doc); + } + std::hint::black_box(&mut distances); + } + latencies.push(start.elapsed().into()); + } + + let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap(); + + // Compute checksum of first 10 distances for verification + let sample_distance_checksum = if verify { + Some(distances.iter().take(10).sum::()) + } else { + None + }; + + results.push(RunResult { + run: run.clone(), + percentiles, + sample_distance_checksum, + }); + + writeln!( + output, + " Completed run: dim={}, points={}", + run.dim, run.num_points + )?; + } + + Ok(results) +} + +/// Run benchmark with query-transposed approach using the `DistanceFunction` trait. +/// +/// This variant uses `TransposedMultiVector` for the query and `MultiVector` for documents. 
+/// The scratch buffer is stored in the `Chamfer` itself. +pub(super) fn run_benchmark_with_query_transposed_approach( + input: &MultiVectorOp, + verify: bool, + mut output: &mut dyn diskann_benchmark_runner::Output, +) -> Result, anyhow::Error> +where + Chamfer: for<'a> DistanceFunction<&'a TransposedMultiVector, &'a MultiVector>, +{ + let mut results = Vec::new(); + + for run in input.runs.iter() { + let mut rng = StdRng::seed_from_u64(0x12345); + + // Generate query multi-vector and transpose it + let query_mv = + generate_random_multivector(&mut rng, run.num_query_token.get(), run.dim.get()); + let query = TransposedMultiVector::from(&query_mv); + + // Generate document multi-vectors (row-major) + let docs: Vec = (0..run.num_points.get()) + .map(|_| generate_random_multivector(&mut rng, run.num_doc_token.get(), run.dim.get())) + .collect(); + + // Chamfer instance holds the scratch buffer for reuse across documents + let chamfer = Chamfer::::new(); + + // Collect latencies + let mut latencies = Vec::with_capacity(run.num_measurements.get()); + let mut distances = vec![0.0f32; docs.len()]; + + for _ in 0..run.num_measurements.get() { + let start = std::time::Instant::now(); + for _ in 0..run.loops_per_measurement.get() { + for (i, doc) in docs.iter().enumerate() { + distances[i] = chamfer.evaluate_similarity(&query, doc); + } + std::hint::black_box(&mut distances); + } + latencies.push(start.elapsed().into()); + } + + let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap(); + + // Compute checksum of first 10 distances for verification + let sample_distance_checksum = if verify { + Some(distances.iter().take(10).sum::()) + } else { + None + }; + + results.push(RunResult { + run: run.clone(), + percentiles, + sample_distance_checksum, + }); + + writeln!( + output, + " Completed run: dim={}, points={}", + run.dim, run.num_points + )?; + } + + Ok(results) +} + +/// Run benchmark with SGEMM approach using pre-allocated scratch buffer. 
+///
+/// This variant uses `Chamfer::evaluate_similarity_with_scratch` to avoid
+/// allocation on the hot path. The scratch buffer is pre-allocated before the timing
+/// loop starts, ensuring fair comparison against custom SIMD approaches.
+pub(super) fn run_benchmark_with_sgemm_approach<Approach: Default>(
+    input: &MultiVectorOp,
+    verify: bool,
+    mut output: &mut dyn diskann_benchmark_runner::Output,
+) -> Result<Vec<RunResult>, anyhow::Error>
+where
+    Chamfer<Approach>: SgemmEvaluator,
+{
+    let mut results = Vec::new();
+
+    for run in input.runs.iter() {
+        let mut rng = StdRng::seed_from_u64(0x12345);
+
+        // Generate query multi-vector
+        let query = generate_random_multivector(&mut rng, run.num_query_token.get(), run.dim.get());
+
+        // Generate document multi-vectors (row-major, not transposed)
+        let docs: Vec<MultiVector> = (0..run.num_points.get())
+            .map(|_| generate_random_multivector(&mut rng, run.num_doc_token.get(), run.dim.get()))
+            .collect();
+
+        let chamfer = Chamfer::<Approach>::new();
+
+        // Pre-allocate scratch buffer BEFORE timing loop
+        // This ensures allocation time is excluded from measurements
+        let mut scratch =
+            SgemmScratch::with_capacity(run.num_query_token.get(), run.num_doc_token.get());
+
+        // Collect latencies
+        let mut latencies = Vec::with_capacity(run.num_measurements.get());
+        let mut distances = vec![0.0f32; docs.len()];
+
+        for _ in 0..run.num_measurements.get() {
+            let start = std::time::Instant::now();
+            for _ in 0..run.loops_per_measurement.get() {
+                for (i, doc) in docs.iter().enumerate() {
+                    distances[i] =
+                        chamfer.evaluate_similarity_with_scratch(&query, doc, &mut scratch);
+                }
+                std::hint::black_box(&mut distances);
+            }
+            latencies.push(start.elapsed().into());
+        }
+
+        let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap();
+
+        // Compute checksum of first 10 distances for verification
+        let sample_distance_checksum = if verify {
+            Some(distances.iter().take(10).sum::<f32>())
+        } else {
+            None
+        };
+
+        results.push(RunResult {
+            run: run.clone(),
+            percentiles,
+            sample_distance_checksum,
+        });
+
+        writeln!(
+            output,
+            " Completed run: dim={}, points={}",
+            run.dim, run.num_points
+        )?;
+    }
+
+    Ok(results)
+}
+
+/// Trait for SGEMM-based distance computation with scratch buffer support.
+///
+/// This trait is implemented by `Chamfer<SgemmApproach>` to provide the
+/// `evaluate_similarity_with_scratch` method for benchmarking.
+pub(super) trait SgemmEvaluator {
+    /// Evaluates similarity using a pre-allocated scratch buffer.
+    fn evaluate_similarity_with_scratch(
+        &self,
+        query: &MultiVector,
+        doc: &MultiVector,
+        scratch: &mut SgemmScratch,
+    ) -> f32;
+}
+
+impl SgemmEvaluator for Chamfer<SgemmApproach> {
+    fn evaluate_similarity_with_scratch(
+        &self,
+        query: &MultiVector,
+        doc: &MultiVector,
+        scratch: &mut SgemmScratch,
+    ) -> f32 {
+        self.evaluate_similarity_with_scratch(query, doc, scratch)
+    }
+}
diff --git a/experimental-multi-vector-bench/src/bin/multivec_bench.rs b/experimental-multi-vector-bench/src/bin/multivec_bench.rs
new file mode 100644
index 000000000..e2bb5c1f2
--- /dev/null
+++ b/experimental-multi-vector-bench/src/bin/multivec_bench.rs
@@ -0,0 +1,88 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! Multi-vector benchmark binary.
+//!
+//! This binary provides a CLI for benchmarking multi-vector distance computations
+//! using the diskann-benchmark-runner framework.
+//!
+//! # Usage
+//!
+//! ```bash
+//! # List available inputs
+//! multivec-bench inputs
+//!
+//! # List available benchmarks
+//! multivec-bench benchmarks
+//!
+//! # Generate example input JSON
+//! multivec-bench skeleton
+//!
+//! # Run benchmarks
+//! multivec-bench run --input examples/bench.json --output results.json
+//! 
``` + +use diskann_benchmark_runner::{output, registry, App, Output}; +use experimental_multi_vector_bench::bench::{register, MultiVectorInput}; + +pub fn main() -> anyhow::Result<()> { + let app = App::parse(); + main_inner(&app, &mut output::default()) +} + +fn main_inner(app: &App, output: &mut dyn Output) -> anyhow::Result<()> { + // Register inputs + let mut inputs = registry::Inputs::new(); + inputs.register(MultiVectorInput)?; + + // Register benchmarks + let mut benchmarks = registry::Benchmarks::new(); + register(&mut benchmarks); + + // Run the application + app.run(&inputs, &benchmarks, output) +} + +/////////// +// Tests // +/////////// + +#[cfg(test)] +mod tests { + use super::*; + + use std::path::{Path, PathBuf}; + + use diskann_benchmark_runner::app::Commands; + + fn run_integration_test(input_file: &Path, output_file: &Path) { + let commands = Commands::Run { + input_file: input_file.to_str().unwrap().into(), + output_file: output_file.to_str().unwrap().into(), + dry_run: false, + }; + + let app = App::from_commands(commands); + + let mut output = output::Memory::new(); + main_inner(&app, &mut output).unwrap(); + println!( + "output = {}", + String::from_utf8(output.into_inner()).unwrap() + ); + + assert!(output_file.exists()); + } + + #[test] + fn integration_test() { + let input_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("examples") + .join("bench.json"); + + let tempdir = tempfile::tempdir().unwrap(); + let output_path = tempdir.path().join("output.json"); + + run_integration_test(&input_path, &output_path); + } +} diff --git a/experimental-multi-vector-bench/src/distance/mod.rs b/experimental-multi-vector-bench/src/distance/mod.rs new file mode 100644 index 000000000..5b94bfa8b --- /dev/null +++ b/experimental-multi-vector-bench/src/distance/mod.rs @@ -0,0 +1,107 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! Distance computation for multi-vector representations. +//! 
+//! This module provides implementations for computing distances +//! between multi-vector representations using various approaches. +//! +//! # Approaches +//! +//! | Approach | Query Type | Doc Type | Description | +//! |----------|------------|----------|-------------| +//! | [`NaiveApproach`] | `MultiVector` | `MultiVector` | Scalar O(n²) baseline | +//! | [`SimdApproach`] | `MultiVector` | `MultiVector` | SIMD inner products | +//! | [`TransposedApproach`] | `MultiVector` | `TransposedMultiVector` | Block-transposed docs | +//! | [`TransposedWithTilingApproach`] | `MultiVector` | `TransposedMultiVector` | + Query pair tiling | +//! | [`QueryTransposedWithTilingApproach`] | `TransposedMultiVector` | `MultiVector` | Transposed query + doc pair tiling | +//! | [`SgemmApproach`] | `MultiVector` | `MultiVector` | BLAS SGEMM + SIMD row-max | +//! +//! # Choosing an Approach +//! +//! - **Few query tokens (≤8)**: Use [`TransposedWithTilingApproach`] - transposes documents +//! - **Many query tokens (≥16)**: Use [`QueryTransposedWithTilingApproach`] - transposes query +//! - **Large Q×D (≥32×128)**: Use [`SgemmApproach`] - BLAS dominates at scale (up to 4.3x vs SIMD) +//! - **Baseline/debugging**: Use [`NaiveApproach`] or [`SimdApproach`] +//! +//! # Distance Calculator +//! +//! - [`Chamfer`]: Computes asymmetric Chamfer distance using Inner Product similarity. +//! +//! # Example +//! +//! ``` +//! use experimental_multi_vector_bench::{ +//! Chamfer, TransposedWithTilingApproach, QueryTransposedWithTilingApproach, +//! MultiVector, TransposedMultiVector, Standard, +//! }; +//! use diskann_vector::DistanceFunction; +//! +//! // Create multi-vectors +//! let query = MultiVector::new(Standard::new(8, 128), 0.0f32).unwrap(); +//! let doc = MultiVector::new(Standard::new(32, 128), 0.0f32).unwrap(); +//! +//! // For queries with few tokens: transpose documents +//! let chamfer = Chamfer::::new(); +//! let transposed_doc = TransposedMultiVector::from(&doc); +//! 
let distance = chamfer.evaluate_similarity(&query, &transposed_doc);
+//!
+//! // For queries with many tokens: transpose query
+//! let chamfer = Chamfer::<QueryTransposedWithTilingApproach>::new();
+//! let transposed_query = TransposedMultiVector::from(&query);
+//! let distance = chamfer.evaluate_similarity(&transposed_query, &doc);
+//! ```
+
+mod naive;
+mod query_transposed_tiling;
+mod sgemm;
+mod simd;
+mod transposed;
+mod transposed_tiling;
+
+pub use naive::NaiveApproach;
+pub use query_transposed_tiling::QueryTransposedWithTilingApproach;
+pub use sgemm::{SgemmApproach, SgemmScratch};
+pub use simd::SimdApproach;
+pub use transposed::TransposedApproach;
+pub use transposed_tiling::TransposedWithTilingApproach;
+
+/// Chamfer aggregation strategy for multi-vector similarity using Inner Product.
+///
+/// Computes the sum of maximum similarities from each vector in `a` to vectors in `b`,
+/// negated to produce a distance metric:
+///
+/// ```text
+/// Chamfer(Q, D) = Σᵢ -maxⱼ IP(qᵢ, dⱼ)
+/// ```
+///
+/// This uses Inner Product similarity (higher = more similar), negated for
+/// compatibility with min-heap operations. Also known as asymmetric Chamfer distance.
+///
+/// # Type Parameters
+///
+/// * `Approach` - The computation approach to use:
+///   - `NaiveApproach`: Scalar baseline
+///   - `SimdApproach`: SIMD-accelerated (recommended)
+///   - `TransposedApproach`: Block-transposed SIMD for large datasets
+///   - `TransposedWithTilingApproach`: Block-transposed SIMD with tiling
+///   - `QueryTransposedWithTilingApproach`: Query-transposed SIMD with tiling and scratch buffer
+#[derive(Debug)]
+pub struct Chamfer<Approach> {
+    approach: Approach,
+}
+
+impl<Approach: Default> Default for Chamfer<Approach> {
+    fn default() -> Self {
+        Self {
+            approach: Approach::default(),
+        }
+    }
+}
+
+impl<Approach: Default> Chamfer<Approach> {
+    /// Creates a new Chamfer distance calculator.
+    pub fn new() -> Self {
+        Self::default()
+    }
+}
diff --git a/experimental-multi-vector-bench/src/distance/naive.rs b/experimental-multi-vector-bench/src/distance/naive.rs
new file mode 100644
index 000000000..68eac447a
--- /dev/null
+++ b/experimental-multi-vector-bench/src/distance/naive.rs
@@ -0,0 +1,40 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! Naive scalar implementation of multi-vector distance computation.
+
+use diskann_vector::DistanceFunction;
+
+use super::Chamfer;
+use crate::MultiVector;
+
+/// Naive O(n²) approach using scalar operations for multi-vector distance computation.
+///
+/// This approach iterates through vectors using standard scalar arithmetic.
+/// Use [`super::SimdApproach`] for better performance on supported hardware.
+///
+/// # Performance
+///
+/// This implementation is useful for:
+/// - Baseline performance comparisons
+/// - Debugging and verification
+/// - Platforms without SIMD support
+#[derive(Debug, Clone, Copy, Default)]
+pub struct NaiveApproach;
+
+impl DistanceFunction<&MultiVector, &MultiVector> for Chamfer<NaiveApproach> {
+    fn evaluate_similarity(&self, query: &MultiVector, doc: &MultiVector) -> f32 {
+        let mut score = 0.0;
+        for q_vec in query.rows() {
+            // Find max similarity (highest inner product) for this query vector
+            let mut max_similarity = f32::MIN;
+            for d_vec in doc.rows() {
+                let similarity: f32 = q_vec.iter().zip(d_vec.iter()).map(|(x, y)| x * y).sum();
+                max_similarity = max_similarity.max(similarity);
+            }
+            // Negate to convert similarity to distance (for min-heap compatibility)
+            score += -max_similarity;
+        }
+        score
+    }
+}
diff --git a/experimental-multi-vector-bench/src/distance/query_transposed_tiling.rs b/experimental-multi-vector-bench/src/distance/query_transposed_tiling.rs
new file mode 100644
index 000000000..3dad63dd2
--- /dev/null
+++ b/experimental-multi-vector-bench/src/distance/query_transposed_tiling.rs
@@ -0,0 +1,519 @@
+// Copyright 
(c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! Block-transposed SIMD implementation with query tiling for multi-vector distance computation. +//! +//! This module provides a SIMD-accelerated implementation that uses block-transposed +//! memory layout for **query** vectors (instead of documents), with documents remaining +//! in row-major format. +//! +//! # Tiling Strategy +//! +//! The key optimization is processing **pairs of document vectors together** against each +//! query block. This amortizes the cost of loading query data from memory by reusing it +//! for both document vectors simultaneously. +//! +//! # Use Case +//! +//! This approach is beneficial when: +//! - Queries are reused across multiple documents (batch scoring) +//! - Query transposition can be amortized over many document comparisons +//! - Documents are received in streaming/row-major format +//! +//! # Memory Layout +//! +//! - **Query**: Block-transposed (16 vectors per block, dimensions contiguous) +//! - **Document**: Row-major (standard MultiVector format) + +use std::cell::UnsafeCell; + +use diskann_quantization::algorithms::kmeans::BlockTranspose; +use diskann_vector::DistanceFunction; +use diskann_wide::{SIMDMinMax, SIMDMulAdd, SIMDVector}; + +use super::Chamfer; +use crate::{MultiVector, TransposedMultiVector}; + +diskann_wide::alias!(f32s = f32x8); +diskann_wide::alias!(m32s = mask_f32x8); + +/// Block-transposed SIMD approach with query tiling for Chamfer distance computation. +/// +/// This approach uses a block-transposed memory layout for **query** vectors and +/// row-major format for documents. It processes pairs of document vectors together +/// to amortize query memory loads. +/// +/// The approach holds a pre-allocated scratch buffer for storing per-query max similarities, +/// avoiding allocation on each `evaluate_similarity` call. 
+///
+/// # Algorithm
+///
+/// Computes the asymmetric Chamfer distance: `Σ_q -max_d IP(q, d)`
+///
+/// 1. Reset scratch buffer (stored in this approach) to f32::MIN
+/// 2. Process document vectors in pairs (d1, d2)
+/// 3. For each document pair, compute inner products with all query vectors
+///    using the transposed query layout
+/// 4. Update max similarities for each query vector
+/// 5. Sum and negate the max similarities
+///
+/// # Performance Characteristics
+///
+/// - **Best for**: Scenarios where queries are reused across multiple documents
+/// - **Trade-off**: Requires mutable reference to self (via UnsafeCell) for scratch buffer
+/// - **Register usage**: Optimized for AVX2 (16 YMM registers in hot loops)
+///
+/// # Example
+///
+/// ```
+/// use experimental_multi_vector_bench::{
+///     Chamfer, QueryTransposedWithTilingApproach, TransposedMultiVector, MultiVector, Standard,
+/// };
+/// use diskann_vector::DistanceFunction;
+///
+/// let query = MultiVector::new(Standard::new(16, 128), 0.0f32).unwrap();
+/// let doc = MultiVector::new(Standard::new(32, 128), 0.0f32).unwrap();
+/// let query_transposed = TransposedMultiVector::from(&query);
+///
+/// let chamfer = Chamfer::<QueryTransposedWithTilingApproach>::new();
+/// let distance = chamfer.evaluate_similarity(&query_transposed, &doc);
+/// ```
+pub struct QueryTransposedWithTilingApproach {
+    /// Pre-allocated scratch buffer for per-query max similarities.
+    /// Uses UnsafeCell for interior mutability with zero overhead.
+    scratch: UnsafeCell<Vec<f32>>,
+}
+
+impl std::fmt::Debug for QueryTransposedWithTilingApproach {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("QueryTransposedWithTilingApproach")
+            .field(
+                "scratch_capacity",
+                // SAFETY: Read-only access to get capacity. Single-threaded access assumed.
+                &unsafe { &*self.scratch.get() }.capacity(),
+            )
+            .finish()
+    }
+}
+
+impl Default for QueryTransposedWithTilingApproach {
+    fn default() -> Self {
+        Self {
+            scratch: UnsafeCell::new(Vec::new()),
+        }
+    }
+}
+
+impl QueryTransposedWithTilingApproach {
+    /// Returns a mutable reference to the scratch buffer, resized and reset to f32::MIN.
+    ///
+    /// # Safety
+    ///
+    /// This uses UnsafeCell for interior mutability. The caller must ensure
+    /// single-threaded access (this type is !Sync).
+    #[inline(always)]
+    #[allow(clippy::mut_from_ref)] // Intentional: UnsafeCell provides interior mutability
+    fn scratch_reset(&self, num_queries: usize) -> &mut [f32] {
+        // SAFETY: Single-threaded access assumed. This type is !Sync.
+        let scratch = unsafe { &mut *self.scratch.get() };
+
+        // Resize if needed, then reset to f32::MIN
+        scratch.resize(num_queries, f32::MIN);
+        scratch[..num_queries].fill(f32::MIN);
+
+        &mut scratch[..num_queries]
+    }
+}
+
+/// Block size for transposed layout (number of vectors per block).
+const N: usize = 16;
+/// Half block size for dual-register processing.
+const N2: usize = N / 2;
+
+impl DistanceFunction<&TransposedMultiVector, &MultiVector>
+    for Chamfer<QueryTransposedWithTilingApproach>
+{
+    fn evaluate_similarity(&self, query: &TransposedMultiVector, doc: &MultiVector) -> f32 {
+        let query_transposed = query.block_transposed();
+        let num_queries = query.num_vectors();
+        let num_docs = doc.num_vectors();
+
+        // Use pre-allocated scratch buffer from the approach (resets to f32::MIN)
+        let max_similarities = self.approach.scratch_reset(num_queries);
+
+        // Process pairs of document vectors together to amortize query load costs
+        for i in (0..num_docs.saturating_sub(1)).step_by(2) {
+            // SAFETY: i + 1 < num_docs ensures both indices are valid.
+ let (d1, d2) = unsafe { (doc.get_row_unchecked(i), doc.get_row_unchecked(i + 1)) }; + update_max_similarities_pair(d1, d2, query_transposed, max_similarities); + } + + // Handle odd remainder document vector + if !num_docs.is_multiple_of(2) { + // SAFETY: num_docs - 1 < num_docs ensures index is valid + update_max_similarities_single( + unsafe { doc.get_row_unchecked(num_docs - 1) }, + query_transposed, + max_similarities, + ); + } + + // Sum negated max similarities to get Chamfer distance + max_similarities.iter().map(|&s| -s).sum() + } +} + +/// Process two document vectors against all query blocks simultaneously. +/// Updates max_similarities in-place for each query vector. +/// +/// This amortizes query memory loads by reusing them for both documents. +#[inline(always)] +fn update_max_similarities_pair( + d1: &[f32], + d2: &[f32], + query: &BlockTranspose, + max_similarities: &mut [f32], +) { + // Process full blocks of 16 query vectors + for block in 0..query.full_blocks() { + let (sim_d1_lo, sim_d1_hi, sim_d2_lo, sim_d2_hi) = + compute_block_inner_products_pair(d1, d2, query, block); + + // Update max similarities for this block's query vectors + let base_idx = block * N; + update_max_from_simd_pair( + &sim_d1_lo, + &sim_d1_hi, + &sim_d2_lo, + &sim_d2_hi, + &mut max_similarities[base_idx..base_idx + N], + ); + } + + // Process remainder block if present + let remainder = query.remainder(); + if remainder != 0 { + let (sim_d1_lo, sim_d1_hi, sim_d2_lo, sim_d2_hi) = + compute_block_inner_products_pair(d1, d2, query, query.full_blocks()); + + let base_idx = query.full_blocks() * N; + update_max_from_simd_pair_masked( + &sim_d1_lo, + &sim_d1_hi, + &sim_d2_lo, + &sim_d2_hi, + &mut max_similarities[base_idx..], + remainder, + ); + } +} + +/// Compute inner products between two document vectors and 16 query vectors in a block. +/// Returns (sim_d1_lo, sim_d1_hi, sim_d2_lo, sim_d2_hi) for query vectors 0-7 and 8-15. 
+#[inline(always)] +fn compute_block_inner_products_pair( + d1: &[f32], + d2: &[f32], + query: &BlockTranspose, + block: usize, +) -> (f32s, f32s, f32s, f32s) { + debug_assert!(block < query.num_blocks()); + + // 8 accumulators total (4 per document) + let mut sim_d1_lo_a = f32s::default(diskann_wide::ARCH); + let mut sim_d1_hi_a = f32s::default(diskann_wide::ARCH); + let mut sim_d1_lo_b = f32s::default(diskann_wide::ARCH); + let mut sim_d1_hi_b = f32s::default(diskann_wide::ARCH); + let mut sim_d2_lo_a = f32s::default(diskann_wide::ARCH); + let mut sim_d2_hi_a = f32s::default(diskann_wide::ARCH); + let mut sim_d2_lo_b = f32s::default(diskann_wide::ARCH); + let mut sim_d2_hi_b = f32s::default(diskann_wide::ARCH); + + // SAFETY: block < num_blocks() ensures this access is in-bounds. + let block_ptr = unsafe { query.block_ptr_unchecked(block) }; + + let ncols = query.ncols(); + + // Process 2 dimensions at a time + for dim in (0..ncols.saturating_sub(1)).step_by(2) { + // SAFETY: For all rows in this block, 16 reads are valid. + // dim + 1 < ncols ensures all dimension accesses are in-bounds. 
+ let (q_lo_0, q_hi_0, q_lo_1, q_hi_1, d1_0, d1_1, d2_0, d2_1) = unsafe { + ( + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim + N2)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 1))), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 1) + N2)), + f32s::splat(diskann_wide::ARCH, *d1.get_unchecked(dim)), + f32s::splat(diskann_wide::ARCH, *d1.get_unchecked(dim + 1)), + f32s::splat(diskann_wide::ARCH, *d2.get_unchecked(dim)), + f32s::splat(diskann_wide::ARCH, *d2.get_unchecked(dim + 1)), + ) + }; + + // FMA for document 1 + sim_d1_lo_a = d1_0.mul_add_simd(q_lo_0, sim_d1_lo_a); + sim_d1_hi_a = d1_0.mul_add_simd(q_hi_0, sim_d1_hi_a); + sim_d1_lo_b = d1_1.mul_add_simd(q_lo_1, sim_d1_lo_b); + sim_d1_hi_b = d1_1.mul_add_simd(q_hi_1, sim_d1_hi_b); + + // FMA for document 2 + sim_d2_lo_a = d2_0.mul_add_simd(q_lo_0, sim_d2_lo_a); + sim_d2_hi_a = d2_0.mul_add_simd(q_hi_0, sim_d2_hi_a); + sim_d2_lo_b = d2_1.mul_add_simd(q_lo_1, sim_d2_lo_b); + sim_d2_hi_b = d2_1.mul_add_simd(q_hi_1, sim_d2_hi_b); + } + + // Handle remaining dimension + if !ncols.is_multiple_of(2) { + let dim = ncols - 1; + // SAFETY: dim < ncols ensures all dimension accesses are in-bounds. + // block_ptr is valid for N * ncols elements. 
+ let (q_lo, q_hi, d1_val, d2_val) = unsafe { + ( + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim + N2)), + f32s::splat(diskann_wide::ARCH, *d1.get_unchecked(dim)), + f32s::splat(diskann_wide::ARCH, *d2.get_unchecked(dim)), + ) + }; + sim_d1_lo_a = d1_val.mul_add_simd(q_lo, sim_d1_lo_a); + sim_d1_hi_a = d1_val.mul_add_simd(q_hi, sim_d1_hi_a); + sim_d2_lo_a = d2_val.mul_add_simd(q_lo, sim_d2_lo_a); + sim_d2_hi_a = d2_val.mul_add_simd(q_hi, sim_d2_hi_a); + } + + ( + sim_d1_lo_a + sim_d1_lo_b, + sim_d1_hi_a + sim_d1_hi_b, + sim_d2_lo_a + sim_d2_lo_b, + sim_d2_hi_a + sim_d2_hi_b, + ) +} + +/// Update max similarities from SIMD results for a pair of documents. +#[inline(always)] +fn update_max_from_simd_pair( + sim_d1_lo: &f32s, + sim_d1_hi: &f32s, + sim_d2_lo: &f32s, + sim_d2_hi: &f32s, + max_sims: &mut [f32], +) { + debug_assert!(max_sims.len() >= N); + + // SAFETY: max_sims.len() >= N ensures we can read/write 16 f32s (2x f32x8) + unsafe { + // Load current max values as SIMD + let current_max_lo = f32s::load_simd(diskann_wide::ARCH, max_sims.as_ptr()); + let current_max_hi = f32s::load_simd(diskann_wide::ARCH, max_sims.as_ptr().add(N2)); + + // SIMD max: max(d1, d2) then max with current + let max_from_docs_lo = sim_d1_lo.max_simd(*sim_d2_lo); + let max_from_docs_hi = sim_d1_hi.max_simd(*sim_d2_hi); + + let new_max_lo = current_max_lo.max_simd(max_from_docs_lo); + let new_max_hi = current_max_hi.max_simd(max_from_docs_hi); + + // Store back + new_max_lo.store_simd(max_sims.as_mut_ptr()); + new_max_hi.store_simd(max_sims.as_mut_ptr().add(N2)); + } +} + +/// Update max similarities from SIMD results with masking for remainder block. 
+#[inline(always)] +fn update_max_from_simd_pair_masked( + sim_d1_lo: &f32s, + sim_d1_hi: &f32s, + sim_d2_lo: &f32s, + sim_d2_hi: &f32s, + max_sims: &mut [f32], + valid_count: usize, +) { + if valid_count >= N2 { + // SIMD for full lo portion (8 elements) + // SAFETY: valid_count >= N2 ensures we have at least 8 elements + unsafe { + let current_max_lo = f32s::load_simd(diskann_wide::ARCH, max_sims.as_ptr()); + let max_from_docs_lo = sim_d1_lo.max_simd(*sim_d2_lo); + let new_max_lo = current_max_lo.max_simd(max_from_docs_lo); + new_max_lo.store_simd(max_sims.as_mut_ptr()); + } + + // Scalar for hi remainder (0-7 elements) + let hi_count = valid_count - N2; + let arr_d1_hi = sim_d1_hi.to_array(); + let arr_d2_hi = sim_d2_hi.to_array(); + for i in 0..hi_count { + let max_from_docs = arr_d1_hi[i].max(arr_d2_hi[i]); + max_sims[N2 + i] = max_sims[N2 + i].max(max_from_docs); + } + } else { + // Scalar for partial lo (1-7 elements) + let arr_d1_lo = sim_d1_lo.to_array(); + let arr_d2_lo = sim_d2_lo.to_array(); + for i in 0..valid_count { + let max_from_docs = arr_d1_lo[i].max(arr_d2_lo[i]); + max_sims[i] = max_sims[i].max(max_from_docs); + } + } +} + +/// Fallback for single document vector (odd remainder). 
+#[inline(always)] +fn update_max_similarities_single( + doc_vec: &[f32], + query: &BlockTranspose, + max_similarities: &mut [f32], +) { + // Process full blocks of 16 query vectors + for block in 0..query.full_blocks() { + let (sim_lo, sim_hi) = compute_block_inner_products_single(doc_vec, query, block); + + let base_idx = block * N; + update_max_from_simd_single( + &sim_lo, + &sim_hi, + &mut max_similarities[base_idx..base_idx + N], + ); + } + + // Process remainder block if present + let remainder = query.remainder(); + if remainder != 0 { + let (sim_lo, sim_hi) = + compute_block_inner_products_single(doc_vec, query, query.full_blocks()); + + let base_idx = query.full_blocks() * N; + update_max_from_simd_single_masked( + &sim_lo, + &sim_hi, + &mut max_similarities[base_idx..], + remainder, + ); + } +} + +/// Compute inner products between one document vector and 16 query vectors in a block. +#[inline(always)] +fn compute_block_inner_products_single( + doc_vec: &[f32], + query: &BlockTranspose, + block: usize, +) -> (f32s, f32s) { + debug_assert!(block < query.num_blocks()); + + // Use 4 accumulator registers to reduce FMA dependency chains + let mut sim_lo_a = f32s::default(diskann_wide::ARCH); + let mut sim_hi_a = f32s::default(diskann_wide::ARCH); + let mut sim_lo_b = f32s::default(diskann_wide::ARCH); + let mut sim_hi_b = f32s::default(diskann_wide::ARCH); + + // SAFETY: block < num_blocks() ensures this access is in-bounds. + let block_ptr = unsafe { query.block_ptr_unchecked(block) }; + + let ncols = query.ncols(); + + // Process 4 dimensions at a time + for dim in (0..ncols.saturating_sub(3)).step_by(4) { + // SAFETY: dim + 3 < ncols ensures all dimension accesses are in-bounds. + // block_ptr is valid for N * ncols elements. 
+ let (q_lo_0, q_hi_0, q_lo_1, q_hi_1, q_lo_2, q_hi_2, q_lo_3, q_hi_3, d0, d1, d2, d3) = unsafe { + ( + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim + N2)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 1))), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 1) + N2)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 2))), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 2) + N2)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 3))), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 3) + N2)), + f32s::splat(diskann_wide::ARCH, *doc_vec.get_unchecked(dim)), + f32s::splat(diskann_wide::ARCH, *doc_vec.get_unchecked(dim + 1)), + f32s::splat(diskann_wide::ARCH, *doc_vec.get_unchecked(dim + 2)), + f32s::splat(diskann_wide::ARCH, *doc_vec.get_unchecked(dim + 3)), + ) + }; + + // Fused multiply-add into alternating accumulators + sim_lo_a = d0.mul_add_simd(q_lo_0, sim_lo_a); + sim_hi_a = d0.mul_add_simd(q_hi_0, sim_hi_a); + sim_lo_b = d1.mul_add_simd(q_lo_1, sim_lo_b); + sim_hi_b = d1.mul_add_simd(q_hi_1, sim_hi_b); + sim_lo_a = d2.mul_add_simd(q_lo_2, sim_lo_a); + sim_hi_a = d2.mul_add_simd(q_hi_2, sim_hi_a); + sim_lo_b = d3.mul_add_simd(q_lo_3, sim_lo_b); + sim_hi_b = d3.mul_add_simd(q_hi_3, sim_hi_b); + } + + // Handle remaining dimensions + for dim in (ncols - (ncols % 4))..ncols { + // SAFETY: dim < ncols ensures all dimension accesses are in-bounds. + // block_ptr is valid for N * ncols elements. 
+ let (q_lo, q_hi, d) = unsafe { + ( + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim + N2)), + f32s::splat(diskann_wide::ARCH, *doc_vec.get_unchecked(dim)), + ) + }; + sim_lo_a = d.mul_add_simd(q_lo, sim_lo_a); + sim_hi_a = d.mul_add_simd(q_hi, sim_hi_a); + } + + (sim_lo_a + sim_lo_b, sim_hi_a + sim_hi_b) +} + +/// Update max similarities from SIMD results for a single document. +#[inline(always)] +fn update_max_from_simd_single(sim_lo: &f32s, sim_hi: &f32s, max_sims: &mut [f32]) { + debug_assert!(max_sims.len() >= N); + + // SAFETY: max_sims.len() >= N ensures we can read/write 16 f32s (2x f32x8) + unsafe { + // Load current max values as SIMD + let current_max_lo = f32s::load_simd(diskann_wide::ARCH, max_sims.as_ptr()); + let current_max_hi = f32s::load_simd(diskann_wide::ARCH, max_sims.as_ptr().add(N2)); + + // SIMD max with current + let new_max_lo = current_max_lo.max_simd(*sim_lo); + let new_max_hi = current_max_hi.max_simd(*sim_hi); + + // Store back + new_max_lo.store_simd(max_sims.as_mut_ptr()); + new_max_hi.store_simd(max_sims.as_mut_ptr().add(N2)); + } +} + +/// Update max similarities from SIMD results with masking for remainder block. 
+#[inline(always)] +fn update_max_from_simd_single_masked( + sim_lo: &f32s, + sim_hi: &f32s, + max_sims: &mut [f32], + valid_count: usize, +) { + if valid_count >= N2 { + // SIMD for full lo portion (8 elements) + // SAFETY: valid_count >= N2 ensures we have at least 8 elements + unsafe { + let current_max_lo = f32s::load_simd(diskann_wide::ARCH, max_sims.as_ptr()); + let new_max_lo = current_max_lo.max_simd(*sim_lo); + new_max_lo.store_simd(max_sims.as_mut_ptr()); + } + + // Scalar for hi remainder (0-7 elements) + let hi_count = valid_count - N2; + let arr_hi = sim_hi.to_array(); + for i in 0..hi_count { + max_sims[N2 + i] = max_sims[N2 + i].max(arr_hi[i]); + } + } else { + // Scalar for partial lo (1-7 elements) + let arr_lo = sim_lo.to_array(); + for i in 0..valid_count { + max_sims[i] = max_sims[i].max(arr_lo[i]); + } + } +} diff --git a/experimental-multi-vector-bench/src/distance/sgemm.rs b/experimental-multi-vector-bench/src/distance/sgemm.rs new file mode 100644 index 000000000..4f0880cd2 --- /dev/null +++ b/experimental-multi-vector-bench/src/distance/sgemm.rs @@ -0,0 +1,350 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! SGEMM-based Chamfer distance computation using BLAS matrix multiplication. +//! +//! This module provides a high-performance implementation that computes Chamfer distance +//! by first computing the full Q×D similarity matrix via SGEMM, then performing +//! SIMD-accelerated row-wise max reduction. +//! +//! # Backend +//! +//! This uses **faer** (pure Rust BLAS with AVX2/AVX-512 optimizations), which provides +//! excellent performance without any external dependencies. +//! +//! # Algorithm +//! +//! The Chamfer distance computation is expressed as: +//! +//! ```text +//! 1. Compute similarity matrix: S = Q × Dᵀ (dimensions: [num_query × num_doc]) +//! 2. For each query i: max_sim[i] = max_j(S[i, j]) (SIMD row-wise max) +//! 3. Chamfer distance = -Σ max_sim[i] +//! ``` +//! 
+//! # Performance Characteristics +//! +//! - **Best for large Q×D configurations** (e.g., 32 query tokens × 128 doc tokens) +//! - **1.4x–4.3x faster** than baseline SIMD depending on configuration +//! - Peak gains (3.7x–4.3x) at large Q×D (32×128) with dim ≥ 256 +//! - Leverages highly-optimized BLAS SGEMM kernels (faer) +//! - SIMD-accelerated row reduction using f32x8 vectors +//! - Pre-allocated scratch buffer avoids allocation overhead +//! +//! # Scratch Buffer +//! +//! The [`SgemmScratch`] type provides a pre-allocated buffer for the similarity matrix, +//! avoiding allocation on the hot path. This is critical for fair benchmarking. + +use diskann_linalg::Transpose; +use diskann_vector::DistanceFunction; +use diskann_wide::{SIMDMinMax, SIMDVector}; + +use super::Chamfer; +use crate::MultiVector; + +diskann_wide::alias!(f32s = f32x8); + +/// Pre-allocated scratch buffer for SGEMM-based Chamfer distance computation. +/// +/// This struct holds a reusable buffer for the Q×D similarity matrix, avoiding +/// allocation overhead during distance computation. The buffer is automatically +/// resized when needed. +/// +/// # Example +/// +/// ``` +/// use experimental_multi_vector_bench::{ +/// Chamfer, SgemmApproach, SgemmScratch, MultiVector, Standard, +/// }; +/// +/// let query = MultiVector::new(Standard::new(8, 128), 0.0f32).unwrap(); +/// let doc = MultiVector::new(Standard::new(32, 128), 0.0f32).unwrap(); +/// +/// let mut scratch = SgemmScratch::new(); +/// let chamfer = Chamfer::::new(); +/// +/// // The scratch buffer is reused across multiple distance computations +/// let distance = chamfer.evaluate_similarity_with_scratch(&query, &doc, &mut scratch); +/// ``` +#[derive(Debug, Default)] +pub struct SgemmScratch { + /// Buffer for the Q×D similarity matrix (row-major). + similarity_matrix: Vec, +} + +impl SgemmScratch { + /// Creates a new empty scratch buffer. 
+ pub fn new() -> Self { + Self::default() + } + + /// Creates a scratch buffer with pre-allocated capacity. + /// + /// # Arguments + /// + /// * `num_query` - Expected number of query tokens + /// * `num_doc` - Expected number of document tokens + pub fn with_capacity(num_query: usize, num_doc: usize) -> Self { + Self { + similarity_matrix: vec![0.0; num_query * num_doc], + } + } + + /// Ensures the buffer has sufficient capacity for the given dimensions. + /// + /// This only reallocates if the current capacity is insufficient. + #[inline] + fn ensure_capacity(&mut self, num_query: usize, num_doc: usize) { + let required = num_query * num_doc; + if self.similarity_matrix.len() < required { + self.similarity_matrix.resize(required, 0.0); + } + } + + /// Returns a mutable slice of the similarity matrix with the given dimensions. + #[inline] + fn as_mut_slice(&mut self, num_query: usize, num_doc: usize) -> &mut [f32] { + self.ensure_capacity(num_query, num_doc); + &mut self.similarity_matrix[..num_query * num_doc] + } +} + +/// SGEMM-based approach for Chamfer distance computation. +/// +/// This approach computes the full similarity matrix via BLAS SGEMM, then performs +/// row-wise max reduction. It serves as a baseline to compare against custom SIMD +/// implementations. +/// +/// # Algorithm +/// +/// 1. Compute `S = Q × Dᵀ` using SGEMM (similarity matrix) +/// 2. For each row i in S, find `max_j(S[i, j])` +/// 3. 
Sum and negate the max values to get Chamfer distance +/// +/// # Usage +/// +/// ``` +/// use experimental_multi_vector_bench::{ +/// Chamfer, SgemmApproach, SgemmScratch, MultiVector, Standard, +/// }; +/// +/// let query = MultiVector::new(Standard::new(8, 128), 0.0f32).unwrap(); +/// let doc = MultiVector::new(Standard::new(32, 128), 0.0f32).unwrap(); +/// +/// let chamfer = Chamfer::::new(); +/// let mut scratch = SgemmScratch::new(); +/// +/// let distance = chamfer.evaluate_similarity_with_scratch(&query, &doc, &mut scratch); +/// ``` +#[derive(Debug, Clone, Copy, Default)] +pub struct SgemmApproach; + +impl Chamfer { + /// Computes Chamfer distance using SGEMM with a pre-allocated scratch buffer. + /// + /// This is the recommended entry point for benchmarking, as it avoids allocation + /// overhead on the hot path. + /// + /// # Arguments + /// + /// * `query` - The query multi-vector (Q tokens × D dimensions) + /// * `doc` - The document multi-vector (N tokens × D dimensions) + /// * `scratch` - Pre-allocated scratch buffer for the similarity matrix + /// + /// # Returns + /// + /// The Chamfer distance (negated sum of max similarities). 
+ pub fn evaluate_similarity_with_scratch( + &self, + query: &MultiVector, + doc: &MultiVector, + scratch: &mut SgemmScratch, + ) -> f32 { + let num_query = query.num_vectors(); + let num_doc = doc.num_vectors(); + let dim = query.vector_dim(); + + debug_assert_eq!( + dim, + doc.vector_dim(), + "Query and document must have the same embedding dimension" + ); + + // Early return for empty inputs + if num_query == 0 || num_doc == 0 { + return 0.0; + } + + // Get similarity matrix buffer + let similarity = scratch.as_mut_slice(num_query, num_doc); + + // Compute S = Q × Dᵀ using SGEMM + // Q is [num_query × dim], D is [num_doc × dim] + // S = Q × Dᵀ = [num_query × dim] × [dim × num_doc] = [num_query × num_doc] + diskann_linalg::sgemm( + Transpose::None, // Q is not transposed + Transpose::Ordinary, // D is transposed to get Dᵀ + num_query, // m = rows in output (and Q) + num_doc, // n = cols in output (and rows in D, cols in Dᵀ) + dim, // k = cols in Q = rows in Dᵀ = cols in D + 1.0, // alpha = 1.0 + query.as_slice(), + doc.as_slice(), + None, // beta = None means overwrite C entirely + similarity, + ); + + // Row-wise max reduction, then negate and sum + // Chamfer(Q, D) = Σᵢ -maxⱼ IP(qᵢ, dⱼ) + let mut total = 0.0f32; + for i in 0..num_query { + let row_start = i * num_doc; + let row_end = row_start + num_doc; + let row = &similarity[row_start..row_end]; + + // Find max in this row using SIMD + let max_sim = simd_row_max(row); + total += max_sim; + } + + -total + } +} + +/// SIMD-accelerated row-wise maximum using f32x8 vectors. +/// +/// Processes 8 elements at a time, then reduces the SIMD vector to a scalar max. 
+#[inline] +fn simd_row_max(row: &[f32]) -> f32 { + let len = row.len(); + if len == 0 { + return f32::NEG_INFINITY; + } + + // Process full SIMD lanes (8 elements at a time) + let simd_lanes = 8; + let simd_chunks = len / simd_lanes; + let remainder = len % simd_lanes; + + let mut max_vec = f32s::splat(diskann_wide::ARCH, f32::NEG_INFINITY); + + // Main SIMD loop + let ptr = row.as_ptr(); + for i in 0..simd_chunks { + // SAFETY: i * simd_lanes + simd_lanes <= simd_chunks * simd_lanes <= len + let chunk = unsafe { f32s::load_simd(diskann_wide::ARCH, ptr.add(i * simd_lanes)) }; + max_vec = max_vec.max_simd(chunk); + } + + // Reduce SIMD vector to scalar using to_array() pattern from transposed_tiling.rs + let mut scalar_max = max_vec + .to_array() + .into_iter() + .fold(f32::NEG_INFINITY, f32::max); + + // Handle remainder elements + if remainder > 0 { + let remainder_start = simd_chunks * simd_lanes; + for j in remainder_start..len { + // SAFETY: j < len + let val = unsafe { *row.get_unchecked(j) }; + scalar_max = scalar_max.max(val); + } + } + + scalar_max +} + +// Note: We implement DistanceFunction for API compatibility, but it allocates internally. +// For benchmarking, use `evaluate_similarity_with_scratch` instead. 
+impl DistanceFunction<&MultiVector, &MultiVector> for Chamfer { + fn evaluate_similarity(&self, query: &MultiVector, doc: &MultiVector) -> f32 { + let mut scratch = SgemmScratch::with_capacity(query.num_vectors(), doc.num_vectors()); + self.evaluate_similarity_with_scratch(query, doc, &mut scratch) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::distance::NaiveApproach; + use crate::Standard; + + fn make_multi_vector(data: &[f32], rows: usize, cols: usize) -> MultiVector { + let mut mat = MultiVector::new(Standard::new(rows, cols), 0.0f32).unwrap(); + for (i, chunk) in data.chunks(cols).enumerate() { + if let Some(row) = mat.get_row_mut(i) { + row.copy_from_slice(chunk); + } + } + mat + } + + #[test] + fn test_sgemm_matches_naive() { + // Create test data + let query = make_multi_vector( + &[ + 1.0, 0.0, 0.0, // q0 + 0.0, 1.0, 0.0, // q1 + ], + 2, + 3, + ); + let doc = make_multi_vector( + &[ + 1.0, 0.0, 0.0, // d0 + 0.0, 1.0, 0.0, // d1 + 0.5, 0.5, 0.0, // d2 + ], + 3, + 3, + ); + + let naive = Chamfer::::new(); + let sgemm = Chamfer::::new(); + + let naive_result = naive.evaluate_similarity(&query, &doc); + let sgemm_result = sgemm.evaluate_similarity(&query, &doc); + + assert!( + (naive_result - sgemm_result).abs() < 1e-5, + "SGEMM result {} should match naive result {}", + sgemm_result, + naive_result + ); + } + + #[test] + fn test_sgemm_with_scratch_reuse() { + let query = make_multi_vector(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 2, 3); + let doc1 = make_multi_vector(&[1.0, 0.0, 0.0, 0.0, 1.0, 0.0], 2, 3); + let doc2 = make_multi_vector(&[0.0, 0.0, 1.0, 1.0, 1.0, 1.0], 2, 3); + + let chamfer = Chamfer::::new(); + let mut scratch = SgemmScratch::new(); + + // Compute distances reusing scratch buffer + let d1 = chamfer.evaluate_similarity_with_scratch(&query, &doc1, &mut scratch); + let d2 = chamfer.evaluate_similarity_with_scratch(&query, &doc2, &mut scratch); + + // Verify against DistanceFunction trait implementation + assert!((d1 - 
chamfer.evaluate_similarity(&query, &doc1)).abs() < 1e-5); + assert!((d2 - chamfer.evaluate_similarity(&query, &doc2)).abs() < 1e-5); + } + + #[test] + fn test_empty_inputs() { + let empty_query = make_multi_vector(&[], 0, 3); + let doc = make_multi_vector(&[1.0, 2.0, 3.0], 1, 3); + + let chamfer = Chamfer::::new(); + let mut scratch = SgemmScratch::new(); + + assert_eq!( + chamfer.evaluate_similarity_with_scratch(&empty_query, &doc, &mut scratch), + 0.0 + ); + } +} diff --git a/experimental-multi-vector-bench/src/distance/simd.rs b/experimental-multi-vector-bench/src/distance/simd.rs new file mode 100644 index 000000000..033511b81 --- /dev/null +++ b/experimental-multi-vector-bench/src/distance/simd.rs @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! SIMD-accelerated implementation of multi-vector distance computation. + +use diskann_vector::distance::InnerProduct; +use diskann_vector::{DistanceFunction, PureDistanceFunction}; + +use super::Chamfer; +use crate::MultiVector; + +/// SIMD-accelerated approach using vectorized distance computations. +/// +/// This approach leverages SIMD instructions (e.g., AVX2 on x86_64, NEON on ARM) +/// for faster distance calculations between vectors. +/// +/// # Performance +/// +/// This implementation provides significant speedups over [`super::NaiveApproach`] +/// by using hardware-accelerated vector operations for the inner distance +/// computations. Typical speedups are 5-10x depending on vector dimensions +/// and hardware capabilities. 
+#[derive(Debug, Clone, Copy, Default)] +pub struct SimdApproach; + +impl DistanceFunction<&MultiVector, &MultiVector> for Chamfer { + fn evaluate_similarity(&self, query: &MultiVector, doc: &MultiVector) -> f32 { + let mut score = 0.0; + for q_vec in query.rows() { + // InnerProduct::evaluate returns negated inner product (-dot), + // so we find the minimum (most similar = highest dot = lowest -dot) + let min_dist = doc + .rows() + .map(|d_vec| InnerProduct::evaluate(q_vec, d_vec)) + .fold(f32::MAX, f32::min); + + score += min_dist; + } + score + } +} diff --git a/experimental-multi-vector-bench/src/distance/transposed.rs b/experimental-multi-vector-bench/src/distance/transposed.rs new file mode 100644 index 000000000..db50440d2 --- /dev/null +++ b/experimental-multi-vector-bench/src/distance/transposed.rs @@ -0,0 +1,209 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! Block-transposed SIMD implementation of multi-vector distance computation. +//! +//! This module provides a SIMD-accelerated implementation that uses a block-transposed +//! memory layout for the document vectors, enabling efficient SIMD computation of +//! inner products between a row-major query and transposed document vectors. +//! +//! # Memory Layout +//! +//! The block-transposed layout groups 16 document vectors together and stores their +//! dimensions contiguously. For vectors with dimensions `[d0, d1, d2, ...]`, the +//! transposed layout stores all 16 `d0` values together, then all 16 `d1` values, etc: +//! +//! ```text +//! Standard: [v0_d0, v0_d1, ...], [v1_d0, v1_d1, ...], ... +//! Transposed: [v0_d0..v15_d0], [v0_d1..v15_d1], ... +//! ``` +//! +//! This layout enables efficient SIMD operations by loading 8 document values at once +//! (f32x8) and computing 16 inner products simultaneously using two SIMD registers. 
+ +use diskann_quantization::algorithms::kmeans::BlockTranspose; +use diskann_vector::DistanceFunction; +use diskann_wide::{SIMDMask, SIMDMinMax, SIMDMulAdd, SIMDSelect, SIMDVector}; + +use super::Chamfer; +use crate::{MultiVector, TransposedMultiVector}; + +diskann_wide::alias!(f32s = f32x8); +diskann_wide::alias!(m32s = mask_f32x8); + +/// Block-transposed SIMD approach for Chamfer distance computation. +/// +/// This approach uses a block-transposed memory layout for document vectors, +/// enabling efficient SIMD computation. The query remains in row-major format +/// for sequential iteration. +/// +/// # Algorithm +/// +/// Computes the asymmetric Chamfer distance: `Σ_q -max_d IP(q, d)` +/// +/// For each query vector `q` (row-major, sequential access): +/// 1. Process document vectors in blocks of 16 (transposed layout) +/// 2. For each dimension, broadcast `q[dim]` and multiply-add with 16 doc values +/// 3. Track maximum similarity across all 16 lanes using SIMD max operations +/// 4. Handle remainder vectors (< 16) with masked operations +/// 5. Reduce to scalar max and negate for distance +/// +/// # Performance +/// +/// This implementation is optimized for scenarios with many document tokens where +/// the block-transposed layout improves cache utilization and enables SIMD parallelism. 
+/// +/// # Example +/// +/// ``` +/// use experimental_multi_vector_bench::{ +/// Chamfer, TransposedApproach, TransposedMultiVector, MultiVector, Standard, +/// }; +/// use diskann_vector::DistanceFunction; +/// +/// let query = MultiVector::new(Standard::new(8, 128), 0.0f32).unwrap(); +/// let doc = MultiVector::new(Standard::new(32, 128), 0.0f32).unwrap(); +/// let transposed_doc = TransposedMultiVector::from(&doc); +/// +/// let chamfer = Chamfer::::new(); +/// let distance = chamfer.evaluate_similarity(&query, &transposed_doc); +/// ``` +#[derive(Debug, Clone, Copy, Default)] +pub struct TransposedApproach; + +/// Block size for transposed layout (number of vectors per block). +const N: usize = 16; +/// Half block size for dual-register processing. +const N2: usize = N / 2; + +impl DistanceFunction<&MultiVector, &TransposedMultiVector> for Chamfer { + fn evaluate_similarity(&self, query: &MultiVector, doc: &TransposedMultiVector) -> f32 { + let mut score = 0.0; + // For each query vector, find max similarity to any document vector + for query_vec in query.rows() { + score += max_inner_product_to_transposed_doc(query_vec, doc.block_transposed()); + } + score + } +} + +/// Finds the maximum inner product between `query_vec` and any vector in the transposed document. +/// +/// Returns the negated max similarity (distance = -similarity for inner product). +/// +/// # Algorithm +/// +/// 1. Process full blocks of 16 document vectors using SIMD +/// 2. For each block, compute inner products for all dimensions using FMA +/// 3. Track running maximum across all document vectors using SIMD max operations +/// 4. Handle partial remainder block with lane masking +/// 5. 
Reduce SIMD max to scalar and negate +#[inline(always)] +fn max_inner_product_to_transposed_doc(query_vec: &[f32], doc: &BlockTranspose) -> f32 { + let min_val = f32s::splat(diskann_wide::ARCH, f32::MIN); + let mut max_similarity = min_val; + + // Process full blocks (each contains exactly 16 document vectors) + for block in 0..doc.full_blocks() { + let (sim1, sim2) = compute_block_inner_products(query_vec, doc, block); + max_similarity = max_similarity.max_simd(sim1); + max_similarity = max_similarity.max_simd(sim2); + } + + // Process remainder block if present (< 16 document vectors) + let remainder = doc.remainder(); + if remainder != 0 { + let (mut sim1, mut sim2) = compute_block_inner_products(query_vec, doc, doc.full_blocks()); + + // Compute how many valid lanes in each register + let lo = remainder.min(N2); // Valid lanes in sim1 (0-8) + let hi = remainder.saturating_sub(N2); // Valid lanes in sim2 (0-8) + + // Mask invalid lanes to MIN so they never win the max comparison + sim1 = m32s::keep_first(diskann_wide::ARCH, lo).select(sim1, min_val); + sim2 = m32s::keep_first(diskann_wide::ARCH, hi).select(sim2, min_val); + + max_similarity = max_similarity.max_simd(sim1); + max_similarity = max_similarity.max_simd(sim2); + } + + // Horizontal max reduction and negate (distance = -similarity) + -max_similarity + .to_array() + .into_iter() + .fold(f32::MIN, f32::max) +} + +/// Computes inner products between `query_vec` and 16 document vectors in the specified block. +/// +/// Returns two f32x8 vectors containing similarities for document vectors 0-7 and 8-15. 
+#[inline(always)] +fn compute_block_inner_products( + query_vec: &[f32], + doc: &BlockTranspose, + block: usize, +) -> (f32s, f32s) { + debug_assert!(block < doc.num_blocks()); + + // Use 4 accumulator registers to reduce FMA dependency chains + let mut sim1_a = f32s::default(diskann_wide::ARCH); + let mut sim2_a = f32s::default(diskann_wide::ARCH); + let mut sim1_b = f32s::default(diskann_wide::ARCH); + let mut sim2_b = f32s::default(diskann_wide::ARCH); + + // SAFETY: block < num_blocks() ensures this access is in-bounds. + let block_ptr = unsafe { doc.block_ptr_unchecked(block) }; + + let ncols = doc.ncols(); + + // Process 4 dimensions at a time, alternating accumulators + // Register count: 4 acc + 8 doc loads + 4 query = 16 registers (at AVX2 limit) + for dim in (0..ncols.saturating_sub(3)).step_by(4) { + // SAFETY: For all rows in this block, 16 reads are valid per dimension. + // dim + 3 < ncols ensures all dimension accesses are in-bounds. + // dim + 3 < ncols <= query_vec.len() by caller contract ensures query accesses are valid. 
+ let (d0_0, d1_0, d0_1, d1_1, d0_2, d1_2, d0_3, d1_3, q0, q1, q2, q3) = unsafe { + ( + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim + N2)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 1))), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 1) + N2)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 2))), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 2) + N2)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 3))), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 3) + N2)), + f32s::splat(diskann_wide::ARCH, *query_vec.get_unchecked(dim)), + f32s::splat(diskann_wide::ARCH, *query_vec.get_unchecked(dim + 1)), + f32s::splat(diskann_wide::ARCH, *query_vec.get_unchecked(dim + 2)), + f32s::splat(diskann_wide::ARCH, *query_vec.get_unchecked(dim + 3)), + ) + }; + + // Fused multiply-add into alternating accumulators (dims 0,2 -> _a, dims 1,3 -> _b) + sim1_a = q0.mul_add_simd(d0_0, sim1_a); + sim2_a = q0.mul_add_simd(d1_0, sim2_a); + sim1_b = q1.mul_add_simd(d0_1, sim1_b); + sim2_b = q1.mul_add_simd(d1_1, sim2_b); + sim1_a = q2.mul_add_simd(d0_2, sim1_a); + sim2_a = q2.mul_add_simd(d1_2, sim2_a); + sim1_b = q3.mul_add_simd(d0_3, sim1_b); + sim2_b = q3.mul_add_simd(d1_3, sim2_b); + } + + // Handle remaining dimensions (0-3) + for dim in (ncols - (ncols % 4))..ncols { + // SAFETY: dim < ncols ensures valid block access; dim < query_vec.len() by caller contract. 
+ let (d0, d1, q) = unsafe { + ( + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim + N2)), + f32s::splat(diskann_wide::ARCH, *query_vec.get_unchecked(dim)), + ) + }; + sim1_a = q.mul_add_simd(d0, sim1_a); + sim2_a = q.mul_add_simd(d1, sim2_a); + } + + // Combine accumulators + (sim1_a + sim1_b, sim2_a + sim2_b) +} diff --git a/experimental-multi-vector-bench/src/distance/transposed_tiling.rs b/experimental-multi-vector-bench/src/distance/transposed_tiling.rs new file mode 100644 index 000000000..2c427ab25 --- /dev/null +++ b/experimental-multi-vector-bench/src/distance/transposed_tiling.rs @@ -0,0 +1,353 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! Block-transposed SIMD implementation with tiling for multi-vector distance computation. +//! +//! This module provides a SIMD-accelerated implementation that combines block-transposed +//! memory layout for documents with query tiling for improved cache efficiency. +//! +//! # Tiling Strategy +//! +//! The key optimization is processing **pairs of query vectors together** against each +//! document block. This amortizes the cost of loading document data from memory by +//! reusing it for both query vectors simultaneously. +//! +//! # Performance +//! +//! This implementation achieves **1.8x-2.5x speedup** over the baseline SIMD approach. +//! Best performance is achieved when query token count is small (≤8). +//! +//! For scenarios with many query tokens (≥16), consider using +//! [`QueryTransposedWithTilingApproach`](super::QueryTransposedWithTilingApproach) which +//! transposes the query instead. +//! +//! # Register Allocation +//! +//! Both hot loops are carefully designed to use exactly 16 YMM registers (AVX2 limit): +//! - Pair processing: 8 accumulators + 4 doc loads + 4 query broadcasts = 16 registers +//! 
- Single fallback: 4 accumulators + 8 doc loads + 4 query broadcasts = 16 registers + +use diskann_quantization::algorithms::kmeans::BlockTranspose; +use diskann_vector::DistanceFunction; +use diskann_wide::{SIMDMask, SIMDMinMax, SIMDMulAdd, SIMDSelect, SIMDVector}; + +use super::Chamfer; +use crate::{MultiVector, TransposedMultiVector}; + +diskann_wide::alias!(f32s = f32x8); +diskann_wide::alias!(m32s = mask_f32x8); + +/// Block-transposed SIMD approach with tiling for Chamfer distance computation. +/// +/// This approach combines block-transposed memory layout for documents with query tiling +/// to improve cache utilization. The key insight is that when computing Chamfer distance, +/// we load document blocks from memory for each query vector. By processing pairs of query +/// vectors together, we can load each document block once and reuse it for both queries. +/// +/// # Algorithm +/// +/// Computes the asymmetric Chamfer distance: `Σ_q -max_d IP(q, d)` +/// +/// 1. Process query vectors in pairs (q1, q2) +/// 2. For each document block, load document values once and compute inner products +/// for both q1 and q2 simultaneously using 8 accumulators (4 per query) +/// 3. Track maximum similarity for each query using SIMD max operations +/// 4. 
Handle odd remainder query with a single-vector fallback +/// +/// # Performance Characteristics +/// +/// - **Best for**: Large configurations with many query and document tokens +/// - **Speedup**: 40-60% faster than baseline SIMD, 20-35% faster than transposed SIMD +/// - **Register usage**: Optimized for AVX2 (exactly 16 YMM registers in hot loops) +/// +/// # Example +/// +/// ``` +/// use experimental_multi_vector_bench::{ +/// Chamfer, TransposedWithTilingApproach, TransposedMultiVector, MultiVector, Standard, +/// }; +/// use diskann_vector::DistanceFunction; +/// +/// let query = MultiVector::new(Standard::new(8, 128), 0.0f32).unwrap(); +/// let doc = MultiVector::new(Standard::new(32, 128), 0.0f32).unwrap(); +/// let transposed_doc = TransposedMultiVector::from(&doc); +/// +/// let chamfer = Chamfer::::new(); +/// let distance = chamfer.evaluate_similarity(&query, &transposed_doc); +/// ``` +#[derive(Debug, Clone, Copy, Default)] +pub struct TransposedWithTilingApproach; + +/// Block size for transposed layout (number of vectors per block). +const N: usize = 16; +/// Half block size for dual-register processing. +const N2: usize = N / 2; + +impl DistanceFunction<&MultiVector, &TransposedMultiVector> + for Chamfer +{ + fn evaluate_similarity(&self, query: &MultiVector, doc: &TransposedMultiVector) -> f32 { + let block_transposed = doc.block_transposed(); + let num_queries = query.num_vectors(); + + let mut score = 0.0; + + // Process pairs of query vectors together to amortize document load costs + for i in (0..num_queries.saturating_sub(1)).step_by(2) { + // SAFETY: i + 1 < num_queries ensures both indices are valid. 
+ let (q1, q2) = unsafe { (query.get_row_unchecked(i), query.get_row_unchecked(i + 1)) }; + let (max1, max2) = max_inner_product_pair(q1, q2, block_transposed); + score += max1 + max2; + } + + // Handle odd remainder query vector + if !num_queries.is_multiple_of(2) { + // SAFETY: num_queries - 1 < num_queries ensures index is valid + score += max_inner_product_single( + unsafe { query.get_row_unchecked(num_queries - 1) }, + block_transposed, + ); + } + + score + } +} + +/// Process two query vectors against all document blocks simultaneously. +/// Returns (max_similarity_for_q1, max_similarity_for_q2), both negated. +/// +/// This amortizes document memory loads by reusing them for both queries. +/// +/// # Register Allocation (Unroll by 2) +/// +/// - 8 accumulators: sim1_q1_a, sim2_q1_a, sim1_q1_b, sim2_q1_b, +/// sim1_q2_a, sim2_q2_a, sim1_q2_b, sim2_q2_b +/// - 4 doc loads: d0_0, d1_0, d0_1, d1_1 +/// - 4 query broadcasts: q1_0, q1_1, q2_0, q2_1 +/// - Total: 16 YMM registers +#[inline(always)] +fn max_inner_product_pair(q1: &[f32], q2: &[f32], doc: &BlockTranspose) -> (f32, f32) { + #[inline(always)] + fn process_block_pair( + q1: &[f32], + q2: &[f32], + doc: &BlockTranspose, + block: usize, + ) -> (f32s, f32s, f32s, f32s) { + debug_assert!(block < doc.num_blocks()); + + // 8 accumulators total (4 per query) + let mut sim1_q1_a = f32s::default(diskann_wide::ARCH); + let mut sim2_q1_a = f32s::default(diskann_wide::ARCH); + let mut sim1_q1_b = f32s::default(diskann_wide::ARCH); + let mut sim2_q1_b = f32s::default(diskann_wide::ARCH); + let mut sim1_q2_a = f32s::default(diskann_wide::ARCH); + let mut sim2_q2_a = f32s::default(diskann_wide::ARCH); + let mut sim1_q2_b = f32s::default(diskann_wide::ARCH); + let mut sim2_q2_b = f32s::default(diskann_wide::ARCH); + + // SAFETY: block < num_blocks() ensures this access is in-bounds. 
+ let block_ptr = unsafe { doc.block_ptr_unchecked(block) }; + + let ncols = doc.ncols(); + + // Process 2 dimensions at a time + for dim in (0..ncols.saturating_sub(1)).step_by(2) { + // SAFETY: For all rows in this block, 16 reads are valid. + // dim + 1 < ncols ensures all dimension accesses are in-bounds. + // dim + 1 < ncols <= q1.len() and q2.len() by caller contract. + let (d0_0, d1_0, d0_1, d1_1, q1_0, q1_1, q2_0, q2_1) = unsafe { + ( + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim + N2)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 1))), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 1) + N2)), + f32s::splat(diskann_wide::ARCH, *q1.get_unchecked(dim)), + f32s::splat(diskann_wide::ARCH, *q1.get_unchecked(dim + 1)), + f32s::splat(diskann_wide::ARCH, *q2.get_unchecked(dim)), + f32s::splat(diskann_wide::ARCH, *q2.get_unchecked(dim + 1)), + ) + }; + + // FMA for query 1 + sim1_q1_a = q1_0.mul_add_simd(d0_0, sim1_q1_a); + sim2_q1_a = q1_0.mul_add_simd(d1_0, sim2_q1_a); + sim1_q1_b = q1_1.mul_add_simd(d0_1, sim1_q1_b); + sim2_q1_b = q1_1.mul_add_simd(d1_1, sim2_q1_b); + + // FMA for query 2 + sim1_q2_a = q2_0.mul_add_simd(d0_0, sim1_q2_a); + sim2_q2_a = q2_0.mul_add_simd(d1_0, sim2_q2_a); + sim1_q2_b = q2_1.mul_add_simd(d0_1, sim1_q2_b); + sim2_q2_b = q2_1.mul_add_simd(d1_1, sim2_q2_b); + } + + // Handle remaining dimension + if !ncols.is_multiple_of(2) { + let dim = ncols - 1; + // SAFETY: dim < ncols ensures valid block access; dim < q1.len() and q2.len(). 
+ let (d0, d1, q1_val, q2_val) = unsafe { + ( + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim + N2)), + f32s::splat(diskann_wide::ARCH, *q1.get_unchecked(dim)), + f32s::splat(diskann_wide::ARCH, *q2.get_unchecked(dim)), + ) + }; + sim1_q1_a = q1_val.mul_add_simd(d0, sim1_q1_a); + sim2_q1_a = q1_val.mul_add_simd(d1, sim2_q1_a); + sim1_q2_a = q2_val.mul_add_simd(d0, sim1_q2_a); + sim2_q2_a = q2_val.mul_add_simd(d1, sim2_q2_a); + } + + ( + sim1_q1_a + sim1_q1_b, + sim2_q1_a + sim2_q1_b, + sim1_q2_a + sim1_q2_b, + sim2_q2_a + sim2_q2_b, + ) + } + + let min_val = f32s::splat(diskann_wide::ARCH, f32::MIN); + let mut max_sim_q1 = min_val; + let mut max_sim_q2 = min_val; + + for block in 0..doc.full_blocks() { + let (sim1_q1, sim2_q1, sim1_q2, sim2_q2) = process_block_pair(q1, q2, doc, block); + max_sim_q1 = max_sim_q1.max_simd(sim1_q1); + max_sim_q1 = max_sim_q1.max_simd(sim2_q1); + max_sim_q2 = max_sim_q2.max_simd(sim1_q2); + max_sim_q2 = max_sim_q2.max_simd(sim2_q2); + } + + let remainder = doc.remainder(); + if remainder != 0 { + let (mut sim1_q1, mut sim2_q1, mut sim1_q2, mut sim2_q2) = + process_block_pair(q1, q2, doc, doc.full_blocks()); + + let lo = remainder.min(N2); + let hi = remainder.saturating_sub(N2); + + sim1_q1 = m32s::keep_first(diskann_wide::ARCH, lo).select(sim1_q1, min_val); + sim2_q1 = m32s::keep_first(diskann_wide::ARCH, hi).select(sim2_q1, min_val); + sim1_q2 = m32s::keep_first(diskann_wide::ARCH, lo).select(sim1_q2, min_val); + sim2_q2 = m32s::keep_first(diskann_wide::ARCH, hi).select(sim2_q2, min_val); + + max_sim_q1 = max_sim_q1.max_simd(sim1_q1); + max_sim_q1 = max_sim_q1.max_simd(sim2_q1); + max_sim_q2 = max_sim_q2.max_simd(sim1_q2); + max_sim_q2 = max_sim_q2.max_simd(sim2_q2); + } + + let max1 = -max_sim_q1.to_array().into_iter().fold(f32::MIN, f32::max); + let max2 = -max_sim_q2.to_array().into_iter().fold(f32::MIN, f32::max); + + (max1, max2) +} + +/// Fallback for 
single query vector (odd remainder). +/// +/// Uses unroll by 4 with 4 accumulators to hide FMA latency. +/// +/// # Register Allocation (Unroll by 4) +/// +/// - 4 accumulators: sim1_a, sim2_a, sim1_b, sim2_b +/// - 8 doc loads: d0_0..d0_3, d1_0..d1_3 +/// - 4 query broadcasts: q0, q1, q2, q3 +/// - Total: 16 YMM registers +#[inline(always)] +fn max_inner_product_single(query_vec: &[f32], doc: &BlockTranspose) -> f32 { + #[inline(always)] + fn process_block(query_vec: &[f32], doc: &BlockTranspose, block: usize) -> (f32s, f32s) { + debug_assert!(block < doc.num_blocks()); + + // Use 4 accumulator registers to reduce FMA dependency chains + let mut sim1_a = f32s::default(diskann_wide::ARCH); + let mut sim2_a = f32s::default(diskann_wide::ARCH); + let mut sim1_b = f32s::default(diskann_wide::ARCH); + let mut sim2_b = f32s::default(diskann_wide::ARCH); + + // SAFETY: block < num_blocks() ensures this access is in-bounds. + let block_ptr = unsafe { doc.block_ptr_unchecked(block) }; + + let ncols = doc.ncols(); + + // Process 4 dimensions at a time, alternating accumulators + // Register count: 4 acc + 8 doc loads + 4 query = 16 registers + for dim in (0..ncols.saturating_sub(3)).step_by(4) { + // SAFETY: For all rows in this block, 16 reads are valid per dimension. + // dim + 3 < ncols ensures all dimension accesses are in-bounds. + // dim + 3 < ncols <= query_vec.len() by caller contract ensures query accesses are valid. 
+ let (d0_0, d1_0, d0_1, d1_1, d0_2, d1_2, d0_3, d1_3, q0, q1, q2, q3) = unsafe { + ( + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim + N2)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 1))), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 1) + N2)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 2))), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 2) + N2)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 3))), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * (dim + 3) + N2)), + f32s::splat(diskann_wide::ARCH, *query_vec.get_unchecked(dim)), + f32s::splat(diskann_wide::ARCH, *query_vec.get_unchecked(dim + 1)), + f32s::splat(diskann_wide::ARCH, *query_vec.get_unchecked(dim + 2)), + f32s::splat(diskann_wide::ARCH, *query_vec.get_unchecked(dim + 3)), + ) + }; + + // Fused multiply-add into alternating accumulators (dims 0,2 -> _a, dims 1,3 -> _b) + sim1_a = q0.mul_add_simd(d0_0, sim1_a); + sim2_a = q0.mul_add_simd(d1_0, sim2_a); + sim1_b = q1.mul_add_simd(d0_1, sim1_b); + sim2_b = q1.mul_add_simd(d1_1, sim2_b); + sim1_a = q2.mul_add_simd(d0_2, sim1_a); + sim2_a = q2.mul_add_simd(d1_2, sim2_a); + sim1_b = q3.mul_add_simd(d0_3, sim1_b); + sim2_b = q3.mul_add_simd(d1_3, sim2_b); + } + + // Handle remaining dimensions (0-3) + for dim in (ncols - (ncols % 4))..ncols { + // SAFETY: dim < ncols ensures valid block access; dim < query_vec.len() by caller contract. 
+ let (d0, d1, q) = unsafe { + ( + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim)), + f32s::load_simd(diskann_wide::ARCH, block_ptr.add(N * dim + N2)), + f32s::splat(diskann_wide::ARCH, *query_vec.get_unchecked(dim)), + ) + }; + sim1_a = q.mul_add_simd(d0, sim1_a); + sim2_a = q.mul_add_simd(d1, sim2_a); + } + + (sim1_a + sim1_b, sim2_a + sim2_b) + } + + let min_val = f32s::splat(diskann_wide::ARCH, f32::MIN); + let mut max_similarity = min_val; + + for block in 0..doc.full_blocks() { + let (sim1, sim2) = process_block(query_vec, doc, block); + max_similarity = max_similarity.max_simd(sim1); + max_similarity = max_similarity.max_simd(sim2); + } + + let remainder = doc.remainder(); + if remainder != 0 { + let (mut sim1, mut sim2) = process_block(query_vec, doc, doc.full_blocks()); + + let lo = remainder.min(N2); + let hi = remainder.saturating_sub(N2); + + sim1 = m32s::keep_first(diskann_wide::ARCH, lo).select(sim1, min_val); + sim2 = m32s::keep_first(diskann_wide::ARCH, hi).select(sim2, min_val); + + max_similarity = max_similarity.max_simd(sim1); + max_similarity = max_similarity.max_simd(sim2); + } + + -max_similarity + .to_array() + .into_iter() + .fold(f32::MIN, f32::max) +} diff --git a/experimental-multi-vector-bench/src/lib.rs b/experimental-multi-vector-bench/src/lib.rs new file mode 100644 index 000000000..39d6e2346 --- /dev/null +++ b/experimental-multi-vector-bench/src/lib.rs @@ -0,0 +1,106 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +//! Experimental multi-vector benchmarking support for DiskANN. +//! +//! This crate provides high-performance distance functions for multi-vector representations, +//! where a single entity (document, image, etc.) is represented by multiple embedding vectors. +//! +//! # Overview +//! +//! Multi-vector representations are used in advanced retrieval systems that employ +//! 
"late interaction" - instead of encoding an entity into a single vector, each token +//! or segment produces its own embedding. This enables more fine-grained semantic matching +//! by computing similarity at the token/segment level rather than aggregating into a +//! single representation upfront. +//! +//! # Use Cases +//! +//! - **Token-level retrieval**: Each token in a document/query has its own embedding, +//! and relevance is computed via MaxSim (maximum similarity) aggregation. +//! - **Multi-aspect embeddings**: A single entity represented by embeddings from different +//! views, modalities, or chunked segments. +//! - **Passage chunking**: Long documents split into chunks, each with its own embedding, +//! where the final score aggregates similarities across all chunks. +//! +//! # Distance Computation +//! +//! For multi-vector search, the typical approach is: +//! 1. Compute pairwise Inner Product similarities between all vectors in the query and document. +//! 2. For each query vector, find the maximum similarity to any document vector. +//! 3. Negate and sum these values to get the final Chamfer distance (lower = more similar). +//! +//! This "late interaction" pattern preserves fine-grained token-level semantics while +//! enabling efficient pre-computation of document representations. +//! +//! # Available Approaches +//! +//! | Approach | Best For | Speedup vs SIMD | +//! |----------|----------|-----------------| +//! | [`NaiveApproach`] | Reference/debugging | 0.1x (baseline) | +//! | [`SimdApproach`] | General purpose | 1.0x | +//! | [`TransposedApproach`] | Medium Q×D | 1.4–1.7x | +//! | [`TransposedWithTilingApproach`] | Small D (≤32 docs) | 1.8–2.6x | +//! | [`QueryTransposedWithTilingApproach`] | Many queries (≥16) | 1.8–2.2x | +//! | [`SgemmApproach`] | Large Q×D (≥16×64) | 1.9–4.2x | +//! +//! # Type Aliases +//! +//! This crate uses types from `diskann-quantization` for multi-vector representation: +//! +//! 
- [`MultiVector`] = `Mat<Standard<f32>>` - Owning row-major matrix
+//! - [`MultiVectorRef`] = `MatRef<'a, Standard<f32>>` - Borrowed view
+//!
+//! # Example
+//!
+//! ```
+//! use experimental_multi_vector_bench::{
+//!     Chamfer, SimdApproach, TransposedWithTilingApproach,
+//!     MultiVector, TransposedMultiVector, Standard,
+//! };
+//! use diskann_vector::DistanceFunction;
+//!
+//! // Create multi-vectors
+//! let query = MultiVector::new(Standard::new(8, 128), 0.0f32).unwrap();
+//! let doc = MultiVector::new(Standard::new(32, 128), 0.0f32).unwrap();
+//!
+//! // Basic: SIMD-accelerated
+//! let chamfer = Chamfer::<SimdApproach>::new();
+//! let distance = chamfer.evaluate_similarity(&query, &doc);
+//!
+//! // Optimized: transpose documents for better cache utilization
+//! let chamfer = Chamfer::<TransposedWithTilingApproach>::new();
+//! let transposed_doc = TransposedMultiVector::from(&doc);
+//! let distance = chamfer.evaluate_similarity(&query, &transposed_doc);
+//!
+//! // For large Q×D: use SGEMM (best for ≥16 queries × ≥64 docs)
+//! use experimental_multi_vector_bench::{SgemmApproach, SgemmScratch};
+//! let chamfer = Chamfer::<SgemmApproach>::new();
+//! let mut scratch = SgemmScratch::new();
+//! let distance = chamfer.evaluate_similarity_with_scratch(&query, &doc, &mut scratch);
+//! ```
+
+#![warn(missing_docs)]
+
+pub mod bench;
+pub mod distance;
+mod multi_vector;
+
+pub use distance::{
+    Chamfer, NaiveApproach, QueryTransposedWithTilingApproach, SgemmApproach, SgemmScratch,
+    SimdApproach, TransposedApproach, TransposedWithTilingApproach,
+};
+pub use multi_vector::TransposedMultiVector;
+
+// Re-export types from diskann-quantization for unified multi-vector representation
+pub use diskann_quantization::multi_vector::{distance::QueryMatRef, Mat, MatRef, Standard};
+
+/// A multi-vector representation using standard f32 row-major format.
+///
+/// This is an alias for `Mat<Standard<f32>>` from diskann-quantization.
+pub type MultiVector = Mat<Standard<f32>>;
+
+/// An immutable view of a multi-vector.
+///
+/// This is an alias for `MatRef<'a, Standard<f32>>` from diskann-quantization.
+pub type MultiVectorRef<'a> = MatRef<'a, Standard<f32>>;
diff --git a/experimental-multi-vector-bench/src/multi_vector.rs b/experimental-multi-vector-bench/src/multi_vector.rs
new file mode 100644
index 000000000..248c8b5f0
--- /dev/null
+++ b/experimental-multi-vector-bench/src/multi_vector.rs
@@ -0,0 +1,115 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+//! Multi-vector representation for DiskANN.
+//!
+//! This module provides the [`TransposedMultiVector`] type for block-transposed
+//! SIMD-optimized multi-vector representations.
+//!
+//! For row-major multi-vectors, use [`MultiVector`](crate::MultiVector) (re-exported from
+//! `diskann_quantization::multi_vector::Mat<Standard<f32>>`).
+//!
+//! # Background
+//!
+//! Traditional vector search represents each document as a single embedding vector.
+//! Multi-vector representations instead encode each document (or query) as a *bag of embeddings*,
+//! typically one per token or segment. This approach enables:
+//!
+//! - **Fine-grained matching**: Token-level similarity captures nuanced semantic relationships
+//!   that single-vector representations may miss.
+//! - **Late interaction**: Document embeddings can be pre-computed offline, with only the
+//!   lightweight similarity aggregation performed at query time.
+//! - **Better recall**: Chamfer aggregation ensures that if any part of a query matches
+//!   any part of a document well, the document receives a high score.
+
+use diskann_quantization::algorithms::kmeans::BlockTranspose;
+use diskann_quantization::multi_vector::{MatRef, Standard};
+
+use crate::MultiVector;
+
+/// A document multi-vector with block-transposed layout for SIMD operations.
+///
+/// This structure provides a block-transposed memory layout optimized for SIMD
+/// distance computations. 
It groups 16 document vectors together and stores their +/// dimensions contiguously, enabling efficient SIMD operations by loading 8 values +/// at once (f32x8) and computing 16 inner products simultaneously. +/// +/// # Memory Layout +/// +/// ```text +/// Standard: [v0_d0, v0_d1, ...], [v1_d0, v1_d1, ...], ... +/// Transposed: [v0_d0..v15_d0], [v0_d1..v15_d1], ... +/// ``` +/// +/// # Usage +/// +/// Documents are transposed because in the Chamfer distance computation: +/// - We iterate over each query vector (row-major = sequential access) +/// - For each query vector, we compute inner products with all document vectors +/// - The transposed layout enables SIMD-parallel inner product computation +/// +/// # Examples +/// +/// ``` +/// use experimental_multi_vector_bench::{TransposedMultiVector, MultiVector, Standard}; +/// +/// // Create a multi-vector with 32 token embeddings of dimension 128 +/// let mv = MultiVector::new(Standard::new(32, 128), 0.0f32).unwrap(); +/// let transposed = TransposedMultiVector::from(&mv); +/// +/// assert_eq!(transposed.num_vectors(), 32); // 32 tokens +/// assert_eq!(transposed.vector_dim(), 128); // 128-dim embeddings +/// ``` +#[derive(Debug)] +pub struct TransposedMultiVector { + /// Block-transposed layout for SIMD-friendly access patterns. + block_transposed: BlockTranspose<16>, +} + +impl TransposedMultiVector { + /// Creates a new `TransposedMultiVector` from a [`MatRef`] view. + /// + /// This computes the block-transposed layout for SIMD-optimized distance + /// computations. 
+    pub fn from_view(view: MatRef<'_, Standard<f32>>) -> Self {
+        // Build a matrix view compatible with BlockTranspose
+        let nrows = view.num_vectors();
+        let ncols = view.vector_dim();
+
+        // Collect rows into a flat buffer for BlockTranspose
+        let mut data = vec![0.0f32; nrows * ncols];
+        for (i, row) in view.rows().enumerate() {
+            data[i * ncols..(i + 1) * ncols].copy_from_slice(row);
+        }
+
+        let matrix = diskann_utils::views::Matrix::try_from(data.into_boxed_slice(), nrows, ncols)
+            .expect("valid dimensions");
+
+        let block_transposed = BlockTranspose::from_matrix_view(matrix.as_view());
+        Self { block_transposed }
+    }
+
+    /// Returns a reference to the block-transposed representation.
+    #[inline]
+    pub fn block_transposed(&self) -> &BlockTranspose<16> {
+        &self.block_transposed
+    }
+
+    /// Returns the number of token embeddings in this multi-vector.
+    #[inline]
+    pub fn num_vectors(&self) -> usize {
+        self.block_transposed.nrows()
+    }
+
+    /// Returns the dimensionality of each token embedding.
+    #[inline]
+    pub fn vector_dim(&self) -> usize {
+        self.block_transposed.ncols()
+    }
+}
+
+impl From<&MultiVector> for TransposedMultiVector {
+    fn from(mv: &MultiVector) -> Self {
+        Self::from_view(mv.as_view())
+    }
+}