microsoft · suri-kumkaran · Feb 5, 2026 · Feb 6, 2026 · Feb 6, 2026 · Feb 6, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -22,6 +22,8 @@ members = [
     "diskann-benchmark-simd",
     "diskann-benchmark",
     "diskann-tools",
+    # Experimental
+    "experimental-multi-vector-bench",
 ]
 
 default-members = [
@@ -62,6 +64,8 @@ diskann-label-filter = { path = "diskann-label-filter", version = "0.45.0" }
 diskann-benchmark-runner = { path = "diskann-benchmark-runner", version = "0.45.0" }
 diskann-benchmark-core = { path = "diskann-benchmark-core", version = "0.45.0" }
 diskann-tools = { path = "diskann-tools", version = "0.45.0" }
+# Experimental
+experimental-multi-vector-bench = { path = "experimental-multi-vector-bench", version = "0.45.0" }
 
 # External dependencies (shared versions)
 anyhow = "1.0.98"

diff --git a/diskann-quantization/src/multi_vector/matrix.rs b/diskann-quantization/src/multi_vector/matrix.rs
@@ -495,7 +495,13 @@ impl<T: ReprOwned> Mat<T> {
         }
     }
 
-    pub(crate) unsafe fn get_row_unchecked(&self, i: usize) -> T::Row<'_> {
+    /// Returns the i-th row without bounds checking.
+    ///
+    /// # Safety
+    ///
+    /// `i` must be less than `self.num_vectors()`.
+    #[inline]
+    pub unsafe fn get_row_unchecked(&self, i: usize) -> T::Row<'_> {
         // SAFETY: Caller must ensure i < self.num_vectors(). The constructors for this type
         // ensure that `ptr` is compatible with `T`.
         unsafe { self.repr.get_row(self.ptr, i) }
@@ -581,6 +587,17 @@ impl<T: Copy> Mat<Standard<T>> {
     pub fn vector_dim(&self) -> usize {
         self.repr.ncols()
     }
+
+    /// Exposes every element of the matrix as one flat slice.
+    ///
+    /// Elements are laid out row by row: `[row0_col0, row0_col1, ..., row0_colN, row1_col0, ...]`.
+    #[inline]
+    pub fn as_slice(&self) -> &[T] {
+        let (rows, cols) = (self.repr.nrows(), self.repr.ncols());
+        // SAFETY: `Standard` keeps its rows contiguous in row-major order, and this `Mat` was
+        // constructed over valid data of the matching length (`rows * cols` elements).
+        unsafe { std::slice::from_raw_parts(self.ptr.as_ptr().cast::<T>(), rows * cols) }
+    }
 }
 
 ////////////
@@ -651,7 +668,7 @@ impl<'a, T: Repr> MatRef<'a, T> {
     ///
     /// `i` must be less than `self.num_vectors()`.
     #[inline]
-    pub(crate) unsafe fn get_row_unchecked(&self, i: usize) -> T::Row<'_> {
+    pub unsafe fn get_row_unchecked(&self, i: usize) -> T::Row<'_> {
         // SAFETY: Caller must ensure i < self.num_vectors().
         unsafe { self.repr.get_row(self.ptr, i) }
     }
@@ -683,6 +700,17 @@ impl<'a, T: Copy> MatRef<'a, Standard<T>> {
     pub fn vector_dim(&self) -> usize {
         self.repr.ncols()
     }
+
+    /// Views the referenced matrix data as a single contiguous slice.
+    ///
+    /// Rows follow one another in order: `[row0_col0, row0_col1, ..., row0_colN, row1_col0, ...]`.
+    #[inline]
+    pub fn as_slice(&self) -> &[T] {
+        let data = self.ptr.as_ptr().cast::<T>();
+        // SAFETY: The `Standard` representation guarantees a contiguous row-major layout, and
+        // this `MatRef` was constructed over valid data of exactly `nrows * ncols` elements.
+        unsafe { std::slice::from_raw_parts(data, self.repr.nrows() * self.repr.ncols()) }
+    }
 }
 
 // Reborrow: Mat -> MatRef
@@ -784,7 +812,7 @@ impl<'a, T: ReprMut> MatMut<'a, T> {
     ///
     /// `i` must be less than `self.num_vectors()`.
     #[inline]
-    pub(crate) unsafe fn get_row_unchecked(&self, i: usize) -> T::Row<'_> {
+    pub unsafe fn get_row_unchecked(&self, i: usize) -> T::Row<'_> {
         // SAFETY: Caller must ensure i < self.num_vectors().
         unsafe { self.repr.get_row(self.ptr, i) }
     }

diff --git a/experimental-multi-vector-bench/Cargo.toml b/experimental-multi-vector-bench/Cargo.toml
@@ -0,0 +1,35 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+[package]
+name = "experimental-multi-vector-bench"
+edition.workspace = true
+version.workspace = true
+authors.workspace = true
+description = "Experimental multi-vector benchmarking support for DiskANN"
+documentation.workspace = true
+license.workspace = true
+
+[[bin]]
+name = "multivec-bench"
+path = "src/bin/multivec_bench.rs"
+
+[dependencies]
+diskann-linalg.workspace = true
+diskann-utils.workspace = true
+diskann-quantization.workspace = true
+diskann-vector.workspace = true
+diskann-wide.workspace = true
+
+# Benchmark dependencies
+anyhow.workspace = true
+diskann-benchmark-runner.workspace = true
+rand.workspace = true
+serde = { workspace = true, features = ["derive"] }
+serde_json.workspace = true
+thiserror.workspace = true
+
+[dev-dependencies]
+tempfile.workspace = true
+
+[lints]
+workspace = true
diff --git a/experimental-multi-vector-bench/README.md b/experimental-multi-vector-bench/README.md
@@ -0,0 +1,135 @@
+# experimental-multi-vector-bench
+
+Experimental multi-vector benchmarking support for DiskANN, enabling late interaction retrieval with token-level embeddings.
+
+## Scope & Goals
+
+This crate is an **experimental workspace** focused on:
+
+1. **Fast Chamfer distance implementation for `f32`** - Develop and benchmark high-performance implementations of the Chamfer distance function for multi-vector representations using 32-bit floating point values.
+
+2. **Multiple computation approaches** - Compare naive scalar, SIMD-accelerated, transposed, tiling, and SGEMM implementations to quantify performance gains.
+
+3. **Benchmarking infrastructure** - Provide tooling to measure and compare different implementation strategies.
+
+## Current Status
+
+- ✅ `MultiVector` type alias for `Mat<Standard<f32>>` (row-major storage from diskann-quantization)
+- ✅ `TransposedMultiVector` type for block-transposed storage (SIMD-optimized)
+- ✅ `Chamfer<Approach>` - Generic distance calculator using Inner Product similarity
+- ✅ `Chamfer<NaiveApproach>` - Scalar baseline implementation
+- ✅ `Chamfer<SimdApproach>` - SIMD-accelerated implementation
+- ✅ `Chamfer<TransposedApproach>` - Block-transposed SIMD with transposed documents
+- ✅ `Chamfer<TransposedWithTilingApproach>` - Block-transposed SIMD with query pair tiling
+- ✅ `Chamfer<QueryTransposedWithTilingApproach>` - Transposed query with doc pair tiling
+- ✅ `Chamfer<SgemmApproach>` - BLAS SGEMM + SIMD row-max
+- ✅ Implements `diskann_vector::DistanceFunction` trait for ecosystem compatibility
+- ✅ Benchmark utility integrated with diskann-benchmark-runner
+
+## Usage
+
+```rust
+use experimental_multi_vector_bench::{
+    Chamfer, SimdApproach, TransposedWithTilingApproach, QueryTransposedWithTilingApproach,
+    MultiVector, TransposedMultiVector, Standard,
+};
+use diskann_vector::DistanceFunction;
+
+let query = MultiVector::new(Standard::new(3, 4), 0.0f32).unwrap(); // 3 vectors of dimension 4
+let document = MultiVector::new(Standard::new(5, 4), 0.0f32).unwrap(); // 5 vectors of dimension 4
+
+// Basic usage with row-major vectors (NaiveApproach or SimdApproach)
+let chamfer = Chamfer::<SimdApproach>::new();
+let distance = chamfer.evaluate_similarity(&query, &document);
+
+// Optimized for few query tokens (≤8): transpose documents
+let chamfer = Chamfer::<TransposedWithTilingApproach>::new();
+let transposed_doc = TransposedMultiVector::from(&document);
+let distance = chamfer.evaluate_similarity(&query, &transposed_doc);
+
+// Optimized for many query tokens (≥16): transpose query instead
+let chamfer = Chamfer::<QueryTransposedWithTilingApproach>::new();
+let transposed_query = TransposedMultiVector::from(&query);
+let distance = chamfer.evaluate_similarity(&transposed_query, &document);
+
+// For large Q×D: use SGEMM
+use experimental_multi_vector_bench::{SgemmApproach, SgemmScratch};
+let chamfer = Chamfer::<SgemmApproach>::new();
+let mut scratch = SgemmScratch::new();
+let distance = chamfer.evaluate_similarity_with_scratch(&query, &document, &mut scratch);
+```
+
+## Type Aliases
+
+This crate uses shared types from `diskann-quantization` for multi-vector representation:
+
+```rust
+// Row-major owning matrix
+pub type MultiVector = Mat<Standard<f32>>;
+
+// Immutable view
+pub type MultiVectorRef<'a> = MatRef<'a, Standard<f32>>;
+```
+
+The `Standard<f32>` representation provides:
+
+- Contiguous row-major storage
+- Direct `as_slice()` access for BLAS operations
+- Zero-copy views via `MatRef`
+
+## Future Work
+
+- [ ] Add RFC based on findings for DiskANN integration
+- [ ] Additional similarity measures (Cosine, SquaredL2)
+- [ ] Support for additional element types (`f16`, `u8` quantized, etc.)
+
+## Running Benchmarks
+
+```bash
+# Run benchmarks with example configuration
+cargo run --release -p experimental-multi-vector-bench --bin multivec-bench -- run \
+    --input-file experimental-multi-vector-bench/examples/bench.json \
+    --output-file results.json
+
+# Verify correctness (all approaches should produce the same checksum)
+cargo run --release -p experimental-multi-vector-bench --bin multivec-bench -- run \
+    --input-file experimental-multi-vector-bench/examples/verify.json \
+    --output-file verify_results.json
+```
+
+See [examples/bench.json](examples/bench.json) for the benchmark configuration format.
+
+### Benchmark Configuration
+
+The benchmark supports six approaches via the `approach` field:
+
+- `"naive"` - Scalar baseline
+- `"simd"` - SIMD-accelerated
+- `"transposed_simd"` - Block-transposed SIMD
+- `"transposed_with_tiling"` - Block-transposed SIMD with query pair tiling
+- `"query_transposed_with_tiling"` - Transposed query with doc pair tiling
+- `"sgemm"` - BLAS SGEMM + SIMD row-max
+
+## Module Structure
+
+```text
+src/
+├── lib.rs                       # Crate root with re-exports and type aliases
+├── multi_vector.rs              # TransposedMultiVector type (block-transposed storage)
+├── distance/
+│   ├── mod.rs                   # Chamfer<Approach> generic struct
+│   ├── naive.rs                 # Scalar implementation (NaiveApproach)
+│   ├── simd.rs                  # SIMD-accelerated (SimdApproach)
+│   ├── transposed.rs            # Transposed docs (TransposedApproach)
+│   ├── transposed_tiling.rs     # Transposed docs + query tiling (TransposedWithTilingApproach)
+│   ├── query_transposed_tiling.rs # Transposed query + doc tiling (QueryTransposedWithTilingApproach)
+│   └── sgemm.rs                 # BLAS SGEMM + row-max (SgemmApproach)
+└── bench/
+    ├── mod.rs                   # Benchmark registration and dispatch
+    ├── input.rs                 # Benchmark input types
+    └── runner.rs                # Benchmark execution logic
+```
+
+## Contributing
+
+This work is experimental and will be submitted as separate PRs.
diff --git a/experimental-multi-vector-bench/examples/bench.json b/experimental-multi-vector-bench/examples/bench.json
@@ -0,0 +1,95 @@
+{
+  "search_directories": [],
+  "jobs": [
+    {
+      "type": "multivec-op",
+      "content": {
+        "approach": "simd",
+        "runs": [
+          { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 },
+          { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 },
+          { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 16 },
+          { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 },
+          { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 },
+          { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 }
+        ]
+      }
+    },
+    {
+      "type": "multivec-op",
+      "content": {
+        "approach": "transposed_simd",
+        "runs": [
+          { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 },
+          { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 },
+          { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 16 },
+          { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 },
+          { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 },
+          { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 }
+        ]
+      }
+    },
+    {
+      "type": "multivec-op",
+      "content": {
+        "approach": "transposed_with_tiling",
+        "runs": [
+          { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 },
+          { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 },
+          { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 16 },
+          { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 },
+          { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 },
+          { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 }
+        ]
+      }
+    },
+    {
+      "type": "multivec-op",
+      "content": {
+        "approach": "query_transposed_with_tiling",
+        "runs": [
+          { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 },
+          { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 },
+          { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 16 },
+          { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 },
+          { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 },
+          { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 }
+        ]
+      }
+    },
+    {
+      "type": "multivec-op",
+      "content": {
+        "approach": "sgemm",
+        "runs": [
+          { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 },
+          { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 },
+          { "dim": 128, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 },
+          { "dim": 256, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 16 },
+          { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 8, "num_doc_token": 32 },
+          { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 16, "num_doc_token": 64 },
+          { "dim": 384, "num_points": 100, "loops_per_measurement": 10, "num_measurements": 50, "num_query_token": 32, "num_doc_token": 128 }
+        ]
+      }
+    }
+  ]
+}