From 435ed9f05fa56bbf6381b02bf05642f2a48ec541 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 2 Jun 2026 17:57:41 +0000 Subject: [PATCH] Refactor sparse vector CLI args to be independent from dense vectors Previously sparse vectors reused the dense `--dim` parameter (or `--sparse-dim`) together with a `--sparse-vectors ` factor, which made it awkward to create a collection with both dense and sparse vectors since their sizes differ. Sparse vectors are now configured independently: - `--sparse-vectors` is a boolean flag to enable sparse vectors - `--sparse-vocab-size` controls the index range (vocabulary size), default 100k - `--sparse-avg-dim` controls the average number of non-zero values, default 32 `--sparse-dim` and the sparsity factor are removed; the new options imply `--sparse-vectors`. Co-authored-by: Cursor --- README.md | 12 ++++--- src/args.rs | 51 +++++++++++++++++++++++----- src/collection.rs | 2 +- src/common.rs | 85 ++++++++++++++++++++++++++++++++++++----------- src/search.rs | 8 ++--- src/upsert.rs | 6 ++-- 6 files changed, 123 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index 38bb30c..b8bdcdc 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Options: -m, --max-id If set, will randomly upsert/override vector ids within range [offset, max_id) -d, --dim - Number of dimensions in each dense vector or max dimension for sparse vectors [default: 128] + Number of dimensions in each dense vector [default: 128] -t, --threads Number of worker threads to use [default: 2] -p, --parallel @@ -166,14 +166,16 @@ Options: Delay between requests in milliseconds --indexed-only Skip un-indexed segments during search [possible values: true, false] - --sparse-vectors - Whether to use sparse vectors and with how much sparsity + --sparse-vectors + Use sparse vectors. Can be combined with dense vectors. Sparse vectors are configured independently from dense vectors via `--sparse-vocab-size` (index range) and `--sparse-avg-dim` (average number of non-zero values). This flag is implied when either of those options is set --sparse-vectors-per-point Number of named sparse vectors per point [default: 1] + --sparse-vocab-size + Vocabulary size for sparse vectors, i.e. the range of possible indices. Implies `--sparse-vectors`. [default: 100000] + --sparse-avg-dim + Average number of non-zero values per sparse vector. Implies `--sparse-vectors`. [default: 32] --multivector-size Whether to set dense vectors as multivectors - --sparse-dim - Max dimension for sparse vectors (overrides --dim) --jsonl-updates Path to the jsonl file to save update timings TIP: Use `qdrant/mri` to visualize the timings --jsonl-searches diff --git a/src/args.rs b/src/args.rs index cf025a4..9ece77e 100644 --- a/src/args.rs +++ b/src/args.rs @@ -54,7 +54,7 @@ pub struct Args { #[clap(short, long, value_parser = parse_number)] pub max_id: Option, - /// Number of dimensions in each dense vector or max dimension for sparse vectors + /// Number of dimensions in each dense vector #[clap(short, long, default_value_t = 128, value_parser = parse_number)] pub dim: usize, @@ -353,22 +353,33 @@ pub struct Args { #[clap(long)] pub indexed_only: Option, - /// Whether to use sparse vectors and with how much sparsity - #[clap(long, value_name = "SPARSITY")] - pub sparse_vectors: Option, + /// Use sparse vectors. Can be combined with dense vectors. + /// + /// Sparse vectors are configured independently from dense vectors via + /// `--sparse-vocab-size` (index range) and `--sparse-avg-dim` (average + /// number of non-zero values). This flag is implied when either of those + /// options is set. + #[clap(long, default_value_t = false)] + pub sparse_vectors: bool, /// Number of named sparse vectors per point #[clap(long, default_value_t = 1)] pub sparse_vectors_per_point: usize, + /// Vocabulary size for sparse vectors, i.e. the range of possible indices. + /// Implies `--sparse-vectors`. [default: 100000] + #[clap(long, value_parser = parse_number)] + pub sparse_vocab_size: Option, + + /// Average number of non-zero values per sparse vector. + /// Implies `--sparse-vectors`. [default: 32] + #[clap(long, value_parser = parse_number)] + pub sparse_avg_dim: Option, + /// Whether to set dense vectors as multivectors #[clap(long)] pub multivector_size: Option, - /// Max dimension for sparse vectors (overrides --dim) - #[clap(long, value_parser = parse_number)] - pub sparse_dim: Option, - /// Path to the jsonl file to save update timings /// TIP: Use `qdrant/mri` to visualize the timings #[clap(long)] @@ -410,12 +421,36 @@ pub struct Args { pub full_scan_threshold: Option, } +/// Default vocabulary size (index range) for sparse vectors. +pub const DEFAULT_SPARSE_VOCAB_SIZE: usize = 100_000; + +/// Default average number of non-zero values per sparse vector. +pub const DEFAULT_SPARSE_AVG_DIM: usize = 32; + impl Args { pub fn is_uint8_datatype(&self) -> bool { self.datatype .as_ref() .is_some_and(|x| x == qdrant::Datatype::Uint8.as_str_name()) } + + /// Whether sparse vectors should be used. + /// + /// Enabled either via the `--sparse-vectors` flag or implicitly when any + /// of the sparse configuration options is set. + pub fn use_sparse_vectors(&self) -> bool { + self.sparse_vectors || self.sparse_vocab_size.is_some() || self.sparse_avg_dim.is_some() + } + + /// Vocabulary size (index range) used to generate sparse vectors. + pub fn sparse_vocab_size(&self) -> usize { + self.sparse_vocab_size.unwrap_or(DEFAULT_SPARSE_VOCAB_SIZE) + } + + /// Average number of non-zero values per generated sparse vector. + pub fn sparse_avg_dim(&self) -> usize { + self.sparse_avg_dim.unwrap_or(DEFAULT_SPARSE_AVG_DIM) + } } #[derive(Copy, Clone, Debug)] diff --git a/src/collection.rs b/src/collection.rs index cdec774..870a63c 100644 --- a/src/collection.rs +++ b/src/collection.rs @@ -121,7 +121,7 @@ pub async fn recreate_collection(args: &Args, stopped: Arc) -> Resul let vectors_config: VectorsConfig = dense_vector_params.clone().into(); - let sparse_vectors_config = if args.sparse_vectors.is_some() { + let sparse_vectors_config = if args.use_sparse_vectors() { let params: HashMap<_, _> = (0..args.sparse_vectors_per_point) .map(|idx| { let key = format!("{idx}_sparse"); diff --git a/src/common.rs b/src/common.rs index 48e98ef..140874e 100644 --- a/src/common.rs +++ b/src/common.rs @@ -14,6 +14,7 @@ use rand::distr::Distribution; use rand::prelude::SliceRandom; use rand::seq::IndexedRandom; use serde_json::json; +use std::collections::HashSet; use std::time::Duration; use tokio::time::interval; use tokio_stream::StreamExt; @@ -303,23 +304,39 @@ pub fn random_vector(rng: &mut impl Rng, args: &Args) -> Vector { } } -/// Generate random sparse vector with random size and random values. -/// - `max_size` - maximum size of vector -/// - `sparsity` - how many non-zero values should be in vector -pub fn random_sparse_vector(rng: &mut impl Rng, max_size: usize, sparsity: f64) -> Vec<(u32, f32)> { - let size = rng.random_range(1..max_size); - // (index, value) - let mut pairs = Vec::with_capacity(size); - for i in 1..=size { - // probability of skipping a dimension to make the vectors sparse - let skip = !rng.random_bool(sparsity); - if skip { - continue; - } - // Only positive values are generated to make sure to hit the pruning path. - pairs.push((i as u32, rng.random_range(0.0..10.0) as f32)); +/// Generate random sparse vector with random values. +/// +/// - `vocab_size` - the range of possible indices (1..=vocab_size) +/// - `avg_dim` - the average number of non-zero values per vector +/// +/// The actual number of non-zero values is randomized around `avg_dim` to +/// produce vectors of varying length, while keeping the average close to the +/// requested value. +pub fn random_sparse_vector( + rng: &mut impl Rng, + vocab_size: usize, + avg_dim: usize, +) -> Vec<(u32, f32)> { + if vocab_size == 0 || avg_dim == 0 { + return Vec::new(); } - pairs + + // Randomize the number of non-zero values uniformly in [1, 2*avg_dim], + // which keeps the expected value at `avg_dim`. Clamp to the vocabulary size. + let max_dim = (2 * avg_dim).min(vocab_size); + let size = rng.random_range(1..=max_dim); + + // Sample `size` distinct indices from 1..=vocab_size. + let mut indices: HashSet = HashSet::with_capacity(size); + while indices.len() < size { + indices.insert(rng.random_range(1..=vocab_size) as u32); + } + + indices + .into_iter() + // Only positive values are generated to make sure to hit the pruning path. + .map(|idx| (idx, rng.random_range(0.0..10.0) as f32)) + .collect() } pub fn random_dense_vector(rng: &mut impl Rng, dim: usize, is_uint: bool) -> Vec { @@ -515,18 +532,46 @@ mod tests { #[test] fn test_random_sparse_vector_bounds() { let mut rng = seeded_rng(); - let pairs = random_sparse_vector(&mut rng, 100, 0.5); + let vocab_size = 1000; + let pairs = random_sparse_vector(&mut rng, vocab_size, 16); for &(idx, val) in &pairs { assert!(idx >= 1, "sparse index should be >= 1, got {idx}"); - assert!(idx <= 100, "sparse index should be <= max_size, got {idx}"); + assert!( + idx <= vocab_size as u32, + "sparse index should be <= vocab_size, got {idx}" + ); assert!(val >= 0.0, "sparse value should be non-negative, got {val}"); } } #[test] - fn test_random_sparse_vector_empty_with_zero_sparsity() { + fn test_random_sparse_vector_distinct_indices() { + let mut rng = seeded_rng(); + let pairs = random_sparse_vector(&mut rng, 1000, 16); + let unique: HashSet = pairs.iter().map(|&(idx, _)| idx).collect(); + assert_eq!(unique.len(), pairs.len(), "indices should be distinct"); + } + + #[test] + fn test_random_sparse_vector_average_dim() { + let mut rng = seeded_rng(); + let avg_dim = 32; + let samples = 1000; + let total: usize = (0..samples) + .map(|_| random_sparse_vector(&mut rng, 100_000, avg_dim).len()) + .sum(); + let observed = total as f64 / samples as f64; + // Expected value of uniform [1, 2*avg_dim] is roughly avg_dim. + assert!( + (observed - avg_dim as f64).abs() < avg_dim as f64 * 0.2, + "observed average dim {observed} should be close to {avg_dim}" + ); + } + + #[test] + fn test_random_sparse_vector_empty_with_zero_avg_dim() { let mut rng = seeded_rng(); - let pairs = random_sparse_vector(&mut rng, 100, 0.0); + let pairs = random_sparse_vector(&mut rng, 100, 0); assert!(pairs.is_empty()); } } diff --git a/src/search.rs b/src/search.rs index 3ec59ca..9a62e2b 100644 --- a/src/search.rs +++ b/src/search.rs @@ -69,7 +69,7 @@ impl SearchProcessor { &self, rng: &mut impl Rng, ) -> Vec<(Vec, Option, Option)> { - if let Some(sparsity) = self.args.sparse_vectors { + if self.args.use_sparse_vectors() { let name = format!( "{}_sparse", random_vector_name(rng, self.args.sparse_vectors_per_point) @@ -79,8 +79,8 @@ impl SearchProcessor { .map(|_| { let sparse_vector_tuples = random_sparse_vector( rng, - self.args.sparse_dim.unwrap_or(self.args.dim), - sparsity, + self.args.sparse_vocab_size(), + self.args.sparse_avg_dim(), ); let (indices, values): (Vec<_>, Vec<_>) = sparse_vector_tuples.into_iter().unzip(); @@ -190,7 +190,7 @@ impl SearchProcessor { let start = std::time::Instant::now(); let mut rng = rand::rng(); - let has_sparse = self.args.sparse_vectors.is_some(); + let has_sparse = self.args.use_sparse_vectors(); let has_dense = self.args.vectors_per_point > 0; let use_sparse = match (has_sparse, has_dense) { diff --git a/src/upsert.rs b/src/upsert.rs index 6ed608c..ef2ff1c 100644 --- a/src/upsert.rs +++ b/src/upsert.rs @@ -131,15 +131,15 @@ impl UpsertProcessor { random_vector(&mut rng, &self.args).into() }; - let vectors: Vectors = if let Some(sparsity) = self.args.sparse_vectors { + let vectors: Vectors = if self.args.use_sparse_vectors() { let mut vectors_map: HashMap<_, _> = Default::default(); for i in 0..self.args.sparse_vectors_per_point { let vector_name = format!("{i}_sparse"); let vector = Vector::from(random_sparse_vector( &mut rng, - self.args.sparse_dim.unwrap_or(self.args.dim), - sparsity, + self.args.sparse_vocab_size(), + self.args.sparse_avg_dim(), )); vectors_map.insert(vector_name, vector); }