Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Options:
-m, --max-id <MAX_ID>
If set, will randomly upsert/override vector ids within range [offset, max_id)
-d, --dim <DIM>
Number of dimensions in each dense vector or max dimension for sparse vectors [default: 128]
Number of dimensions in each dense vector [default: 128]
-t, --threads <THREADS>
Number of worker threads to use [default: 2]
-p, --parallel <PARALLEL>
Expand Down Expand Up @@ -166,14 +166,16 @@ Options:
Delay between requests in milliseconds
--indexed-only <INDEXED_ONLY>
Skip un-indexed segments during search [possible values: true, false]
--sparse-vectors <SPARSITY>
Whether to use sparse vectors and with how much sparsity
--sparse-vectors
Use sparse vectors. Can be combined with dense vectors. Sparse vectors are configured independently from dense vectors via `--sparse-vocab-size` (index range) and `--sparse-avg-dim` (average number of non-zero values). This flag is implied when either of those options is set
--sparse-vectors-per-point <SPARSE_VECTORS_PER_POINT>
Number of named sparse vectors per point [default: 1]
--sparse-vocab-size <SPARSE_VOCAB_SIZE>
Vocabulary size for sparse vectors, i.e. the range of possible indices. Implies `--sparse-vectors`. [default: 100000]
--sparse-avg-dim <SPARSE_AVG_DIM>
Average number of non-zero values per sparse vector. Implies `--sparse-vectors`. [default: 32]
--multivector-size <MULTIVECTOR_SIZE>
Whether to set dense vectors as multivectors
--sparse-dim <SPARSE_DIM>
Max dimension for sparse vectors (overrides --dim)
--jsonl-updates <JSONL_UPDATES>
Path to the jsonl file to save update timings. TIP: Use `qdrant/mri` to visualize the timings
--jsonl-searches <JSONL_SEARCHES>
Expand Down
51 changes: 43 additions & 8 deletions src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ pub struct Args {
#[clap(short, long, value_parser = parse_number)]
pub max_id: Option<usize>,

/// Number of dimensions in each dense vector or max dimension for sparse vectors
/// Number of dimensions in each dense vector
#[clap(short, long, default_value_t = 128, value_parser = parse_number)]
pub dim: usize,

Expand Down Expand Up @@ -353,22 +353,33 @@ pub struct Args {
#[clap(long)]
pub indexed_only: Option<bool>,

/// Whether to use sparse vectors and with how much sparsity
#[clap(long, value_name = "SPARSITY")]
pub sparse_vectors: Option<f64>,
/// Use sparse vectors. Can be combined with dense vectors.
///
/// Sparse vectors are configured independently from dense vectors via
/// `--sparse-vocab-size` (index range) and `--sparse-avg-dim` (average
/// number of non-zero values). This flag is implied when either of those
/// options is set.
#[clap(long, default_value_t = false)]
pub sparse_vectors: bool,

/// Number of named sparse vectors per point
#[clap(long, default_value_t = 1)]
pub sparse_vectors_per_point: usize,

/// Vocabulary size for sparse vectors, i.e. the range of possible indices.
/// Implies `--sparse-vectors`. [default: 100000]
#[clap(long, value_parser = parse_number)]
pub sparse_vocab_size: Option<usize>,

/// Average number of non-zero values per sparse vector.
/// Implies `--sparse-vectors`. [default: 32]
#[clap(long, value_parser = parse_number)]
pub sparse_avg_dim: Option<usize>,

/// Whether to set dense vectors as multivectors
#[clap(long)]
pub multivector_size: Option<usize>,

/// Max dimension for sparse vectors (overrides --dim)
#[clap(long, value_parser = parse_number)]
pub sparse_dim: Option<usize>,

/// Path to the jsonl file to save update timings
/// TIP: Use `qdrant/mri` to visualize the timings
#[clap(long)]
Expand Down Expand Up @@ -410,12 +421,36 @@ pub struct Args {
pub full_scan_threshold: Option<usize>,
}

/// Default vocabulary size (index range) for sparse vectors.
///
/// Returned by `Args::sparse_vocab_size` when `--sparse-vocab-size` is not
/// given. The help text of that option spells the value out by hand as
/// `[default: 100000]`, so keep the two in sync when changing it.
pub const DEFAULT_SPARSE_VOCAB_SIZE: usize = 100_000;

/// Default average number of non-zero values per sparse vector.
///
/// Returned by `Args::sparse_avg_dim` when `--sparse-avg-dim` is not given.
/// The help text of that option spells the value out by hand as
/// `[default: 32]`, so keep the two in sync when changing it.
pub const DEFAULT_SPARSE_AVG_DIM: usize = 32;

impl Args {
    /// Returns `true` when a datatype is configured and it equals the string
    /// name of `qdrant::Datatype::Uint8`; `false` when it differs or is unset.
    pub fn is_uint8_datatype(&self) -> bool {
        let uint8_name = qdrant::Datatype::Uint8.as_str_name();
        self.datatype
            .as_ref()
            .is_some_and(|datatype| datatype == uint8_name)
    }

    /// Reports whether sparse vectors are in use.
    ///
    /// This is the case when `--sparse-vectors` was passed explicitly, or
    /// when at least one of the sparse tuning options (`--sparse-vocab-size`,
    /// `--sparse-avg-dim`) carries a value.
    pub fn use_sparse_vectors(&self) -> bool {
        // The explicit flag wins outright; no need to look at the options.
        if self.sparse_vectors {
            return true;
        }
        let vocab_size_given = self.sparse_vocab_size.is_some();
        vocab_size_given || self.sparse_avg_dim.is_some()
    }

    /// Effective sparse vocabulary size (range of possible indices):
    /// the user-supplied value, or `DEFAULT_SPARSE_VOCAB_SIZE` if none.
    pub fn sparse_vocab_size(&self) -> usize {
        let configured = self.sparse_vocab_size;
        configured.unwrap_or(DEFAULT_SPARSE_VOCAB_SIZE)
    }

    /// Effective average number of non-zero entries per sparse vector:
    /// the user-supplied value, or `DEFAULT_SPARSE_AVG_DIM` if none.
    pub fn sparse_avg_dim(&self) -> usize {
        let configured = self.sparse_avg_dim;
        configured.unwrap_or(DEFAULT_SPARSE_AVG_DIM)
    }
}

#[derive(Copy, Clone, Debug)]
Expand Down
2 changes: 1 addition & 1 deletion src/collection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ pub async fn recreate_collection(args: &Args, stopped: Arc<AtomicBool>) -> Resul

let vectors_config: VectorsConfig = dense_vector_params.clone().into();

let sparse_vectors_config = if args.sparse_vectors.is_some() {
let sparse_vectors_config = if args.use_sparse_vectors() {
let params: HashMap<_, _> = (0..args.sparse_vectors_per_point)
.map(|idx| {
let key = format!("{idx}_sparse");
Expand Down
85 changes: 65 additions & 20 deletions src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use rand::distr::Distribution;
use rand::prelude::SliceRandom;
use rand::seq::IndexedRandom;
use serde_json::json;
use std::collections::HashSet;
use std::time::Duration;
use tokio::time::interval;
use tokio_stream::StreamExt;
Expand Down Expand Up @@ -303,23 +304,39 @@ pub fn random_vector(rng: &mut impl Rng, args: &Args) -> Vector {
}
}

/// Generate random sparse vector with random size and random values.
/// - `max_size` - maximum size of vector
/// - `sparsity` - how many non-zero values should be in vector
pub fn random_sparse_vector(rng: &mut impl Rng, max_size: usize, sparsity: f64) -> Vec<(u32, f32)> {
let size = rng.random_range(1..max_size);
// (index, value)
let mut pairs = Vec::with_capacity(size);
for i in 1..=size {
// probability of skipping a dimension to make the vectors sparse
let skip = !rng.random_bool(sparsity);
if skip {
continue;
}
// Only positive values are generated to make sure to hit the pruning path.
pairs.push((i as u32, rng.random_range(0.0..10.0) as f32));
/// Generate random sparse vector with random values.
///
/// - `vocab_size` - the range of possible indices (1..=vocab_size)
/// - `avg_dim` - the average number of non-zero values per vector
///
/// The actual number of non-zero values is randomized around `avg_dim` to
/// produce vectors of varying length, while keeping the average close to the
/// requested value.
pub fn random_sparse_vector(
rng: &mut impl Rng,
vocab_size: usize,
avg_dim: usize,
) -> Vec<(u32, f32)> {
if vocab_size == 0 || avg_dim == 0 {
return Vec::new();
}
pairs

// Randomize the number of non-zero values uniformly in [1, 2*avg_dim],
// which keeps the expected value at `avg_dim`. Clamp to the vocabulary size.
let max_dim = (2 * avg_dim).min(vocab_size);
let size = rng.random_range(1..=max_dim);

// Sample `size` distinct indices from 1..=vocab_size.
let mut indices: HashSet<u32> = HashSet::with_capacity(size);
while indices.len() < size {
indices.insert(rng.random_range(1..=vocab_size) as u32);
}

indices
.into_iter()
// Only positive values are generated to make sure to hit the pruning path.
.map(|idx| (idx, rng.random_range(0.0..10.0) as f32))
.collect()
}

pub fn random_dense_vector(rng: &mut impl Rng, dim: usize, is_uint: bool) -> Vec<f32> {
Expand Down Expand Up @@ -515,18 +532,46 @@ mod tests {
#[test]
fn test_random_sparse_vector_bounds() {
let mut rng = seeded_rng();
let pairs = random_sparse_vector(&mut rng, 100, 0.5);
let vocab_size = 1000;
let pairs = random_sparse_vector(&mut rng, vocab_size, 16);
for &(idx, val) in &pairs {
assert!(idx >= 1, "sparse index should be >= 1, got {idx}");
assert!(idx <= 100, "sparse index should be <= max_size, got {idx}");
assert!(
idx <= vocab_size as u32,
"sparse index should be <= vocab_size, got {idx}"
);
assert!(val >= 0.0, "sparse value should be non-negative, got {val}");
}
}

/// A generated sparse vector must never repeat an index.
#[test]
fn test_random_sparse_vector_distinct_indices() {
    let mut rng = seeded_rng();
    let pairs = random_sparse_vector(&mut rng, 1000, 16);

    // Collapsing the indices into a set drops duplicates, so the set is
    // exactly as large as the vector only when every index is unique.
    let unique_count = pairs
        .iter()
        .map(|&(idx, _)| idx)
        .collect::<HashSet<u32>>()
        .len();

    assert_eq!(unique_count, pairs.len(), "indices should be distinct");
}

/// Over many samples, the mean number of non-zero entries must land within
/// 20% of the requested `avg_dim`.
#[test]
fn test_random_sparse_vector_average_dim() {
    let mut rng = seeded_rng();
    let avg_dim = 32;
    let samples = 1000;

    // Accumulate the lengths of all generated vectors.
    let mut total: usize = 0;
    for _ in 0..samples {
        total += random_sparse_vector(&mut rng, 100_000, avg_dim).len();
    }

    let observed = total as f64 / samples as f64;
    // Expected value of uniform [1, 2*avg_dim] is roughly avg_dim.
    let deviation = (observed - avg_dim as f64).abs();
    let tolerance = avg_dim as f64 * 0.2;
    assert!(
        deviation < tolerance,
        "observed average dim {observed} should be close to {avg_dim}"
    );
}

#[test]
fn test_random_sparse_vector_empty_with_zero_avg_dim() {
let mut rng = seeded_rng();
let pairs = random_sparse_vector(&mut rng, 100, 0.0);
let pairs = random_sparse_vector(&mut rng, 100, 0);
assert!(pairs.is_empty());
}
}
8 changes: 4 additions & 4 deletions src/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ impl SearchProcessor {
&self,
rng: &mut impl Rng,
) -> Vec<(Vec<f32>, Option<SparseIndices>, Option<String>)> {
if let Some(sparsity) = self.args.sparse_vectors {
if self.args.use_sparse_vectors() {
let name = format!(
"{}_sparse",
random_vector_name(rng, self.args.sparse_vectors_per_point)
Expand All @@ -79,8 +79,8 @@ impl SearchProcessor {
.map(|_| {
let sparse_vector_tuples = random_sparse_vector(
rng,
self.args.sparse_dim.unwrap_or(self.args.dim),
sparsity,
self.args.sparse_vocab_size(),
self.args.sparse_avg_dim(),
);
let (indices, values): (Vec<_>, Vec<_>) =
sparse_vector_tuples.into_iter().unzip();
Expand Down Expand Up @@ -190,7 +190,7 @@ impl SearchProcessor {

let start = std::time::Instant::now();
let mut rng = rand::rng();
let has_sparse = self.args.sparse_vectors.is_some();
let has_sparse = self.args.use_sparse_vectors();
let has_dense = self.args.vectors_per_point > 0;

let use_sparse = match (has_sparse, has_dense) {
Expand Down
6 changes: 3 additions & 3 deletions src/upsert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,15 +131,15 @@ impl UpsertProcessor {
random_vector(&mut rng, &self.args).into()
};

let vectors: Vectors = if let Some(sparsity) = self.args.sparse_vectors {
let vectors: Vectors = if self.args.use_sparse_vectors() {
let mut vectors_map: HashMap<_, _> = Default::default();

for i in 0..self.args.sparse_vectors_per_point {
let vector_name = format!("{i}_sparse");
let vector = Vector::from(random_sparse_vector(
&mut rng,
self.args.sparse_dim.unwrap_or(self.args.dim),
sparsity,
self.args.sparse_vocab_size(),
self.args.sparse_avg_dim(),
));
vectors_map.insert(vector_name, vector);
}
Expand Down
Loading