From 97b36efd47292a203ab442e7b7fe096ea454aedf Mon Sep 17 00:00:00 2001 From: Magdalen Manohar Date: Thu, 14 May 2026 17:56:54 +0000 Subject: [PATCH 01/14] finish up recall computation patch --- diskann-benchmark-core/src/recall.rs | 51 ++++++++----------- .../src/backend/index/benchmarks.rs | 28 ++++++++-- diskann-benchmark/src/utils/datafiles.rs | 13 ++++- diskann-benchmark/src/utils/recall.rs | 6 --- 4 files changed, 57 insertions(+), 41 deletions(-) diff --git a/diskann-benchmark-core/src/recall.rs b/diskann-benchmark-core/src/recall.rs index 0fa4d42c1..cfca474eb 100644 --- a/diskann-benchmark-core/src/recall.rs +++ b/diskann-benchmark-core/src/recall.rs @@ -22,10 +22,6 @@ pub struct RecallMetrics { pub num_queries: usize, /// The average recall across all queries. pub average: f64, - /// The minimum observed recall (max possible value: `recall_n`). - pub minimum: usize, - /// The maximum observed recall (max possible value: `recall_k`). - pub maximum: usize, } #[derive(Debug, Error)] @@ -186,8 +182,8 @@ where } } - // The actual recall computation for fixed-size groundtruth - let mut recall_values: Vec = Vec::new(); + // The actual recall computation for groundtruth + let mut recall_values: Vec = Vec::new(); let mut this_groundtruth = HashSet::new(); let mut this_results = HashSet::new(); @@ -198,26 +194,22 @@ where } let gt_row = groundtruth.row(i); - if gt_row.len() < recall_k { - return Err(ComputeRecallError::NotEnoughGroundTruth( - gt_row.len(), - recall_k, - )); - } + // groundtruth does not have to be fixed-size, so we compute recall_k for this row based on its gt length + let this_recall_k = gt_row.len().min(recall_k); // Populate the groundtruth using the top-k this_groundtruth.clear(); - this_groundtruth.extend(gt_row.iter().take(recall_k).cloned()); + this_groundtruth.extend(gt_row.iter().take(this_recall_k).cloned()); // If we have distances, then continue to append distances as long as the distance // value is constant if let Some(distances) = groundtruth_distances - && recall_k > 0 + && this_recall_k > 0 { let distances_row = distances.row(i); - if distances_row.len() > recall_k - 1 && gt_row.len() > recall_k - 1 { - let last_distance = distances_row[recall_k - 1]; - for (d, g) in distances_row.iter().zip(gt_row.iter()).skip(recall_k) { + if distances_row.len() > this_recall_k - 1 && gt_row.len() > this_recall_k - 1 { + let last_distance = distances_row[this_recall_k - 1]; + for (d, g) in distances_row.iter().zip(gt_row.iter()).skip(this_recall_k) { if *d == last_distance { this_groundtruth.insert(g.clone()); } else { @@ -235,27 +227,28 @@ where .iter() .filter(|i| this_results.contains(i)) .count() - .min(recall_k); + .min(this_recall_k); - recall_values.push(r); - } + // recall is the number of correct results in the top n, divided by k (not n), or 0 if there are no groundtruth results for this query + let recall = if this_recall_k > 0 { + (r as f64) / (this_recall_k as f64) + } else { + 0.0 + }; - // Perform post-processing - let total: usize = recall_values.iter().sum(); - let minimum = recall_values.iter().min().unwrap_or(&0); - let maximum = recall_values.iter().max().unwrap_or(&0); + recall_values.push(recall); + } - // We explicitly check that each groundtruth row has at least `recall_k` elements. - let div = recall_k * nrows; - let average = (total as f64) / (div as f64); + // Compute the average recall + let total: f64 = recall_values.iter().sum(); + let div = recall_values.len(); + let average = (total) / (div as f64); Ok(RecallMetrics { recall_k, recall_n, num_queries: nrows, average, - minimum: *minimum, - maximum: *maximum, }) } diff --git a/diskann-benchmark/src/backend/index/benchmarks.rs b/diskann-benchmark/src/backend/index/benchmarks.rs index 57aafc8eb..6a0150489 100644 --- a/diskann-benchmark/src/backend/index/benchmarks.rs +++ b/diskann-benchmark/src/backend/index/benchmarks.rs @@ -471,7 +471,16 @@ where let queries: Arc> = Arc::new(datafiles::load_dataset(datafiles::BinFile(&topk.queries))?); - let groundtruth = datafiles::load_groundtruth(datafiles::BinFile(&topk.groundtruth))?; + // compute the maximum value of k used in any search + let max_k = topk + .runs + .iter() + .map(|run| run.recall_k) + .max() + .ok_or_else(|| anyhow::anyhow!("No runs provided in Topk phase"))?; + + let groundtruth = + datafiles::load_groundtruth(datafiles::BinFile(&topk.groundtruth), Some(max_k))?; let knn = benchmark_core::search::graph::KNN::new( index.clone(), @@ -695,10 +704,19 @@ where let managed = Managed::new(max_points, consolidate_threshold, managed_stream); - let layered = bigann::WithData::new(managed, data, queries, |path| { - Ok(Box::new(datafiles::load_groundtruth(datafiles::BinFile( - path, - ))?)) + // compute the maximum value of k used in any search + let max_k = topk + .runs + .iter() + .map(|run| run.recall_k) + .max() + .ok_or_else(|| anyhow::anyhow!("No runs provided in Topk phase"))?; + + let layered = bigann::WithData::new(managed, data, queries, move |path| { + Ok(Box::new(datafiles::load_groundtruth( + datafiles::BinFile(path), + Some(max_k), + )?)) }); Ok(layered) diff --git a/diskann-benchmark/src/utils/datafiles.rs b/diskann-benchmark/src/utils/datafiles.rs index 9c5057488..c6d43ccc2 100644 --- a/diskann-benchmark/src/utils/datafiles.rs +++ b/diskann-benchmark/src/utils/datafiles.rs @@ -95,7 +95,7 @@ impl ConvertingLoad for f32 { } /// Load a groundtruth set from disk and return the result as a row-major matrix. -pub(crate) fn load_groundtruth(path: BinFile<'_>) -> anyhow::Result> { +pub(crate) fn load_groundtruth(path: BinFile<'_>, k: Option) -> anyhow::Result> { let provider = diskann_providers::storage::FileStorageProvider; let mut file = provider .open_reader(&path.0.to_string_lossy()) @@ -114,6 +114,17 @@ pub(crate) fn load_groundtruth(path: BinFile<'_>) -> anyhow::Result> let mut groundtruth = Matrix::::new(0, num_points, dim); let groundtruth_slice: &mut [u8] = bytemuck::cast_slice_mut(groundtruth.as_mut_slice()); file.read_exact(groundtruth_slice)?; + + if let Some(expected_k) = k { + if groundtruth.ncols() != expected_k { + return Err(anyhow::anyhow!( + "Each row of groundtruth must have length {} (got {})", + expected_k, + groundtruth.ncols() + )); + } + } + Ok(groundtruth) } diff --git a/diskann-benchmark/src/utils/recall.rs b/diskann-benchmark/src/utils/recall.rs index dcbe86d94..b6eebc72b 100644 --- a/diskann-benchmark/src/utils/recall.rs +++ b/diskann-benchmark/src/utils/recall.rs @@ -18,10 +18,6 @@ pub(crate) struct RecallMetrics { pub(crate) num_queries: usize, /// The average recall across all queries. pub(crate) average: f64, - /// The minimum observed recall (max possible value: `recall_n`). - pub(crate) minimum: usize, - /// The maximum observed recall (max possible value: `recall_k`). - pub(crate) maximum: usize, } impl From<&benchmark_core::recall::RecallMetrics> for RecallMetrics { @@ -31,8 +27,6 @@ impl From<&benchmark_core::recall::RecallMetrics> for RecallMetrics { recall_n: m.recall_n, num_queries: m.num_queries, average: m.average, - minimum: m.minimum, - maximum: m.maximum, } } } From 43eb51742170c5dd0629289834b86c438ede43a1 Mon Sep 17 00:00:00 2001 From: Magdalen Manohar Date: Fri, 22 May 2026 15:11:36 +0000 Subject: [PATCH 02/14] fix conflict --- diskann-benchmark-core/src/recall.rs | 34 ------------------- .../src/backend/index/benchmarks.rs | 9 ----- diskann-benchmark/src/utils/datafiles.rs | 6 ---- 3 files changed, 49 deletions(-) diff --git a/diskann-benchmark-core/src/recall.rs b/diskann-benchmark-core/src/recall.rs index 400a475be..eb1b65868 100644 --- a/diskann-benchmark-core/src/recall.rs +++ b/diskann-benchmark-core/src/recall.rs @@ -205,10 +205,6 @@ where let result = results.row(i); let gt_row = groundtruth.row(i); -<<<<<<< HEAD - // groundtruth does not have to be fixed-size, so we compute recall_k for this row based on its gt length - let this_recall_k = gt_row.len().min(recall_k); -======= // `groundtruth` does not have to be fixed-size, // so we compute `recall_k` for this row based on its gt length let this_recall_k = gt_row.len().min(recall_k); @@ -216,7 +212,6 @@ where if this_recall_k == 0 { continue; } ->>>>>>> 4f70a82133bf43e6bece7572e611cb4dedf2c475 // Populate the groundtruth using the top-k this_groundtruth.clear(); @@ -224,20 +219,6 @@ where // If we have distances, then continue to append distances as long as the distance // value is constant -<<<<<<< HEAD - if let Some(distances) = groundtruth_distances - && this_recall_k > 0 - { - let distances_row = distances.row(i); - if distances_row.len() > this_recall_k - 1 && gt_row.len() > this_recall_k - 1 { - let last_distance = distances_row[this_recall_k - 1]; - for (d, g) in distances_row.iter().zip(gt_row.iter()).skip(this_recall_k) { - if *d == last_distance { - this_groundtruth.insert(g.clone()); - } else { - break; - } -======= if let Some(distances) = groundtruth_distances { let distances_row = distances.row(i); @@ -249,7 +230,6 @@ where this_groundtruth.insert(g.clone()); } else { break; ->>>>>>> 4f70a82133bf43e6bece7572e611cb4dedf2c475 } } } @@ -264,32 +244,18 @@ where .count() .min(this_recall_k); -<<<<<<< HEAD - // recall is the number of correct results in the top n, divided by k (not n), or 0 if there are no groundtruth results for this query - let recall = if this_recall_k > 0 { - (r as f64) / (this_recall_k as f64) - } else { - 0.0 - }; -======= let recall = (r as f64) / (this_recall_k as f64); ->>>>>>> 4f70a82133bf43e6bece7572e611cb4dedf2c475 recall_values.push(recall); } // Compute the average recall let total: f64 = recall_values.iter().sum(); -<<<<<<< HEAD - let div = recall_values.len(); - let average = (total) / (div as f64); -======= let average = if recall_values.is_empty() { 0.0 } else { total / (recall_values.len() as f64) }; ->>>>>>> 4f70a82133bf43e6bece7572e611cb4dedf2c475 Ok(RecallMetrics { recall_k, diff --git a/diskann-benchmark/src/backend/index/benchmarks.rs b/diskann-benchmark/src/backend/index/benchmarks.rs index 39804cb1b..a289b7571 100644 --- a/diskann-benchmark/src/backend/index/benchmarks.rs +++ b/diskann-benchmark/src/backend/index/benchmarks.rs @@ -465,16 +465,7 @@ where Arc::new(datafiles::load_dataset(datafiles::BinFile(&topk.queries))?); // compute the maximum value of k used in any search -<<<<<<< HEAD - let max_k = topk - .runs - .iter() - .map(|run| run.recall_k) - .max() - .ok_or_else(|| anyhow::anyhow!("No runs provided in Topk phase"))?; -======= let max_k = topk.max_k(); ->>>>>>> 4f70a82133bf43e6bece7572e611cb4dedf2c475 let groundtruth = datafiles::load_groundtruth(datafiles::BinFile(&topk.groundtruth), Some(max_k))?; diff --git a/diskann-benchmark/src/utils/datafiles.rs b/diskann-benchmark/src/utils/datafiles.rs index bf3cb3f54..abfe06a7d 100644 --- a/diskann-benchmark/src/utils/datafiles.rs +++ b/diskann-benchmark/src/utils/datafiles.rs @@ -116,15 +116,9 @@ pub(crate) fn load_groundtruth(path: BinFile<'_>, k: Option) -> anyhow::R file.read_exact(groundtruth_slice)?; if let Some(expected_k) = k { -<<<<<<< HEAD - if groundtruth.ncols() != expected_k { - return Err(anyhow::anyhow!( - "Each row of groundtruth must have length {} (got {})", -======= if groundtruth.ncols() < expected_k { return Err(anyhow::anyhow!( "Each row of groundtruth must have at least {} neighbors (got {})", ->>>>>>> 4f70a82133bf43e6bece7572e611cb4dedf2c475 expected_k, groundtruth.ncols() )); From 54ee01bbbe919de015cd6c1c48cb68a865e2f04e Mon Sep 17 00:00:00 2001 From: Magdalen Manohar Date: Tue, 2 Jun 2026 20:01:59 +0000 Subject: [PATCH 03/14] fix conflict --- diskann-benchmark/src/backend/index/benchmarks.rs | 9 --------- 1 file changed, 9 deletions(-) diff --git a/diskann-benchmark/src/backend/index/benchmarks.rs b/diskann-benchmark/src/backend/index/benchmarks.rs index a289b7571..e0684923a 100644 --- a/diskann-benchmark/src/backend/index/benchmarks.rs +++ b/diskann-benchmark/src/backend/index/benchmarks.rs @@ -691,16 +691,7 @@ where let managed = Managed::new(max_points, consolidate_threshold, managed_stream); // compute the maximum value of k used in any search -<<<<<<< HEAD - let max_k = topk - .runs - .iter() - .map(|run| run.recall_k) - .max() - .ok_or_else(|| anyhow::anyhow!("No runs provided in Topk phase"))?; -======= let max_k = topk.max_k(); ->>>>>>> 4f70a82133bf43e6bece7572e611cb4dedf2c475 let layered = bigann::WithData::new(managed, data, queries, move |path| { Ok(Box::new(datafiles::load_groundtruth( From c864722a27868d0c20d84b5344b926b98c22639d Mon Sep 17 00:00:00 2001 From: Magdalen Manohar Date: Tue, 30 Jun 2026 19:15:38 +0000 Subject: [PATCH 04/14] add range groundtruth calculator, rename vector_filters_file --- diskann-tools/src/bin/compute_groundtruth.rs | 14 +- diskann-tools/src/utils/ground_truth.rs | 268 ++++++++++++++----- diskann-tools/src/utils/search_disk_index.rs | 20 +- 3 files changed, 223 insertions(+), 79 deletions(-) diff --git a/diskann-tools/src/bin/compute_groundtruth.rs b/diskann-tools/src/bin/compute_groundtruth.rs index 75c876f39..6fc3a77c2 100644 --- a/diskann-tools/src/bin/compute_groundtruth.rs +++ b/diskann-tools/src/bin/compute_groundtruth.rs @@ -31,7 +31,7 @@ fn main() -> CMDResult<()> { &args.base_file, &args.query_file, &args.ground_truth_file, - args.vector_filters_file.as_deref(), + args.filter_bitmap_file.as_deref(), args.recall_at, insert_file, skip_base, @@ -47,7 +47,7 @@ fn main() -> CMDResult<()> { &args.base_file, &args.query_file, &args.ground_truth_file, - args.vector_filters_file.as_deref(), + args.filter_bitmap_file.as_deref(), args.recall_at, insert_file, skip_base, @@ -63,7 +63,7 @@ fn main() -> CMDResult<()> { &args.base_file, &args.query_file, &args.ground_truth_file, - args.vector_filters_file.as_deref(), + args.filter_bitmap_file.as_deref(), args.recall_at, insert_file, skip_base, @@ -79,7 +79,7 @@ fn main() -> CMDResult<()> { &args.base_file, &args.query_file, &args.ground_truth_file, - args.vector_filters_file.as_deref(), + args.filter_bitmap_file.as_deref(), args.recall_at, insert_file, skip_base, @@ -133,9 +133,9 @@ struct ComputeGroundTruthArgs { #[arg(long = "gt_file", short, required = true)] pub ground_truth_file: String, - /// Vector filters file in the range ground truth format - #[arg(long = "vector_filters_file", short, default_value = None)] - pub vector_filters_file: Option, + /// Filter bitmap file in the range ground truth format + #[arg(long = "filter_bitmap_file", short, default_value = None)] + pub filter_bitmap_file: Option, /// Number of ground truth nearest neigbhors to compute #[arg(long = "recall_at", short = 'K', default_value = "10")] diff --git a/diskann-tools/src/utils/ground_truth.rs b/diskann-tools/src/utils/ground_truth.rs index 98e16713a..aac1a7847 100644 --- a/diskann-tools/src/utils/ground_truth.rs +++ b/diskann-tools/src/utils/ground_truth.rs @@ -50,6 +50,71 @@ pub fn read_labels_and_compute_bitmap( } } +fn build_query_bitmaps( + storage_provider: &StorageProvider, + query_num: usize, + filter_bitmap_file: Option<&str>, + base_file_labels: Option<&str>, + query_file_labels: Option<&str>, +) -> CMDResult>> { + // both base_file_labels and query_file_labels are provided or both are not provided + if !((base_file_labels.is_some() && query_file_labels.is_some()) + || (base_file_labels.is_none() && query_file_labels.is_none())) + { + return Err(CMDToolError { + details: "Both base_file_labels and query_file_labels must be provided or both must be not provided.".to_string(), + }); + } + + if base_file_labels.is_some() && filter_bitmap_file.is_some() { + return Err(CMDToolError { + details: "Both base_file_labels and filter_bitmap_file cannot be provided." + .to_string(), + }); + } + + let mut query_bitmaps: Option> = None; + + if let (Some(base_file_labels), Some(query_file_labels)) = (base_file_labels, query_file_labels) + { + query_bitmaps = Some(read_labels_and_compute_bitmap( + base_file_labels, + query_file_labels, + )?); + } + + // Load the filter bitmaps + let filter_bitmaps = match filter_bitmap_file { + Some(filter_bitmap_file) => { + let filters = + search_index_utils::load_vector_filters(storage_provider, filter_bitmap_file)?; + + assert_eq!( + filters.len(), + query_num, + "Mismatch in query and filter bitmap sizes" + ); + + Some(filters) + } + None => None, + }; + + if let Some(filters) = filter_bitmaps { + let mut bitmaps = vec![BitSet::new(); query_num]; + for (idx_query, filter) in filters.iter().enumerate() { + for item in filter.iter() { + if let Ok(idx) = (*item).try_into() { + bitmaps[idx_query].insert(idx); + } + } + } + query_bitmaps = Some(bitmaps) + } + + Ok(query_bitmaps) +} + #[allow(clippy::too_many_arguments)] #[allow(clippy::panic)] /// Computes the true nearest neighbors for a set of queries and writes them to a file. @@ -72,7 +137,7 @@ pub fn compute_ground_truth_from_datafiles< base_file: &str, query_file: &str, ground_truth_file: &str, - vector_filters_file: Option<&str>, + filter_bitmap_file: Option<&str>, recall_at: u32, insert_file: Option<&str>, skip_base: Option, @@ -86,22 +151,6 @@ pub fn compute_ground_truth_from_datafiles< Data::AssociatedDataType, >::new(base_file, associated_data_file.clone(), storage_provider)?; - // both base_file_labels and query_file_labels are provided or both are not provided - if !((base_file_labels.is_some() && query_file_labels.is_some()) - || (base_file_labels.is_none() && query_file_labels.is_none())) - { - return Err(CMDToolError { - details: "Both base_file_labels and query_file_labels must be provided or both must be not provided.".to_string(), - }); - } - - if base_file_labels.is_some() && vector_filters_file.is_some() { - return Err(CMDToolError { - details: "Both base_file_labels and vector_filters_file cannot be provided." - .to_string(), - }); - } - let insert_iterator = match insert_file { Some(insert_file) => { let i = VectorDataIterator::< @@ -118,50 +167,15 @@ pub fn compute_ground_truth_from_datafiles< let query_data = read_bin::(&mut storage_provider.open_reader(query_file)?)?; let query_num = query_data.nrows(); - - let mut query_bitmaps: Option> = None; - if let (Some(base_file_labels), Some(query_file_labels)) = (base_file_labels, query_file_labels) - { - query_bitmaps = Some(read_labels_and_compute_bitmap( - base_file_labels, - query_file_labels, - )?); - } - - // Load the vector filters - let vector_filters = match vector_filters_file { - Some(vector_filters_file) => { - let filters = - search_index_utils::load_vector_filters(storage_provider, vector_filters_file)?; - - assert_eq!( - filters.len(), - query_num, - "Mismatch in query and vector filter sizes" - ); - - Some(filters) - } - None => None, - }; - - let has_vector_filters = vector_filters.is_some(); - let has_query_bitmaps = query_bitmaps.is_some(); - - if has_vector_filters { - // copy vector_filters to query_bitmaps one item at a time - if let Some(filters) = vector_filters { - let mut bitmaps = vec![BitSet::new(); query_num]; - for (idx_query, filter) in filters.iter().enumerate() { - for item in filter.iter() { - if let Ok(idx) = (*item).try_into() { - bitmaps[idx_query].insert(idx); - } - } - } - query_bitmaps = Some(bitmaps) - } - } + let has_filter_bitmap_file = filter_bitmap_file.is_some(); + let has_query_bitmaps = base_file_labels.is_some() && query_file_labels.is_some(); + let query_bitmaps = build_query_bitmaps( + storage_provider, + query_num, + filter_bitmap_file, + base_file_labels, + query_file_labels, + )?; let ground_truth_result = compute_ground_truth_from_data::( distance_function, @@ -180,7 +194,7 @@ pub fn compute_ground_truth_from_datafiles< assert_ne!(ground_truth.len(), 0, "No ground-truth results computed"); - if has_vector_filters || has_query_bitmaps { + if has_filter_bitmap_file || has_query_bitmaps { let ground_truth_collection = ground_truth .into_iter() .map(|npq| npq.into_iter().collect()) @@ -205,6 +219,136 @@ pub fn compute_ground_truth_from_datafiles< } } +#[allow(clippy::too_many_arguments)] +#[allow(clippy::panic)] +/// Computes range-search ground truth for a set of queries and writes it to a file. +/// +/// # Arguments +/// +/// * `distance_function` - e.g. L2 +/// * `base_file` - The file containing the base vectors. +/// * `query_file` - The file containing the query vectors. +/// * `ground_truth_file` - The file to write the range-search ground truth results to. +/// * `radius` - Similarity threshold for including neighbors in the result set. +/// * `filter_bitmap_file` - Optional filter bitmap file in range-groundtruth format. +/// * `base_file_labels` - Optional labels file for base vectors. +/// * `query_file_labels` - Optional labels file for query vectors. +pub fn compute_range_ground_truth_from_datafiles< + Data: GraphDataType, + StorageProvider: StorageReadProvider + StorageWriteProvider, +>( + storage_provider: &StorageProvider, + distance_function: Metric, + base_file: &str, + query_file: &str, + ground_truth_file: &str, + radius: f32, + filter_bitmap_file: Option<&str>, + base_file_labels: Option<&str>, + query_file_labels: Option<&str>, +) -> CMDResult<()> { + let dataset_iterator = + VectorDataIterator::::new( + base_file, + Option::None, + storage_provider, + )?; + + let query_data = + read_bin::(&mut storage_provider.open_reader(query_file)?)?; + let query_num = query_data.nrows(); + + let query_bitmaps = build_query_bitmaps( + storage_provider, + query_num, + filter_bitmap_file, + base_file_labels, + query_file_labels, + )?; + + let ground_truth = compute_range_ground_truth_from_data::( + distance_function, + dataset_iterator, + &query_data, + radius, + query_bitmaps, + )?; + + assert_ne!(ground_truth.len(), 0, "No ground-truth results computed"); + + write_range_search_ground_truth(storage_provider, ground_truth_file, query_num, ground_truth) +} + +#[allow(clippy::too_many_arguments)] +pub fn compute_range_ground_truth_from_data( + distance_function: Metric, + dataset_iter: VectorDataIterator, + queries: &Matrix, + radius: f32, + query_bitmaps: Option>, +) -> CMDResult>>> +where + Data: GraphDataType, + VectorReader: StorageReadProvider, +{ + let query_num = queries.nrows(); + let query_dim = queries.ncols(); + + let mut ground_truth: Vec>> = vec![Vec::new(); query_num]; + let mut queries_and_result: Vec<_> = queries.row_iter().zip(ground_truth.iter_mut()).collect(); + + let distance_comparer = Data::VectorDataType::distance(distance_function, Some(query_dim)); + + let batch_size = 10_000; + let mut data_batch: Vec> = Vec::with_capacity(batch_size); + + let pool = create_thread_pool(0)?; + + let mut num_base_points: usize = 0; + + for chunk in dataset_iter.chunks(batch_size).into_iter() { + data_batch.clear(); + for (data_vector, _associated_data) in chunk { + data_batch.push(data_vector); + } + let points = data_batch.len(); + + if points == 0 { + continue; + } + + queries_and_result + .par_iter_mut() + .enumerate() + .for_each_in_pool(pool.as_ref(), |(idx_query, (query, query_results))| { + for (idx_in_batch, data) in data_batch.iter().enumerate() { + let idx = (num_base_points + idx_in_batch) as u32; + + let allowed_by_bitmap = if let Some(ref bitmaps) = query_bitmaps { + if let Ok(idx_usize) = idx.try_into() { + bitmaps[idx_query].contains(idx_usize) + } else { + false + } + } else { + true + }; + + if allowed_by_bitmap { + let distance = distance_comparer.evaluate_similarity(data, query); + if distance <= radius { + query_results.push(Neighbor { id: idx, distance }); + } + } + } + }); + + num_base_points += points; + } + + Ok(ground_truth) +} + #[derive(Debug, Clone)] pub enum MultivecAggregationMethod { AveragePairwise, diff --git a/diskann-tools/src/utils/search_disk_index.rs b/diskann-tools/src/utils/search_disk_index.rs index 3825d2631..faf2a48e2 100644 --- a/diskann-tools/src/utils/search_disk_index.rs +++ b/diskann-tools/src/utils/search_disk_index.rs @@ -46,7 +46,7 @@ pub struct SearchDiskIndexParameters<'a> { pub result_output_prefix: &'a str, pub query_file: &'a str, pub truthset_file: &'a str, - pub vector_filters_file: Option<&'a str>, + pub filter_bitmap_file: Option<&'a str>, pub num_threads: usize, pub recall_at: u32, pub beam_width: u32, @@ -79,18 +79,18 @@ where &mut storage_provider.open_reader(parameters.query_file)?, )?; let query_num = queries.nrows(); - // Load the vector filters - let vector_filters = match parameters.vector_filters_file { - Some(vector_filters_file) => { - search_index_utils::load_vector_filters(storage_provider, vector_filters_file)? + // Load the filter bitmaps + let filter_bitmaps = match parameters.filter_bitmap_file { + Some(filter_bitmap_file) => { + search_index_utils::load_vector_filters(storage_provider, filter_bitmap_file)? } None => vec![HashSet::::new(); query_num], }; assert_eq!( - vector_filters.len(), + filter_bitmaps.len(), query_num, - "Mismatch in query and vector filter sizes" + "Mismatch in query and filter bitmap sizes" ); let mut gt_dim: usize = 0; @@ -102,7 +102,7 @@ where // Check for ground truth let mut calc_recall_flag = false; if !parameters.truthset_file.is_empty() && storage_provider.exists(parameters.truthset_file) { - if parameters.vector_filters_file.is_none() { + if parameters.filter_bitmap_file.is_none() { let ret = search_index_utils::load_truthset(storage_provider, parameters.truthset_file)?; gt_ids = Some(ret.index_nodes); @@ -225,7 +225,7 @@ where let zipped = cmp_stats .par_iter_mut() .zip(queries.par_row_iter()) - .zip(vector_filters.par_iter()) + .zip(filter_bitmaps.par_iter()) .zip(query_result_ids[test_id].par_chunks_mut(parameters.recall_at as usize)) .zip(query_result_dists[test_id].par_chunks_mut(parameters.recall_at as usize)) .zip(statistics.par_iter_mut()) @@ -248,7 +248,7 @@ where result_count, )| { let vector_filter_function: Box bool + Send + Sync> = - if parameters.vector_filters_file.is_none() { + if parameters.filter_bitmap_file.is_none() { Box::new(|_: &u32| true) } else { Box::new(move |vector_id: &u32| vector_filter.contains(vector_id)) From bd05e0f941955a09d672dbd833ddb5a277a3967a Mon Sep 17 00:00:00 2001 From: Magdalen Manohar Date: Tue, 30 Jun 2026 19:17:25 +0000 Subject: [PATCH 05/14] fmt, clippy, add missing file --- .../src/bin/compute_range_groundtruth.rs | 130 ++++++++++++++++++ diskann-tools/src/utils/ground_truth.rs | 16 +-- 2 files changed, 137 insertions(+), 9 deletions(-) create mode 100644 diskann-tools/src/bin/compute_range_groundtruth.rs diff --git a/diskann-tools/src/bin/compute_range_groundtruth.rs b/diskann-tools/src/bin/compute_range_groundtruth.rs new file mode 100644 index 000000000..057c49035 --- /dev/null +++ b/diskann-tools/src/bin/compute_range_groundtruth.rs @@ -0,0 +1,130 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ +use clap::Parser; +use diskann_providers::{storage::FileStorageProvider, utils::Timer}; +use diskann_tools::utils::{ + compute_range_ground_truth_from_datafiles, init_subscriber, CMDResult, DataType, + GraphDataF32Vector, GraphDataHalfVector, GraphDataInt8Vector, GraphDataU8Vector, +}; +use diskann_vector::distance::Metric; + +fn main() -> CMDResult<()> { + init_subscriber(); + let timer = Timer::new(); + + let args = ComputeRangeGroundTruthArgs::parse(); + + tracing::info!("Computing range-search ground truth file"); + + let storage_provider = FileStorageProvider; + + let err = match args.data_type { + DataType::Float => { + compute_range_ground_truth_from_datafiles::( + &storage_provider, + args.distance_function, + &args.base_file, + &args.query_file, + &args.ground_truth_file, + args.radius, + args.filter_bitmap_file.as_deref(), + args.base_file_labels.as_deref(), + args.query_file_labels.as_deref(), + ) + } + DataType::Fp16 => { + compute_range_ground_truth_from_datafiles::( + &storage_provider, + args.distance_function, + &args.base_file, + &args.query_file, + &args.ground_truth_file, + args.radius, + args.filter_bitmap_file.as_deref(), + args.base_file_labels.as_deref(), + args.query_file_labels.as_deref(), + ) + } + DataType::Uint8 => { + compute_range_ground_truth_from_datafiles::( + &storage_provider, + args.distance_function, + &args.base_file, + &args.query_file, + &args.ground_truth_file, + args.radius, + args.filter_bitmap_file.as_deref(), + args.base_file_labels.as_deref(), + args.query_file_labels.as_deref(), + ) + } + DataType::Int8 => { + compute_range_ground_truth_from_datafiles::( + &storage_provider, + args.distance_function, + &args.base_file, + &args.query_file, + &args.ground_truth_file, + args.radius, + args.filter_bitmap_file.as_deref(), + args.base_file_labels.as_deref(), + args.query_file_labels.as_deref(), + ) + } + }; + + match err { + Ok(_) => { + tracing::info!( + "Compute range ground-truth completed successfully in {:?}", + timer.elapsed() + ); + Ok(()) + } + Err(err) => { + tracing::error!("Error: {:?}", err); + Err(err) + } + } +} + +#[derive(Debug, Parser)] +struct ComputeRangeGroundTruthArgs { + /// data type + #[arg(long = "data_type", default_value = "float")] + pub data_type: DataType, + + /// Distance function to use. + #[arg(long = "dist_fn", default_value = "l2")] + pub distance_function: Metric, + + /// File containing the base vectors in binary format + #[arg(long = "base_file", short, required = true)] + pub base_file: String, + + /// Optional labels file for base vectors + #[arg(long = "base_file_labels", default_value = None)] + pub base_file_labels: Option, + + /// File containing the query vectors in binary format + #[arg(long = "query_file", short, required = true)] + pub query_file: String, + + /// Optional labels file for query vectors + #[arg(long = "query_file_labels", default_value = None)] + pub query_file_labels: Option, + + /// Path of the file to write range ground truth to in binary format + #[arg(long = "gt_file", short, required = true)] + pub ground_truth_file: String, + + /// Filter bitmap file in range ground truth format + #[arg(long = "filter_bitmap_file", short, default_value = None)] + pub filter_bitmap_file: Option, + + /// Radius threshold used to include neighbors in range-groundtruth + #[arg(long = "radius", required = true)] + pub radius: f32, +} diff --git a/diskann-tools/src/utils/ground_truth.rs b/diskann-tools/src/utils/ground_truth.rs index aac1a7847..eb1ec6531 100644 --- a/diskann-tools/src/utils/ground_truth.rs +++ b/diskann-tools/src/utils/ground_truth.rs @@ -68,8 +68,7 @@ fn build_query_bitmaps, query_file_labels: Option<&str>, ) -> CMDResult<()> { - let dataset_iterator = - VectorDataIterator::::new( - base_file, - Option::None, - storage_provider, - )?; + let dataset_iterator = VectorDataIterator::< + StorageProvider, + Data::VectorDataType, + Data::AssociatedDataType, + >::new(base_file, Option::None, storage_provider)?; let query_data = read_bin::(&mut storage_provider.open_reader(query_file)?)?; let query_num = query_data.nrows(); - + let query_bitmaps = build_query_bitmaps( storage_provider, query_num, From 1477ac41cc8f30a9ffae415595501eb5c72c51b2 Mon Sep 17 00:00:00 2001 From: Magdalen Dobson Manohar <58752279+magdalendobson@users.noreply.github.com> Date: Tue, 30 Jun 2026 16:02:25 -0400 Subject: [PATCH 06/14] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- diskann-tools/src/utils/ground_truth.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/diskann-tools/src/utils/ground_truth.rs b/diskann-tools/src/utils/ground_truth.rs index eb1ec6531..eb0c9f3fb 100644 --- a/diskann-tools/src/utils/ground_truth.rs +++ b/diskann-tools/src/utils/ground_truth.rs @@ -88,11 +88,15 @@ fn build_query_bitmaps Date: Tue, 30 Jun 2026 16:02:52 -0400 Subject: [PATCH 07/14] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- diskann-tools/src/utils/ground_truth.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diskann-tools/src/utils/ground_truth.rs b/diskann-tools/src/utils/ground_truth.rs index eb0c9f3fb..2bc0f4031 100644 --- a/diskann-tools/src/utils/ground_truth.rs +++ b/diskann-tools/src/utils/ground_truth.rs @@ -232,7 +232,7 @@ pub fn compute_ground_truth_from_datafiles< /// * `base_file` - The file containing the base vectors. /// * `query_file` - The file containing the query vectors. /// * `ground_truth_file` - The file to write the range-search ground truth results to. -/// * `radius` - Similarity threshold for including neighbors in the result set. +/// * `radius` - Distance threshold in DiskANN score space (smaller is better; cosine uses `1 - cos`, inner product uses `-dot`). /// * `filter_bitmap_file` - Optional filter bitmap file in range-groundtruth format. /// * `base_file_labels` - Optional labels file for base vectors. /// * `query_file_labels` - Optional labels file for query vectors. From 2044ba1ade0185ebd98a7ab9ab190a9ff4b6a606 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 30 Jun 2026 20:05:15 +0000 Subject: [PATCH 08/14] fix: return error for filter bitmap mismatch --- diskann-tools/src/utils/search_disk_index.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/diskann-tools/src/utils/search_disk_index.rs b/diskann-tools/src/utils/search_disk_index.rs index faf2a48e2..c38544d07 100644 --- a/diskann-tools/src/utils/search_disk_index.rs +++ b/diskann-tools/src/utils/search_disk_index.rs @@ -87,11 +87,15 @@ where None => vec![HashSet::::new(); query_num], }; - assert_eq!( - filter_bitmaps.len(), - query_num, - "Mismatch in query and filter bitmap sizes" - ); + if filter_bitmaps.len() != query_num { + return Err(CMDToolError { + details: format!( + "Mismatch in query and filter bitmap sizes: {} queries, {} filter bitmaps", + query_num, + filter_bitmaps.len() + ), + }); + } let mut gt_dim: usize = 0; let mut gt_ids: Option> = None; From df844be77a577cbf2945804a8a1f62bff5d1bcf7 Mon Sep 17 00:00:00 2001 From: Magdalen Manohar Date: Tue, 30 Jun 2026 20:46:18 +0000 Subject: [PATCH 09/14] standardize naming to kebab case --- diskann-tools/src/bin/compute_groundtruth.rs | 20 ++++++++--------- .../src/bin/compute_multivec_groundtruth.rs | 16 +++++++------- .../src/bin/compute_range_groundtruth.rs | 16 +++++++------- .../src/bin/compute_specificities.rs | 6 ++--- .../src/bin/gen_associated_data_from_range.rs | 2 +- diskann-tools/src/bin/generate_pq.rs | 14 ++++++------ .../src/bin/generate_synthetic_labels.rs | 2 +- .../src/bin/random_data_generator.rs | 4 ++-- .../bin/range_search_disk_index.rs.disabled | 22 +++++++++---------- diskann-tools/src/bin/relative_contrast.rs | 12 +++++----- 10 files changed, 57 insertions(+), 57 deletions(-) diff --git a/diskann-tools/src/bin/compute_groundtruth.rs b/diskann-tools/src/bin/compute_groundtruth.rs index 6fc3a77c2..a32ad7919 100644 --- a/diskann-tools/src/bin/compute_groundtruth.rs +++ b/diskann-tools/src/bin/compute_groundtruth.rs @@ -108,40 +108,40 @@ fn main() -> CMDResult<()> { #[derive(Debug, Parser)] struct ComputeGroundTruthArgs { /// data type (required) - #[arg(long = "data_type", default_value = "float")] + #[arg(long = "data-type", default_value = "float")] pub data_type: DataType, /// Distance function to use. - #[arg(long = "dist_fn", default_value = "l2")] + #[arg(long = "dist-fn", default_value = "l2")] pub distance_function: Metric, /// File containing the base vectors in binary format - #[arg(long = "base_file", short, required = true)] + #[arg(long = "base-file", short, required = true)] pub base_file: String, - #[arg(long = "base_file_labels", default_value = None)] + #[arg(long = "base-file-labels", default_value = None)] pub base_file_labels: Option, /// File containing the query vectors in binary format - #[arg(long = "query_file", short, required = true)] + #[arg(long = "query-file", short, required = true)] pub query_file: String, - #[arg(long = "query_file_labels", default_value = None)] + #[arg(long = "query-file-labels", default_value = None)] pub query_file_labels: Option, /// Path of the file to write the ground truth to in binary format. Please don't append .bin at the end if no filter_label or filter_label_file is provided. It will save the file with '.bin' at the end. Otherwise it will save the file as filename_label.bin. - #[arg(long = "gt_file", short, required = true)] + #[arg(long = "gt-file", short, required = true)] pub ground_truth_file: String, /// Filter bitmap file in the range ground truth format - #[arg(long = "filter_bitmap_file", short, default_value = None)] + #[arg(long = "filter-bitmap-file", short, default_value = None)] pub filter_bitmap_file: Option, /// Number of ground truth nearest neigbhors to compute - #[arg(long = "recall_at", short = 'K', default_value = "10")] + #[arg(long = "recall-at", short = 'K', default_value = "10")] pub recall_at: u32, /// File containing the associated data in binary format - #[arg(long = "associated_data_file", required = false, default_value = None)] + #[arg(long = "associated-data-file", required = false, default_value = None)] pub associated_data_file: Option, } diff --git a/diskann-tools/src/bin/compute_multivec_groundtruth.rs b/diskann-tools/src/bin/compute_multivec_groundtruth.rs index 452187375..613b01ac5 100644 --- a/diskann-tools/src/bin/compute_multivec_groundtruth.rs +++ b/diskann-tools/src/bin/compute_multivec_groundtruth.rs @@ -94,11 +94,11 @@ fn main() -> CMDResult<()> { #[derive(Debug, Parser)] struct ComputeMultivecGroundTruthArgs { /// data type (required) - #[arg(long = "data_type", default_value = "float")] + #[arg(long = "data-type", default_value = "float")] pub data_type: DataType, /// Distance function to use. - #[arg(long = "dist_fn", default_value = "l2")] + #[arg(long = "dist-fn", default_value = "l2")] pub distance_function: Metric, /// Whether to use average or min aggregation @@ -106,24 +106,24 @@ struct ComputeMultivecGroundTruthArgs { pub aggregation: MultivecAggregationMethod, /// File containing the base vectors in binary format - #[arg(long = "base_file", short, required = true)] + #[arg(long = "base-file", short, required = true)] pub base_file: String, - #[arg(long = "base_file_labels", default_value = None)] + #[arg(long = "base-file-labels", default_value = None)] pub base_file_labels: Option, /// File containing the query vectors in binary format - #[arg(long = "query_file", short, required = true)] + #[arg(long = "query-file", short, required = true)] pub query_file: String, - #[arg(long = "query_file_labels", default_value = None)] + #[arg(long = "query-file-labels", default_value = None)] pub query_file_labels: Option, /// Path of the file to write the ground truth to in binary format. Please don't append .bin at the end if no filter_label or filter_label_file is provided. It will save the file with '.bin' at the end. Otherwise it will save the file as filename_label.bin. - #[arg(long = "gt_file", short, required = true)] + #[arg(long = "gt-file", short, required = true)] pub ground_truth_file: String, /// Number of ground truth nearest neighbors to compute - #[arg(long = "recall_at", short = 'K', default_value = "10")] + #[arg(long = "recall-at", short = 'K', default_value = "10")] pub recall_at: u32, } diff --git a/diskann-tools/src/bin/compute_range_groundtruth.rs b/diskann-tools/src/bin/compute_range_groundtruth.rs index 057c49035..65e532830 100644 --- a/diskann-tools/src/bin/compute_range_groundtruth.rs +++ b/diskann-tools/src/bin/compute_range_groundtruth.rs @@ -93,35 +93,35 @@ fn main() -> CMDResult<()> { #[derive(Debug, Parser)] struct ComputeRangeGroundTruthArgs { /// data type - #[arg(long = "data_type", default_value = "float")] + #[arg(long = "data-type", default_value = "float")] pub data_type: DataType, /// Distance function to use. - #[arg(long = "dist_fn", default_value = "l2")] + #[arg(long = "dist-fn", default_value = "l2")] pub distance_function: Metric, /// File containing the base vectors in binary format - #[arg(long = "base_file", short, required = true)] + #[arg(long = "base-file", short, required = true)] pub base_file: String, /// Optional labels file for base vectors - #[arg(long = "base_file_labels", default_value = None)] + #[arg(long = "base-file-labels", default_value = None)] pub base_file_labels: Option, /// File containing the query vectors in binary format - #[arg(long = "query_file", short, required = true)] + #[arg(long = "query-file", short, required = true)] pub query_file: String, /// Optional labels file for query vectors - #[arg(long = "query_file_labels", default_value = None)] + #[arg(long = "query-file-labels", default_value = None)] pub query_file_labels: Option, /// Path of the file to write range ground truth to in binary format - #[arg(long = "gt_file", short, required = true)] + #[arg(long = "gt-file", short, required = true)] pub ground_truth_file: String, /// Filter bitmap file in range ground truth format - #[arg(long = "filter_bitmap_file", short, default_value = None)] + #[arg(long = "filter-bitmap-file", short, default_value = None)] pub filter_bitmap_file: Option, /// Radius threshold used to include neighbors in range-groundtruth diff --git a/diskann-tools/src/bin/compute_specificities.rs b/diskann-tools/src/bin/compute_specificities.rs index 3cb2bf174..d88e68679 100644 --- a/diskann-tools/src/bin/compute_specificities.rs +++ b/diskann-tools/src/bin/compute_specificities.rs @@ -18,15 +18,15 @@ use std::process; )] struct Args { /// File containing the base labels - #[arg(long = "base_label_file", short = 'b')] + #[arg(long = "base-label-file", short = 'b')] pub base_label_file: String, /// File containing the query labels - #[arg(long = "query_label_file", short = 'q')] + #[arg(long = "query-label-file", short = 'q')] pub query_label_file: String, /// Output file for specificities (optional) - #[arg(long = "specificity_output_file", short = 'o')] + #[arg(long = "specificity-output-file", short = 'o')] pub specificity_output_file: Option, } diff --git a/diskann-tools/src/bin/gen_associated_data_from_range.rs b/diskann-tools/src/bin/gen_associated_data_from_range.rs index 708dc5004..7c27838d2 100644 --- a/diskann-tools/src/bin/gen_associated_data_from_range.rs +++ b/diskann-tools/src/bin/gen_associated_data_from_range.rs @@ -20,7 +20,7 @@ fn main() -> CMDResult<()> { #[derive(Debug, Parser)] struct GenAssociatedDataFromRangeArgs { - #[arg(long = "associated_data_path")] + #[arg(long = "associated-data-path")] pub associated_data_path: String, #[arg(long = "start")] diff --git a/diskann-tools/src/bin/generate_pq.rs b/diskann-tools/src/bin/generate_pq.rs index 740ab9aa1..2e729259c 100644 --- a/diskann-tools/src/bin/generate_pq.rs +++ b/diskann-tools/src/bin/generate_pq.rs @@ -52,30 +52,30 @@ fn main() -> Result<(), CMDToolError> { #[derive(Debug, Parser)] struct BuildPQArgs { /// data type (required) - #[arg(long = "data_type", default_value = "float")] + #[arg(long = "data-type", default_value = "float")] pub data_type: DataType, /// Distance function to use. - #[arg(long = "dist_fn", default_value = "l2")] + #[arg(long = "dist-fn", default_value = "l2")] pub dist_fn: Metric, /// Path to the data file. The file should be in the format specified by the `data_type` argument. - #[arg(long = "data_path", short, required = true)] + #[arg(long = "data-path", short, required = true)] pub data_path: String, /// Path to the index file. The index will be saved to this prefixed name. - #[arg(long = "index_path_prefix", short, required = true)] + #[arg(long = "index-path-prefix", short, required = true)] pub index_path_prefix: String, /// Number of threads to use. - #[arg(long = "num_threads", short = 'T')] + #[arg(long = "num-threads", short = 'T')] pub num_threads: Option, /// Ratio of PQ training set size to data size - #[arg(long = "p_val", short = 'p', default_value = "0.1")] + #[arg(long = "p-val", short = 'p', default_value = "0.1")] pub p_val: f64, /// Number of PQ bytee - #[arg(long = "pq_bytes", short, default_value = "10")] + #[arg(long = "pq-bytes", short, default_value = "10")] pub pq_bytes: usize, } diff --git a/diskann-tools/src/bin/generate_synthetic_labels.rs b/diskann-tools/src/bin/generate_synthetic_labels.rs index 1c3479819..d615cc2dd 100644 --- a/diskann-tools/src/bin/generate_synthetic_labels.rs +++ b/diskann-tools/src/bin/generate_synthetic_labels.rs @@ -11,7 +11,7 @@ use tracing::{error, info}; #[derive(Debug, Parser)] struct GenerateSyntheticLabelsArgs { /// Filename for saving the label file - #[arg(long = "output_file", required = true)] + #[arg(long = "output-file", required = true)] pub output_file: String, /// Number of vectors diff --git a/diskann-tools/src/bin/random_data_generator.rs b/diskann-tools/src/bin/random_data_generator.rs index 3fc258dc9..f6e708aed 100644 --- a/diskann-tools/src/bin/random_data_generator.rs +++ b/diskann-tools/src/bin/random_data_generator.rs @@ -10,11 +10,11 @@ use diskann_tools::utils::{write_random_data, CMDResult, CMDToolError, DataType} #[derive(Debug, Parser)] struct RandomDataGeneratorArgs { /// data type (required) - #[arg(long = "data_type", required = true)] + #[arg(long = "data-type", required = true)] pub data_type: DataType, /// File name for saving the random vectors - #[arg(long = "output_file", required = true)] + #[arg(long = "output-file", required = true)] pub output_file: String, /// Dimensionality of the vector diff --git a/diskann-tools/src/bin/range_search_disk_index.rs.disabled b/diskann-tools/src/bin/range_search_disk_index.rs.disabled index df05ac220..b83751c2a 100644 --- a/diskann-tools/src/bin/range_search_disk_index.rs.disabled +++ b/diskann-tools/src/bin/range_search_disk_index.rs.disabled @@ -82,46 +82,46 @@ fn main() -> Result<(), CMDToolError> { #[derive(Debug, Parser)] struct SearchDiskIndexArgs { /// data type (required) - #[arg(long = "data_type", required = true)] + #[arg(long = "data-type", required = true)] pub data_type: DataType, /// Distance function to use (l2, cosine) - #[arg(long = "dist_fn", required = true)] + #[arg(long = "dist-fn", required = true)] pub dist_fn: Metric, /// Path to the index file - #[arg(long = "index_path_prefix", required = true)] + #[arg(long = "index-path-prefix", required = true)] pub index_path_prefix: String, /// Query file in binary format - #[arg(long = "query_file", short, required = true)] + #[arg(long = "query-file", short, required = true)] pub query_file: String, /// Ground truth file for the queryset - #[arg(long = "gt_file", default_value = "")] + #[arg(long = "gt-file", default_value = "")] pub gt_file: String, /// Number of neighbors to be returned - #[arg(long = "range_threshold", short = 'K', default_value = "10")] + #[arg(long = "range-threshold", short = 'K', default_value = "10")] pub range_threshold: f32, /// List of L values of search - #[arg(long = "search_list", short = 'L', required = true, num_args=1..)] + #[arg(long = "search-list", short = 'L', required = true, num_args=1..)] pub search_list: Vec, /// Beam width for beam search - #[arg(long = "beam_width", default_value = "2")] + #[arg(long = "beam-width", default_value = "2")] pub beam_width: u32, /// IO limit for each beam search, the default value is u32::MAX - #[arg(long = "search_io_limit", default_value = "4294967295")] + #[arg(long = "search-io-limit", default_value = "4294967295")] pub search_io_limit: u32, /// Number of threads used for querying the index - #[arg(long = "num_threads", short = 'T')] + #[arg(long = "num-threads", short = 'T')] pub num_threads: Option, /// Number of BFS nodes around medoid(s) to cache during query warm up - #[arg(long = "num_nodes_to_cache", default_value = "0")] + #[arg(long = "num-nodes-to-cache", default_value = "0")] pub num_nodes_to_cache: usize, } diff --git a/diskann-tools/src/bin/relative_contrast.rs b/diskann-tools/src/bin/relative_contrast.rs index 14dce99cc..c3216511e 100644 --- a/diskann-tools/src/bin/relative_contrast.rs +++ b/diskann-tools/src/bin/relative_contrast.rs @@ -93,26 +93,26 @@ fn main() -> CMDResult<()> { #[derive(Debug, Parser)] struct RelativeContrastArgs { /// Data type - #[arg(long = "data_type", default_value = "fp16")] + #[arg(long = "data-type", default_value = "fp16")] pub data_type: DataType, /// Vector data file path - #[arg(long = "data_file", short, required = true)] + #[arg(long = "data-file", short, required = true)] pub data_file: String, /// Query file in binary format - #[arg(long = "query_file", short, required = true)] + #[arg(long = "query-file", short, required = true)] pub query_file: String, /// Ground truth file for the queryset - #[arg(long = "gt_file", required = true)] + #[arg(long = "gt-file", required = true)] pub gt_file: String, /// Number of neighbors to use from ground truth - #[arg(long = "recall_at", short = 'K', default_value = "10")] + #[arg(long = "recall-at", short = 'K', default_value = "10")] pub recall_at: usize, /// Number of random distances to average per query - #[arg(long = "search_list", short = 'L', default_value = "10")] + #[arg(long = "search-list", short = 'L', default_value = "10")] pub search_list: usize, } From 6827f9d5dbd1b5105d265abd379a8601eea212d3 Mon Sep 17 00:00:00 2001 From: Magdalen Manohar Date: Tue, 30 Jun 2026 23:29:17 +0000 Subject: [PATCH 10/14] standardize variable names, one more kebab-case instance --- diskann-tools/src/bin/compute_specificities.rs | 4 ++-- diskann-tools/src/bin/gen_associated_data_from_range.rs | 2 +- diskann-tools/src/bin/generate_minmax.rs | 4 ++-- diskann-tools/src/bin/generate_pq.rs | 2 +- diskann-tools/src/bin/subsample_bin.rs | 6 +++++- 5 files changed, 11 insertions(+), 7 deletions(-) diff --git a/diskann-tools/src/bin/compute_specificities.rs b/diskann-tools/src/bin/compute_specificities.rs index d88e68679..b01d3389d 100644 --- a/diskann-tools/src/bin/compute_specificities.rs +++ b/diskann-tools/src/bin/compute_specificities.rs @@ -18,11 +18,11 @@ use std::process; )] struct Args { /// File containing the base labels - #[arg(long = "base-label-file", short = 'b')] + #[arg(long = "base-file-labels", short = 'b')] pub base_label_file: String, /// File containing the query labels - #[arg(long = "query-label-file", short = 'q')] + #[arg(long = "query-file-labels", short = 'q')] pub query_label_file: String, /// Output file for specificities (optional) diff --git a/diskann-tools/src/bin/gen_associated_data_from_range.rs b/diskann-tools/src/bin/gen_associated_data_from_range.rs index 7c27838d2..1661406c6 100644 --- a/diskann-tools/src/bin/gen_associated_data_from_range.rs +++ b/diskann-tools/src/bin/gen_associated_data_from_range.rs @@ -20,7 +20,7 @@ fn main() -> CMDResult<()> { #[derive(Debug, Parser)] struct GenAssociatedDataFromRangeArgs { - #[arg(long = "associated-data-path")] + #[arg(long = "associated-data-file")] pub associated_data_path: String, #[arg(long = "start")] diff --git a/diskann-tools/src/bin/generate_minmax.rs b/diskann-tools/src/bin/generate_minmax.rs index 7ced9e2ef..43739a743 100644 --- a/diskann-tools/src/bin/generate_minmax.rs +++ b/diskann-tools/src/bin/generate_minmax.rs @@ -27,11 +27,11 @@ use rand::{rngs::StdRng, SeedableRng}; #[command(author, version, about, long_about = None)] struct Args { /// Input binary file path containing vector data - #[arg(short, long)] + #[arg(short, long = "input-file")] input: String, /// Output binary file path for quantized vectors - #[arg(short, long)] + #[arg(short, long = "output-file")] output: String, /// Number of bits for quantization (1, 2, 4, or 8) diff --git a/diskann-tools/src/bin/generate_pq.rs b/diskann-tools/src/bin/generate_pq.rs index 2e729259c..ce641ac05 100644 --- a/diskann-tools/src/bin/generate_pq.rs +++ b/diskann-tools/src/bin/generate_pq.rs @@ -60,7 +60,7 @@ struct BuildPQArgs { pub dist_fn: Metric, /// Path to the data file. The file should be in the format specified by the `data_type` argument. - #[arg(long = "data-path", short, required = true)] + #[arg(long = "data-file", short, required = true)] pub data_path: String, /// Path to the index file. The index will be saved to this prefixed name. diff --git a/diskann-tools/src/bin/subsample_bin.rs b/diskann-tools/src/bin/subsample_bin.rs index 6612ea91b..a7927adce 100644 --- a/diskann-tools/src/bin/subsample_bin.rs +++ b/diskann-tools/src/bin/subsample_bin.rs @@ -24,19 +24,23 @@ use diskann_utils::io::Metadata; #[command(name = "subsample_bin", about = "Subsample vectors from a binary file")] struct Args { /// Data type of the vectors, one of: float, int8, uint8, fp16 - #[arg(value_enum)] + #[arg(long = "data-type", value_enum)] data_type: DataType, /// Input base binary file + #[arg(long = "base-bin-file")] base_bin_file: PathBuf, /// Output file for sampled vectors + #[arg(long = "sampled-output-file")] sampled_output_file: PathBuf, /// Sampling probability between 0 and 1, for example 0.1 + #[arg(long = "sampling-probability")] sampling_probability: f64, /// Optional random seed for reproducible sampling + #[arg(long = "random-seed")] random_seed: Option, } From 0b53bd7dfe4bc570ac0550d40ccede795ea2734b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 1 Jul 2026 15:07:48 +0000 Subject: [PATCH 11/14] fix: remove invalid clap default for base-file-labels option --- diskann-tools/src/bin/compute_range_groundtruth.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diskann-tools/src/bin/compute_range_groundtruth.rs b/diskann-tools/src/bin/compute_range_groundtruth.rs index 65e532830..a8e8c88f6 100644 --- a/diskann-tools/src/bin/compute_range_groundtruth.rs +++ b/diskann-tools/src/bin/compute_range_groundtruth.rs @@ -105,7 +105,7 @@ struct ComputeRangeGroundTruthArgs { pub base_file: String, /// Optional labels file for base vectors - #[arg(long = "base-file-labels", default_value = None)] + #[arg(long = "base-file-labels")] pub base_file_labels: Option, /// File containing the query vectors in binary format From f8b6bb9df44c06991c6f66876f07915b931e935f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 1 Jul 2026 15:08:32 +0000 Subject: [PATCH 12/14] fix: remove invalid default for optional associated data arg --- diskann-tools/src/bin/compute_groundtruth.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diskann-tools/src/bin/compute_groundtruth.rs b/diskann-tools/src/bin/compute_groundtruth.rs index a32ad7919..c6f1481a8 100644 --- a/diskann-tools/src/bin/compute_groundtruth.rs +++ b/diskann-tools/src/bin/compute_groundtruth.rs @@ -142,6 +142,6 @@ struct ComputeGroundTruthArgs { pub recall_at: u32, /// File containing the associated data in binary format - #[arg(long = "associated-data-file", required = false, default_value = None)] + #[arg(long = "associated-data-file")] pub associated_data_file: Option, } From 34c4af1cf5b89b3b7ae9ec705e93a08912262008 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 1 Jul 2026 15:07:55 +0000 Subject: [PATCH 13/14] Fix optional query-file-labels clap default handling --- diskann-tools/src/bin/compute_multivec_groundtruth.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diskann-tools/src/bin/compute_multivec_groundtruth.rs b/diskann-tools/src/bin/compute_multivec_groundtruth.rs index 613b01ac5..220aff622 100644 --- a/diskann-tools/src/bin/compute_multivec_groundtruth.rs +++ b/diskann-tools/src/bin/compute_multivec_groundtruth.rs @@ -116,7 +116,7 @@ struct ComputeMultivecGroundTruthArgs { #[arg(long = "query-file", short, required = true)] pub query_file: String, - #[arg(long = "query-file-labels", default_value = None)] + #[arg(long = "query-file-labels")] pub query_file_labels: Option, /// Path of the file to write the ground truth to in binary format. Please don't append .bin at the end if no filter_label or filter_label_file is provided. It will save the file with '.bin' at the end. Otherwise it will save the file as filename_label.bin. From 1b60ce24eb1aefed418caf55a42faf9972efd6c0 Mon Sep 17 00:00:00 2001 From: Magdalen Manohar Date: Wed, 1 Jul 2026 19:51:49 +0000 Subject: [PATCH 14/14] remove accidentally added file --- .../bin/range_search_disk_index.rs.disabled | 127 ------------------ 1 file changed, 127 deletions(-) delete mode 100644 diskann-tools/src/bin/range_search_disk_index.rs.disabled diff --git a/diskann-tools/src/bin/range_search_disk_index.rs.disabled b/diskann-tools/src/bin/range_search_disk_index.rs.disabled deleted file mode 100644 index b83751c2a..000000000 --- a/diskann-tools/src/bin/range_search_disk_index.rs.disabled +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ -use clap::Parser; -use diskann_tools::utils::{ - get_num_threads, init_subscriber, range_search_disk_index, CMDToolError, DataType, - GraphDataF32Vector, GraphDataHalfVector, GraphDataInt8Vector, GraphDataU8Vector, -}; -use diskann_vector::distance::Metric; - -fn main() -> Result<(), CMDToolError> { - init_subscriber(); - - let args: SearchDiskIndexArgs = SearchDiskIndexArgs::parse(); - - let threads = get_num_threads(args.num_threads); - - let result = match args.data_type { - DataType::Float => range_search_disk_index::( - args.dist_fn, - &args.index_path_prefix, - &args.query_file, - &args.gt_file, - threads, - args.range_threshold, - args.beam_width, - args.search_io_limit, - &args.search_list, - args.num_nodes_to_cache, - ), - DataType::Int8 => range_search_disk_index::( - args.dist_fn, - &args.index_path_prefix, - &args.query_file, - &args.gt_file, - threads, - args.range_threshold, - args.beam_width, - args.search_io_limit, - &args.search_list, - args.num_nodes_to_cache, - ), - DataType::Uint8 => range_search_disk_index::( - args.dist_fn, - &args.index_path_prefix, - &args.query_file, - &args.gt_file, - threads, - args.range_threshold, - args.beam_width, - args.search_io_limit, - &args.search_list, - args.num_nodes_to_cache, - ), - DataType::Fp16 => range_search_disk_index::( - args.dist_fn, - &args.index_path_prefix, - &args.query_file, - &args.gt_file, - threads, - args.range_threshold, - args.beam_width, - args.search_io_limit, - &args.search_list, - args.num_nodes_to_cache, - ), - }; - - match result { - Ok(_) => { - println!("Index search completed successfully"); - Ok(()) - } - Err(err) => { - tracing::error!("Index search failed - see diagnostic"); - Err(err.into()) - } - } -} - -#[derive(Debug, Parser)] -struct SearchDiskIndexArgs { - /// data type (required) - #[arg(long = "data-type", required = true)] - pub data_type: DataType, - - /// Distance function to use (l2, cosine) - #[arg(long = "dist-fn", required = true)] - pub dist_fn: Metric, - - /// Path to the index file - #[arg(long = "index-path-prefix", required = true)] - pub index_path_prefix: String, - - /// Query file in binary format - #[arg(long = "query-file", short, required = true)] - pub query_file: String, - - /// Ground truth file for the queryset - #[arg(long = "gt-file", default_value = "")] - pub gt_file: String, - - /// Number of neighbors to be returned - #[arg(long = "range-threshold", short = 'K', default_value = "10")] - pub range_threshold: f32, - - /// List of L values of search - #[arg(long = "search-list", short = 'L', required = true, num_args=1..)] - pub search_list: Vec, - - /// Beam width for beam search - #[arg(long = "beam-width", default_value = "2")] - pub beam_width: u32, - - /// IO limit for each beam search, the default value is u32::MAX - #[arg(long = "search-io-limit", default_value = "4294967295")] - pub search_io_limit: u32, - - /// Number of threads used for querying the index - #[arg(long = "num-threads", short = 'T')] - pub num_threads: Option, - - /// Number of BFS nodes around medoid(s) to cache during query warm up - #[arg(long = "num-nodes-to-cache", default_value = "0")] - pub num_nodes_to_cache: usize, -}