From 1cd6b499a5b5319926c9c12702b71e6eed3089d6 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Wed, 17 Jun 2026 20:05:58 +0800 Subject: [PATCH 1/3] Remove dead disk-index tool modules The search_disk_index and build_disk_index modules in diskann-tools were unused: their public items had zero callers anywhere in the workspace. The only external references were the glob re-exports in utils/mod.rs. The disk index build/search entry points are provided by diskann-benchmark's own implementations, and no bin target invoked these modules (the corresponding range_search_disk_index bin is already disabled). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- diskann-tools/src/utils/build_disk_index.rs | 246 ---------- diskann-tools/src/utils/mod.rs | 6 - diskann-tools/src/utils/search_disk_index.rs | 451 ------------------- 3 files changed, 703 deletions(-) delete mode 100644 diskann-tools/src/utils/build_disk_index.rs delete mode 100644 diskann-tools/src/utils/search_disk_index.rs diff --git a/diskann-tools/src/utils/build_disk_index.rs b/diskann-tools/src/utils/build_disk_index.rs deleted file mode 100644 index d55a1f566..000000000 --- a/diskann-tools/src/utils/build_disk_index.rs +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ - -use diskann::{ - graph::config, - utils::{IntoUsize, ONE}, - ANNError, ANNResult, -}; -use diskann_disk::{ - build::{ - builder::build::DiskIndexBuilder, - chunking::{checkpoint::CheckpointManager, continuation::ChunkingConfig}, - }, - data_model::GraphDataType, - disk_index_build_parameter::{ - DiskIndexBuildParameters, MemoryBudget, NumPQChunks, DISK_SECTOR_LEN, - }, - storage::DiskIndexWriter, - QuantizationType, -}; -use diskann_providers::storage::{StorageReadProvider, StorageWriteProvider}; -use diskann_providers::{ - model::IndexConfiguration, - utils::{load_metadata_from_file, Timer}, -}; -use diskann_vector::distance::Metric; -use opentelemetry::global::BoxedSpan; -#[cfg(feature = "perf_test")] -use opentelemetry::{ - trace::{Span, Tracer}, - KeyValue, -}; - -pub struct ChunkingParameters { - pub chunking_config: ChunkingConfig, - pub checkpoint_record_manager: Box, -} - -/// A simple struct to contain the underlying dimension of the data and -/// its full-precision vector dimension. -/// -/// * `dim` is the length of the vector when represented with the underlying datatype -/// * `full_dim` is the length of the vector when converted to a full-precision slice, i.e. [f32] -/// -/// # Notes -/// -/// These values are the same when using primitive data types to represent the vectors -/// such as `half::f16` or `f32`, however, for quantized vectors used in place of -/// full-preicision vectors such as [`common::MinMaxElement`] these might be different. -#[derive(Clone, Copy, PartialEq, Debug)] -pub struct DimensionValues { - dim: usize, - full_dim: usize, -} -impl DimensionValues { - pub fn new(dim: usize, full_dim: usize) -> Self { - Self { dim, full_dim } - } - - pub fn dim(&self) -> usize { - self.dim - } - - pub fn full_dim(&self) -> usize { - self.full_dim - } -} - -pub struct BuildDiskIndexParameters<'a> { - pub metric: Metric, - pub data_path: &'a str, - pub r: u32, - pub l: u32, - pub index_path_prefix: &'a str, - pub num_threads: usize, - pub num_of_pq_chunks: usize, - pub index_build_ram_limit_gb: f64, - pub build_quantization_type: QuantizationType, - pub chunking_parameters: Option, - pub dim_values: DimensionValues, -} - -/// The main function to build a disk index -pub fn build_disk_index( - storage_provider: &StorageProviderType, - parameters: BuildDiskIndexParameters, -) -> ANNResult<()> -where - Data: GraphDataType, - StorageProviderType: StorageReadProvider + StorageWriteProvider + 'static, - ::Reader: std::marker::Send, -{ - let build_parameters = DiskIndexBuildParameters::new( - MemoryBudget::try_from_gb(parameters.index_build_ram_limit_gb)?, - parameters.build_quantization_type, - NumPQChunks::new_with( - parameters.num_of_pq_chunks, - parameters.dim_values.full_dim(), - )?, - ); - - let config = config::Builder::new_with( - parameters.r.into_usize(), - config::MaxDegree::default_slack(), - parameters.l.into_usize(), - parameters.metric.into(), - |b| { - b.saturate_after_prune(true); - }, - ) - .build()?; - - let metadata = load_metadata_from_file(storage_provider, parameters.data_path)?; - - if metadata.ndims() != parameters.dim_values.dim() { - return Err(ANNError::log_index_config_error( - format!("{:?}", parameters.dim_values), - format!("dim_values must match with data_dim {}", metadata.ndims()), - )); - } - - let index_configuration = IndexConfiguration::new( - parameters.metric, - metadata.ndims(), - metadata.npoints(), - ONE, - parameters.num_threads, - config, - ) - .with_pseudo_rng(); - - let disk_index_writer = DiskIndexWriter::new( - parameters.data_path.to_string(), - parameters.index_path_prefix.to_string(), - Option::None, - DISK_SECTOR_LEN, - )?; - - let mut disk_index = match parameters.chunking_parameters { - Some(chunking_parameters) => { - let chunking_config = chunking_parameters.chunking_config; - let checkpoint_record_manager = chunking_parameters.checkpoint_record_manager; - DiskIndexBuilder::::new_with_chunking_config( - storage_provider, - build_parameters, - index_configuration, - disk_index_writer, - chunking_config, - checkpoint_record_manager, - ) - } - None => DiskIndexBuilder::::new( - storage_provider, - build_parameters, - index_configuration, - disk_index_writer, - ), - }?; - - let mut _span: BoxedSpan; - #[cfg(feature = "perf_test")] - { - let tracer = opentelemetry::global::tracer(""); - - // Start a span for the search iteration. - _span = tracer.start("index-build statistics".to_string()); - } - - let timer = Timer::new(); - disk_index.build()?; - - let diff = timer.elapsed(); - println!("Indexing time: {} seconds", diff.as_secs_f64()); - - #[cfg(feature = "perf_test")] - { - _span.set_attribute(KeyValue::new("total_time", diff.as_secs_f64())); - _span.set_attribute(KeyValue::new("total_comparisons", 0i64)); - _span.set_attribute(KeyValue::new("search_hops", 0i64)); - _span.end(); - } - - Ok(()) -} - -#[cfg(test)] -mod tests { - use diskann::ANNErrorKind; - use diskann_providers::storage::VirtualStorageProvider; - use vfs::MemoryFS; - - use super::*; - use crate::utils::GraphDataInt8Vector; - - #[test] - fn test_build_disk_index_with_num_of_pq_chunks() { - let storage_provider = VirtualStorageProvider::new_memory(); - let parameters = BuildDiskIndexParameters { - metric: Metric::L2, - data_path: "test_data_path", - r: 10, - l: 20, - index_path_prefix: "test_index_path_prefix", - num_threads: 4, - num_of_pq_chunks: 8, - index_build_ram_limit_gb: 1.0, - build_quantization_type: QuantizationType::FP, - chunking_parameters: None, - dim_values: DimensionValues::new(128, 128), - }; - - let result = build_disk_index::>( - &storage_provider, - parameters, - ); - assert!(result.is_err()); - assert_ne!(result.unwrap_err().kind(), ANNErrorKind::IndexConfigError); - } - - #[test] - fn test_build_disk_index_with_zero_num_of_pq_chunks() { - let storage_provider = VirtualStorageProvider::new_memory(); - let parameters = BuildDiskIndexParameters { - metric: Metric::L2, - data_path: "test_data_path", - r: 10, - l: 20, - index_path_prefix: "test_index_path_prefix", - num_threads: 4, - num_of_pq_chunks: 0, - index_build_ram_limit_gb: 1.0, - build_quantization_type: QuantizationType::FP, - chunking_parameters: None, - dim_values: DimensionValues::new(128, 128), - }; - - let result = build_disk_index::>( - &storage_provider, - parameters, - ); - assert!(result.is_err()); - assert_eq!(result.unwrap_err().kind(), ANNErrorKind::IndexConfigError); - } -} diff --git a/diskann-tools/src/utils/mod.rs b/diskann-tools/src/utils/mod.rs index 3b102e091..7a0d720ad 100644 --- a/diskann-tools/src/utils/mod.rs +++ b/diskann-tools/src/utils/mod.rs @@ -24,12 +24,6 @@ pub use ground_truth::*; // pub mod range_search_disk_index; // pub use range_search_disk_index::*; -pub mod search_disk_index; -pub use search_disk_index::*; - -pub mod build_disk_index; -pub use build_disk_index::*; - pub mod build_pq; pub use build_pq::*; diff --git a/diskann-tools/src/utils/search_disk_index.rs b/diskann-tools/src/utils/search_disk_index.rs deleted file mode 100644 index 8bbdb1c8f..000000000 --- a/diskann-tools/src/utils/search_disk_index.rs +++ /dev/null @@ -1,451 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ - -use std::{collections::HashSet, sync::atomic::AtomicBool, time::Instant}; - -use diskann::utils::IntoUsize; -use diskann_disk::{ - data_model::{CachingStrategy, GraphDataType}, - search::provider::{ - disk_provider::DiskIndexSearcher, disk_vertex_provider_factory::DiskVertexProviderFactory, - }, - storage::disk_index_reader::DiskIndexReader, - utils::{ - aligned_file_reader::traits::AlignedReaderFactory, instrumentation::PerfLogger, statistics, - QueryStatistics, - }, -}; -use diskann_providers::storage::{StorageReadProvider, StorageWriteProvider}; -use diskann_providers::{ - storage::{get_compressed_pq_file, get_pq_pivot_file}, - utils::{create_thread_pool, ParallelIteratorInPool}, -}; -use diskann_utils::{ - io::{read_bin, write_bin}, - views::MatrixView, -}; -use diskann_vector::distance::Metric; -use opentelemetry::global::BoxedSpan; -#[cfg(feature = "perf_test")] -use opentelemetry::{ - trace::{Span, Tracer}, - KeyValue, -}; -use ordered_float::OrderedFloat; -use rayon::prelude::*; -use tracing::{error, info}; - -use crate::utils::{search_index_utils, CMDResult, CMDToolError, KRecallAtN}; - -pub struct SearchDiskIndexParameters<'a> { - pub metric: Metric, - pub index_path_prefix: &'a str, - pub result_output_prefix: &'a str, - pub query_file: &'a str, - pub truthset_file: &'a str, - pub vector_filters_file: Option<&'a str>, - pub num_threads: usize, - pub recall_at: u32, - pub beam_width: u32, - pub search_io_limit: u32, - pub l_vec: &'a [u32], - pub fail_if_recall_below: f32, - pub num_nodes_to_cache: usize, - pub is_flat_search: bool, -} - -pub fn search_disk_index( - storage_provider: &StorageType, - parameters: SearchDiskIndexParameters, - aligned_reader_factory: ReaderFactory, -) -> CMDResult -where - Data: GraphDataType, - StorageType: StorageReadProvider + StorageWriteProvider, - ReaderFactory: AlignedReaderFactory, -{ - let mut logger = PerfLogger::new("search_disk_index".to_string(), true); - - info!( - "Search parameters: #threads: {}, recall_at {}, search_list_size: {:?}, search_io_limit: {}, fail_if_recall_below: {}, beam_width: {}", - parameters.num_threads, parameters.recall_at, parameters.l_vec, parameters.search_io_limit, parameters.fail_if_recall_below,parameters.beam_width - ); - - // Load the query file - let queries = read_bin::( - &mut storage_provider.open_reader(parameters.query_file)?, - )?; - let query_num = queries.nrows(); - // Load the vector filters - let vector_filters = match parameters.vector_filters_file { - Some(vector_filters_file) => { - search_index_utils::load_vector_filters(storage_provider, vector_filters_file)? - } - None => vec![HashSet::::new(); query_num], - }; - - assert_eq!( - vector_filters.len(), - query_num, - "Mismatch in query and vector filter sizes" - ); - - let mut gt_dim: usize = 0; - let mut gt_ids: Option> = None; - - let mut gt_ids_variable_length: Option>> = None; - let mut gt_dists: Option> = None; - - // Check for ground truth - let mut calc_recall_flag = false; - if !parameters.truthset_file.is_empty() && storage_provider.exists(parameters.truthset_file) { - if parameters.vector_filters_file.is_none() { - let ret = - search_index_utils::load_truthset(storage_provider, parameters.truthset_file)?; - gt_ids = Some(ret.index_nodes); - gt_dists = ret.distances; - let gt_num = ret.index_num_points; - gt_dim = ret.index_dimension; - - if gt_num != query_num { - error!("Error. Mismatch in number of queries and ground truth data"); - } - } else { - let range_truthset = search_index_utils::load_range_truthset( - storage_provider, - parameters.truthset_file, - )?; - gt_ids_variable_length = Some(range_truthset.index_nodes); - let gt_num = range_truthset.index_num_points; - - if gt_num != query_num { - error!("Error. Mismatch in number of queries and ground truth data"); - } - } - - calc_recall_flag = true; - } else { - error!( - "Truthset file {} not found. Not computing recall", - parameters.truthset_file - ); - } - - let index_reader = DiskIndexReader::new( - get_pq_pivot_file(parameters.index_path_prefix), - get_compressed_pq_file(parameters.index_path_prefix), - storage_provider, - )?; - - let caching_strategy = if parameters.num_nodes_to_cache > 0 { - CachingStrategy::StaticCacheWithBfsNodes(parameters.num_nodes_to_cache) - } else { - CachingStrategy::None - }; - // Create the vertex provider factory - let vertex_provider_factory = - DiskVertexProviderFactory::new(aligned_reader_factory, caching_strategy)?; - - let searcher = DiskIndexSearcher::>::new( - parameters.num_threads.into_usize(), - parameters.search_io_limit.into_usize(), - &index_reader, - vertex_provider_factory, - parameters.metric, - None, - )?; - - logger.log_checkpoint("index_loaded"); - - let recall_string = format!("Recall@{}", parameters.recall_at); - if calc_recall_flag { - println!( - "{:<6}{:<12}{:<15}{:<20}{:<20}{:<12}{:<16}{:<10}{:<20}{:<12}{:<12}{:<14}{:<16}", - "L", - "Beamwidth", - "QPS", - "Mean Latency (us)", - "99.9 Latency (us)", - "Mean IOs", - "Mean IO (us)", - "CPU (us)", - "PQ Preprocess (us)", - "Mean Comps", - "Mean Hops", - "Cache Hit %", - recall_string - ); - } else { - println!( - "{:<6}{:<12}{:<15}{:<20}{:<20}{:<12}{:<16}{:<10}{:<20}{:<12}{:<12}{:<14}", - "L", - "Beamwidth", - "QPS", - "Mean Latency (us)", - "99.9 Latency (us)", - "Mean IOs", - "Mean IO (us)", - "CPU (us)", - "PQ Preprocess (us)", - "Mean Comparisons", - "Mean hops", - "Cache Hit %", - ); - } - println!("{:=<178}", ""); - - let mut query_result_ids: Vec> = vec![vec![]; parameters.l_vec.len()]; - let mut query_result_dists: Vec> = vec![vec![]; parameters.l_vec.len()]; - let mut cmp_stats: Vec = vec![0; query_num]; - let has_any_search_failed = AtomicBool::new(false); - - let mut best_recall = 0.0; - - let pool = create_thread_pool(parameters.num_threads)?; - - for (test_id, &l) in parameters.l_vec.iter().enumerate() { - if l < parameters.recall_at { - println!( - "Ignoring search with L: {} since it's smaller than K: {}", - l, parameters.recall_at - ); - continue; - } - - query_result_ids[test_id].resize(parameters.recall_at as usize * query_num, 0); - query_result_dists[test_id].resize(parameters.recall_at as usize * query_num, 0.0); - - // Assuming `QueryStats` is a struct that you have defined elsewhere - let mut statistics: Vec = vec![QueryStatistics::default(); query_num]; - let mut result_counts: Vec = vec![0; query_num]; - - let zipped = cmp_stats - .par_iter_mut() - .zip(queries.par_row_iter()) - .zip(vector_filters.par_iter()) - .zip(query_result_ids[test_id].par_chunks_mut(parameters.recall_at as usize)) - .zip(query_result_dists[test_id].par_chunks_mut(parameters.recall_at as usize)) - .zip(statistics.par_iter_mut()) - .zip(result_counts.par_iter_mut()); - - let mut _span: BoxedSpan; - #[cfg(feature = "perf_test")] - { - let tracer = opentelemetry::global::tracer(""); - - // Start a span for the search iteration. - _span = tracer.start(format!("search-with-L={}-bw={}", l, parameters.beam_width)); - } - - let test_start = Instant::now(); - zipped.for_each_in_pool( - pool.as_ref(), - |( - (((((_cmp, query), vector_filter), query_result_id), query_result_dist), stats), - result_count, - )| { - let vector_filter_function: Box bool + Send + Sync> = - if parameters.vector_filters_file.is_none() { - Box::new(|_: &u32| true) - } else { - Box::new(move |vector_id: &u32| vector_filter.contains(vector_id)) - }; - - let result = searcher.search( - query, - parameters.recall_at, - l, - Some(parameters.beam_width as usize), - Some(vector_filter_function), - parameters.is_flat_search, - ); - - match result { - Ok(search_result) => { - *result_count = search_result.stats.result_count; - *stats = search_result.stats.query_statistics; - search_result - .results - .iter() - .take(parameters.recall_at as usize) - .enumerate() - .for_each(|(i, item)| { - query_result_id[i] = item.vertex_id; - query_result_dist[i] = item.distance; - }); - } - Err(e) => { - error!("Error during search: {}", e); - has_any_search_failed.store(true, std::sync::atomic::Ordering::Release); - } - } - }, - ); - - let diff = test_start.elapsed(); - let qps = query_num as f32 / diff.as_secs_f32(); - - let mean_latency = - statistics::get_mean_stats(&statistics, |stats| stats.total_execution_time_us as f64); - - let latency_999 = statistics::get_percentile_stats(&statistics, 0.999, |stats| { - stats.total_execution_time_us - }); - - let mean_ios = statistics::get_mean_stats(&statistics, |stats| stats.total_io_operations); - let mean_io_time = statistics::get_mean_stats(&statistics, |stats| stats.io_time_us as f64); - let mean_cpus = statistics::get_mean_stats(&statistics, |stats| stats.cpu_time_us as f64); - let mean_pq_preprocess_time = statistics::get_mean_stats(&statistics, |stats| { - stats.query_pq_preprocess_time_us as f64 - }); - let mean_comps = - statistics::get_mean_stats(&statistics, |stats| stats.total_comparisons as f64); - let mean_hops = statistics::get_mean_stats(&statistics, |stats| stats.search_hops as f64); - let total_ios = statistics::get_sum_stats(&statistics, |stats| stats.total_io_operations); - let total_vertices_loaded = - statistics::get_sum_stats(&statistics, |stats| stats.total_vertices_loaded); - let cache_hit_percentage = if total_vertices_loaded > 0.0 { - 100.0 * (1.0 - (total_ios / total_vertices_loaded)) - } else { - 100.0 - }; - - let mut recall = 0.0; - if calc_recall_flag { - recall = if let Some(gt_ids_variable_length) = >_ids_variable_length { - let our_results_variable_length = query_result_ids[test_id] - .chunks_exact(parameters.recall_at as usize) - .enumerate() - .map(|(i, chunk)| chunk[..result_counts[i] as usize].to_vec()) - .collect::>(); - search_index_utils::calculate_filtered_search_recall( - query_num, - None, - gt_ids_variable_length, - &our_results_variable_length, - parameters.recall_at, - )? as f32 - } else { - search_index_utils::calculate_recall( - query_num, - gt_ids.as_ref().ok_or_else(|| CMDToolError { - details: "GroundTruth IDs not initialized".to_string(), - })?, - gt_dists.as_ref(), - gt_dim, - &query_result_ids[test_id], - parameters.recall_at, - KRecallAtN::new(parameters.recall_at, parameters.recall_at)?, - )? as f32 - }; - - best_recall = f32::from(std::cmp::max( - OrderedFloat::(best_recall), - OrderedFloat::(recall), - )); - } - - if calc_recall_flag { - println!( - "{:<6}{:<12.2}{:<15.2}{:<20.2}{:<20.2}{:<12.2}{:<16.2}{:<10.2}{:<20.2}{:<12.2}{:<12.2}{:<14.2}{:<16.2}", - l, - parameters.beam_width, - qps, - mean_latency, - latency_999, - mean_ios, - mean_io_time, - mean_cpus, - mean_pq_preprocess_time, - mean_comps, - mean_hops, - cache_hit_percentage, - recall, - ); - } else { - println!( - "{:<6}{:<12.2}{:<15.2}{:<20.2}{:<20.2}{:<12.2}{:<16.2}{:<10.2}{:<20.2}{:<12.2}{:<12.2}{:<14.2}", - l, - parameters.beam_width, - qps, - mean_latency, - latency_999, - mean_ios, - mean_io_time, - mean_cpus, - mean_pq_preprocess_time, - mean_comps, - mean_hops, - cache_hit_percentage, - ); - } - - #[cfg(feature = "perf_test")] - { - let latency_95 = statistics::get_percentile_stats(&statistics, 0.95, |stats| { - stats.total_execution_time_us - }); - - _span.set_attribute(KeyValue::new("qps", qps as f64)); - _span.set_attribute(KeyValue::new("mean_latency", mean_latency)); - _span.set_attribute(KeyValue::new("latency_999", latency_999 as f64)); - _span.set_attribute(KeyValue::new("latency_95", latency_95 as f64)); - _span.set_attribute(KeyValue::new("mean_cpus", mean_cpus)); - _span.set_attribute(KeyValue::new("mean_io_time", mean_io_time)); - _span.set_attribute(KeyValue::new("mean_ios", mean_ios)); - _span.set_attribute(KeyValue::new("mean_comps", mean_comps)); - _span.set_attribute(KeyValue::new("mean_hops", mean_hops)); - _span.set_attribute(KeyValue::new("recall", recall as f64)); - _span.end(); - } - } - - logger.log_checkpoint("search_completed"); - - info!("Done searching. Now saving results"); - for (test_id, l_value) in parameters.l_vec.iter().enumerate() { - if *l_value < parameters.recall_at { - println!( - "Ignoring all search with L: {} since it's smaller than K: {}", - l_value, parameters.recall_at - ); - } - - let cur_result_path = format!( - "{}_{}_idx_uint32.bin", - parameters.result_output_prefix, l_value - ); - let view = MatrixView::try_from( - query_result_ids[test_id].as_slice(), - query_num, - parameters.recall_at as usize, - ) - .map_err(|e| CMDToolError { - details: e.to_string(), - })?; - write_bin( - view, - &mut storage_provider.create_for_write(&cur_result_path)?, - )?; - } - - if has_any_search_failed.load(std::sync::atomic::Ordering::Acquire) { - // Exit with error. The above stats might still be useful to the user if only a few searched failed, so allowed printing them. - return Err(CMDToolError { - details: "At least one search failed with error. See log for details. Exiting." - .to_string(), - }); - } - - if best_recall >= parameters.fail_if_recall_below { - Ok(0) - } else { - println!( - "Search failed. Best recall {} is below the threshold {}", - best_recall, parameters.fail_if_recall_below - ); - Ok(-1) - } -} From c74673452e855126644fff4ee029b8d8da09120c Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Wed, 17 Jun 2026 20:40:40 +0800 Subject: [PATCH 2/3] Remove unused test fixtures and associated-data filter Delete six never-referenced test size constants from diskann-tools test_utils (keeping TEST_DATASET_SIZE_SMALL and TEST_NUM_DIMENSIONS_RECOMMENDED, which the random_data_generator tests use), across both the miri and non-miri variants. Delete diskann-disk's AssociatedDataFilter type and default_associated_data_filter function: unlike its live twin VectorFilter/default_vector_filter (wired into disk_provider search), the associated-data variant was never used by any production path and was exercised only by its own unit test. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../build/configuration/filter_parameter.rs | 15 ------- diskann-tools/src/utils/test_utils.rs | 44 ------------------- 2 files changed, 59 deletions(-) diff --git a/diskann-disk/src/build/configuration/filter_parameter.rs b/diskann-disk/src/build/configuration/filter_parameter.rs index b2c66f522..00efaa6e2 100644 --- a/diskann-disk/src/build/configuration/filter_parameter.rs +++ b/diskann-disk/src/build/configuration/filter_parameter.rs @@ -5,13 +5,6 @@ use crate::data_model::GraphDataType; -pub type AssociatedDataFilter = - Box::AssociatedDataType) -> bool>; - -pub fn default_associated_data_filter() -> AssociatedDataFilter { - Box::new(|_| true) -} - pub type VectorFilter<'a, Data> = Box::VectorIdType) -> bool + Send + Sync + 'a>; @@ -26,14 +19,6 @@ mod tests { type TestGraphData = GraphDataF32VectorUnitData; - #[test] - fn test_default_associated_data_filter_returns_true_for_all() { - let filter = default_associated_data_filter::(); - // Test that the default filter always returns true - assert!(filter(&())); - assert!(filter(&())); - } - #[test] fn test_default_vector_filter_returns_true_for_all() { let filter = default_vector_filter::(); diff --git a/diskann-tools/src/utils/test_utils.rs b/diskann-tools/src/utils/test_utils.rs index 869441cb3..cae1c5601 100644 --- a/diskann-tools/src/utils/test_utils.rs +++ b/diskann-tools/src/utils/test_utils.rs @@ -5,68 +5,24 @@ #[cfg(not(miri))] pub mod size_constants { - /// The recommended dataset size for testing the library. - /// A prime number is used to avoid any accidental patterns in the data. - pub const TEST_DATASET_SIZE_RECOMMENDED: u64 = 991; - /// The small dataset size for testing the library. /// A prime number is used to avoid any accidental patterns in the data. pub const TEST_DATASET_SIZE_SMALL: u64 = 101; - /// The recommended query size for testing the library. - /// A prime number is used to avoid any accidental patterns in the data. - pub const TEST_QUERYSET_SIZE_RECOMMENDED: u64 = 101; - - /// The small query size for testing the library. - /// A prime number is used to avoid any accidental patterns in the data. - pub const TEST_QUERYSET_SIZE_SMALL: u64 = 11; - /// The recommended number of dimensions for testing the library. /// A prime number is used to avoid any accidental patterns in the data. /// When "memory aligned" the dimensions become 64 (8*8). Setting to non-aligned value to ensure aligning works. pub const TEST_NUM_DIMENSIONS_RECOMMENDED: usize = 59; - - /// The recommended "memory aligned" number of dimensions for testing the library (64=8*8). - pub const TEST_NUM_DIMENSIONS_RECOMMENDED_MEMORY_ALIGNED: usize = 64; - - /// The small number of dimensions for testing the library. - /// A prime number is used to avoid any accidental patterns in the data. - pub const TEST_NUM_DIMENSIONS_SMALL: usize = 13; - - /// The small "memory aligned" number of dimensions for testing the library. - pub const TEST_NUM_DIMENSIONS_SMALL_MEMORY_ALIGNED: usize = 16; } #[cfg(miri)] pub mod size_constants { - /// The recommended dataset size for testing the library. - /// A prime number is used to avoid any accidental patterns in the data. - pub const TEST_DATASET_SIZE_RECOMMENDED: u64 = 7; - /// The small dataset size for testing the library. /// A prime number is used to avoid any accidental patterns in the data. pub const TEST_DATASET_SIZE_SMALL: u64 = 3; - /// The recommended query size for testing the library. - /// A prime number is used to avoid any accidental patterns in the data. - pub const TEST_QUERYSET_SIZE_RECOMMENDED: u64 = 3; - - /// The small query size for testing the library. - /// A prime number is used to avoid any accidental patterns in the data. - pub const TEST_QUERYSET_SIZE_SMALL: u64 = 1; - /// The recommended number of dimensions for testing the library. /// A prime number is used to avoid any accidental patterns in the data. /// When "memory aligned" the dimensions become 16 (8*2). Setting to non-aligned value to ensure aligning works. pub const TEST_NUM_DIMENSIONS_RECOMMENDED: usize = 13; - - /// The recommended "memory aligned" number of dimensions for testing the library (16=8*2). - pub const TEST_NUM_DIMENSIONS_RECOMMENDED_MEMORY_ALIGNED: usize = 16; - - /// The small number of dimensions for testing the library. - /// A prime number is used to avoid any accidental patterns in the data. - pub const TEST_NUM_DIMENSIONS_SMALL: usize = 7; - - /// The small "memory aligned" number of dimensions for testing the library. - pub const TEST_NUM_DIMENSIONS_SMALL_MEMORY_ALIGNED: usize = 8; } From bc0c37b0e27b185e6b5db979009ed5ced08d0b2d Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Thu, 18 Jun 2026 22:11:02 +0800 Subject: [PATCH 3/3] Inline test size constants and remove test_utils module The two surviving size constants were used only by the random_data_generator test module. Inline them there as private consts (preserving the miri/non-miri split) and drop the test_utils module and its re-export, removing one more file and public export surface. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- diskann-tools/src/utils/mod.rs | 3 -- .../src/utils/random_data_generator.rs | 23 ++++++++++++++- diskann-tools/src/utils/test_utils.rs | 28 ------------------- 3 files changed, 22 insertions(+), 32 deletions(-) delete mode 100644 diskann-tools/src/utils/test_utils.rs diff --git a/diskann-tools/src/utils/mod.rs b/diskann-tools/src/utils/mod.rs index 7a0d720ad..975a9062c 100644 --- a/diskann-tools/src/utils/mod.rs +++ b/diskann-tools/src/utils/mod.rs @@ -33,9 +33,6 @@ pub use generate_synthetic_labels_utils::*; pub mod gen_associated_data_from_range; pub use gen_associated_data_from_range::*; -pub mod test_utils; -pub use test_utils::*; - pub type CMDResult = Result; pub mod parameter_helper; diff --git a/diskann-tools/src/utils/random_data_generator.rs b/diskann-tools/src/utils/random_data_generator.rs index c573a8737..5060be49c 100644 --- a/diskann-tools/src/utils/random_data_generator.rs +++ b/diskann-tools/src/utils/random_data_generator.rs @@ -197,7 +197,28 @@ mod tests { use rstest::rstest; use super::*; - use crate::utils::size_constants::{TEST_DATASET_SIZE_SMALL, TEST_NUM_DIMENSIONS_RECOMMENDED}; + + /// The small dataset size for testing the library. + /// A prime number is used to avoid any accidental patterns in the data. + #[cfg(not(miri))] + const TEST_DATASET_SIZE_SMALL: u64 = 101; + + /// The recommended number of dimensions for testing the library. + /// A prime number is used to avoid any accidental patterns in the data. + /// When "memory aligned" the dimensions become 64 (8*8). Setting to non-aligned value to ensure aligning works. + #[cfg(not(miri))] + const TEST_NUM_DIMENSIONS_RECOMMENDED: usize = 59; + + /// The small dataset size for testing the library. + /// A prime number is used to avoid any accidental patterns in the data. + #[cfg(miri)] + const TEST_DATASET_SIZE_SMALL: u64 = 3; + + /// The recommended number of dimensions for testing the library. + /// A prime number is used to avoid any accidental patterns in the data. + /// When "memory aligned" the dimensions become 16 (8*2). Setting to non-aligned value to ensure aligning works. + #[cfg(miri)] + const TEST_NUM_DIMENSIONS_RECOMMENDED: usize = 13; #[rstest] fn random_data_write_success( diff --git a/diskann-tools/src/utils/test_utils.rs b/diskann-tools/src/utils/test_utils.rs deleted file mode 100644 index cae1c5601..000000000 --- a/diskann-tools/src/utils/test_utils.rs +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ - -#[cfg(not(miri))] -pub mod size_constants { - /// The small dataset size for testing the library. - /// A prime number is used to avoid any accidental patterns in the data. - pub const TEST_DATASET_SIZE_SMALL: u64 = 101; - - /// The recommended number of dimensions for testing the library. - /// A prime number is used to avoid any accidental patterns in the data. - /// When "memory aligned" the dimensions become 64 (8*8). Setting to non-aligned value to ensure aligning works. - pub const TEST_NUM_DIMENSIONS_RECOMMENDED: usize = 59; -} - -#[cfg(miri)] -pub mod size_constants { - /// The small dataset size for testing the library. - /// A prime number is used to avoid any accidental patterns in the data. - pub const TEST_DATASET_SIZE_SMALL: u64 = 3; - - /// The recommended number of dimensions for testing the library. - /// A prime number is used to avoid any accidental patterns in the data. - /// When "memory aligned" the dimensions become 16 (8*2). Setting to non-aligned value to ensure aligning works. - pub const TEST_NUM_DIMENSIONS_RECOMMENDED: usize = 13; -}