From 59556654e5d661e919d039b34837fb410fb5ed45 Mon Sep 17 00:00:00 2001 From: Naren Datha Date: Thu, 2 Jul 2026 17:06:09 +0530 Subject: [PATCH] benchmark: add experimental diverse graph search integration Adds a feature-gated (experimental_diversity_search) topk-diverse-search benchmark path: - DiverseKNN search wrapper in diskann-benchmark-core - FileAttributeProvider loading plaintext per-point attributes - TopkDiverseSearchPhase input, DiverseSearch plugin, and Knn impl wiring - Example config and sample attribute test data All additions are gated so default builds are unaffected. --- diskann-benchmark-core/Cargo.toml | 3 + .../src/search/graph/diverse.rs | 150 ++++++++++++++++++ .../src/search/graph/mod.rs | 6 + diskann-benchmark/Cargo.toml | 7 + .../example/async-diverse-search.json | 49 ++++++ diskann-benchmark/src/index/benchmarks.rs | 68 ++++++-- diskann-benchmark/src/index/search/knn.rs | 33 ++++ diskann-benchmark/src/index/search/plugins.rs | 22 +++ diskann-benchmark/src/inputs/graph_index.rs | 81 ++++++++++ diskann-benchmark/src/utils/attributes.rs | 77 +++++++++ diskann-benchmark/src/utils/mod.rs | 3 + ...ndex_siftsmall_learn_256pts_attributes.txt | 3 + 12 files changed, 492 insertions(+), 10 deletions(-) create mode 100644 diskann-benchmark-core/src/search/graph/diverse.rs create mode 100644 diskann-benchmark/example/async-diverse-search.json create mode 100644 diskann-benchmark/src/utils/attributes.rs create mode 100644 test_data/disk_index_search/disk_index_siftsmall_learn_256pts_attributes.txt diff --git a/diskann-benchmark-core/Cargo.toml b/diskann-benchmark-core/Cargo.toml index 5b6740ef8..80295e26c 100644 --- a/diskann-benchmark-core/Cargo.toml +++ b/diskann-benchmark-core/Cargo.toml @@ -25,6 +25,9 @@ default = [] # BigANN Runbook support requires parsing a YAML file. bigann = ["dep:serde_yaml"] +# Diversity-aware search wrappers (experimental). +experimental_diversity_search = ["diskann/experimental_diversity_search"] + [lints] workspace = true diff --git a/diskann-benchmark-core/src/search/graph/diverse.rs b/diskann-benchmark-core/src/search/graph/diverse.rs new file mode 100644 index 000000000..b60d35fee --- /dev/null +++ b/diskann-benchmark-core/src/search/graph/diverse.rs @@ -0,0 +1,150 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! A built-in helper for benchmarking diversity-aware K-nearest neighbors search. + +use std::sync::Arc; + +use diskann::{ + ANNResult, + graph::{self, glue}, + neighbor::AttributeValueProvider, + provider, +}; +use diskann_utils::{future::AsyncFriendly, views::Matrix}; + +use crate::search::{self, graph::Strategy, graph::knn::Metrics}; + +/// A built-in helper for benchmarking diversity-aware search via +/// [`graph::search::Diverse`]. +/// +/// This mirrors [`super::KNN`] but runs each query through a +/// [`graph::search::Diverse`] wrapper constructed from the shared +/// [`AttributeValueProvider`] and diversity parameters stored on this struct. The +/// [`Search::Parameters`] remain the base [`graph::search::Knn`] parameters so that the +/// same benchmark driving code (search list sweeps, recall computation, aggregation) can be +/// reused unchanged. +/// +/// # Type Parameters +/// +/// - `DP`: The data provider type. +/// - `T`: The query element type. +/// - `S`: The search strategy type. +/// - `P`: The attribute value provider used to derive diversity attributes. +#[derive(Debug)] +pub struct DiverseKNN +where + DP: provider::DataProvider, + P: AttributeValueProvider, +{ + index: Arc>, + queries: Arc>, + strategy: Strategy, + attribute_provider: Arc

, + diverse_attribute_id: usize, + diverse_results_k: usize, +} + +impl DiverseKNN +where + DP: provider::DataProvider, + P: AttributeValueProvider, +{ + /// Construct a new [`DiverseKNN`] searcher. + /// + /// # Errors + /// + /// Returns an error if the number of elements in `strategy` is not compatible with + /// the number of rows in `queries`. + pub fn new( + index: Arc>, + queries: Arc>, + strategy: Strategy, + attribute_provider: Arc

, + diverse_attribute_id: usize, + diverse_results_k: usize, + ) -> anyhow::Result> { + strategy.length_compatible(queries.nrows())?; + + Ok(Arc::new(Self { + index, + queries, + strategy, + attribute_provider, + diverse_attribute_id, + diverse_results_k, + })) + } + + /// Access the index. + pub fn index(&self) -> &Arc> { + &self.index + } +} + +impl search::Search for DiverseKNN +where + DP: provider::DataProvider, + S: for<'a> glue::DefaultSearchStrategy< + 'a, + DP, + &'a [T], + DP::ExternalId, + SearchAccessor: glue::SearchAccessor, + > + Clone + + AsyncFriendly, + P: AttributeValueProvider + AsyncFriendly, + graph::search::Diverse

: + for<'a> graph::Search<'a, DP, S, &'a [T], Output = graph::index::SearchStats>, + T: AsyncFriendly + Clone, +{ + type Id = DP::ExternalId; + type Parameters = graph::search::Knn; + type Output = Metrics; + + fn num_queries(&self) -> usize { + self.queries.nrows() + } + + fn id_count(&self, parameters: &Self::Parameters) -> search::IdCount { + search::IdCount::Fixed(parameters.k_value()) + } + + async fn search( + &self, + parameters: &Self::Parameters, + buffer: &mut O, + index: usize, + ) -> ANNResult + where + O: graph::SearchOutputBuffer + Send, + { + let context = DP::Context::default(); + let strategy = self.strategy.get(index)?; + + let diverse_params = graph::DiverseSearchParams::new( + self.diverse_attribute_id, + self.diverse_results_k, + self.attribute_provider.clone(), + ); + let diverse_search = graph::search::Diverse::new(*parameters, diverse_params); + + let stats = self + .index + .search( + diverse_search, + strategy, + &context, + self.queries.row(index), + buffer, + ) + .await?; + + Ok(Metrics { + comparisons: stats.cmps, + hops: stats.hops, + }) + } +} diff --git a/diskann-benchmark-core/src/search/graph/mod.rs b/diskann-benchmark-core/src/search/graph/mod.rs index 8063f0875..1c99510bf 100644 --- a/diskann-benchmark-core/src/search/graph/mod.rs +++ b/diskann-benchmark-core/src/search/graph/mod.rs @@ -10,6 +10,9 @@ pub mod range; pub mod strategy; +#[cfg(feature = "experimental_diversity_search")] +pub mod diverse; + pub use inline::InlineFilterSearch; pub use knn::KNN; pub use multihop::MultiHop; @@ -17,6 +20,9 @@ pub use range::Range; pub use strategy::Strategy; +#[cfg(feature = "experimental_diversity_search")] +pub use diverse::DiverseKNN; + //////////////// // Test Utils // //////////////// diff --git a/diskann-benchmark/Cargo.toml b/diskann-benchmark/Cargo.toml index ce5018aad..ac839b984 100644 --- a/diskann-benchmark/Cargo.toml +++ b/diskann-benchmark/Cargo.toml @@ -67,6 +67,13 @@ minmax-quantization = [] # Enable multi-vector MaxSim distance benchmarks multi-vector = [] +# Enable experimental diversity-aware search benchmarks +experimental_diversity_search = [ + "diskann/experimental_diversity_search", + "diskann-providers/experimental_diversity_search", + "diskann-benchmark-core/experimental_diversity_search", +] + # Enable bftree backend bftree = ["dep:diskann-bftree"] diff --git a/diskann-benchmark/example/async-diverse-search.json b/diskann-benchmark/example/async-diverse-search.json new file mode 100644 index 000000000..7d8518f44 --- /dev/null +++ b/diskann-benchmark/example/async-diverse-search.json @@ -0,0 +1,49 @@ +{ + "search_directories": [ + "test_data/disk_index_search" + ], + "jobs": [ + { + "type": "graph-index-build", + "content": { + "source": { + "index-source": "Build", + "data_type": "float32", + "data": "disk_index_siftsmall_learn_256pts_data.fbin", + "distance": "squared_l2", + "max_degree": 32, + "l_build": 50, + "alpha": 1.2, + "backedge_ratio": 1.0, + "num_threads": 1, + "start_point_strategy": "medoid", + "num_insert_attempts": 1, + "saturate_inserts": false + }, + "search_phase": { + "search-type": "topk-diverse-search", + "queries": "disk_index_sample_query_10pts.fbin", + "groundtruth": "disk_index_10pts_idx_uint32_truth_search_res.bin", + "attributes": "disk_index_siftsmall_learn_256pts_attributes.txt", + "reps": 5, + "num_threads": [ + 1 + ], + "diverse_attribute_id": 0, + "diverse_results_k": 1, + "runs": [ + { + "search_n": 20, + "search_l": [ + 20, + 30, + 40 + ], + "recall_k": 10 + } + ] + } + } + } + ] +} diff --git a/diskann-benchmark/src/index/benchmarks.rs b/diskann-benchmark/src/index/benchmarks.rs index 0a66576a5..7776a987d 100644 --- a/diskann-benchmark/src/index/benchmarks.rs +++ b/diskann-benchmark/src/index/benchmarks.rs @@ -71,16 +71,16 @@ pub(crate) fn register_benchmarks(registry: &mut Registry) -> anyhow::Result<()> // care. // Full Precision - registry.register( - "graph-index-full-precision-f32", - FullPrecision::::new() - .search(plugins::Topk) - .search(plugins::Range) - .search(plugins::TopkBetaFilter) - .search(plugins::TopkMultihopFilter) - .search(plugins::TopkInlineFilter) - .search(plugins::DeterminantDiversity), - )?; + let full_precision_f32 = FullPrecision::::new() + .search(plugins::Topk) + .search(plugins::Range) + .search(plugins::TopkBetaFilter) + .search(plugins::TopkMultihopFilter) + .search(plugins::TopkInlineFilter) + .search(plugins::DeterminantDiversity); + #[cfg(feature = "experimental_diversity_search")] + let full_precision_f32 = full_precision_f32.search(plugins::DiverseSearch); + registry.register("graph-index-full-precision-f32", full_precision_f32)?; registry.register( "graph-index-full-precision-f16", @@ -484,6 +484,54 @@ impl search::Plugin, SearchPhase, Strategy, SearchPhase, Strategy> + for plugins::DiverseSearch +{ + fn is_match(&self, phase: &SearchPhase) -> bool { + plugins::DiverseSearch::is_match(phase) + } + + fn kind(&self) -> &'static str { + plugins::DiverseSearch::as_str() + } + + fn run( + &self, + index: Arc>>, + phase: &SearchPhase, + _strategy: &Strategy, + ) -> anyhow::Result { + let phase = plugins::DiverseSearch::get(phase)?; + + let queries = Arc::new(datafiles::load_dataset::(datafiles::BinFile( + &phase.queries, + ))?); + let groundtruth = datafiles::load_groundtruth( + datafiles::BinFile(&phase.groundtruth), + Some(phase.max_k()), + )?; + + let attribute_provider = Arc::new( + crate::utils::attributes::FileAttributeProvider::load(&phase.attributes)?, + ); + + let knn = benchmark_core::search::graph::DiverseKNN::new( + index, + queries, + benchmark_core::search::graph::Strategy::broadcast(common::FullPrecision), + attribute_provider, + phase.diverse_attribute_id, + phase.diverse_results_k, + )?; + + let steps = search::knn::SearchSteps::new(phase.reps, &phase.num_threads, &phase.runs); + let results = search::knn::run(&knn, &groundtruth, steps)?; + + Ok(AggregatedSearchResults::Topk(results)) + } +} + impl search::Plugin> for plugins::Topk where DP: DataProvider + QueryType, diff --git a/diskann-benchmark/src/index/search/knn.rs b/diskann-benchmark/src/index/search/knn.rs index bc117b175..bda311cb4 100644 --- a/diskann-benchmark/src/index/search/knn.rs +++ b/diskann-benchmark/src/index/search/knn.rs @@ -171,3 +171,36 @@ where Ok(results.into_iter().map(SearchResults::new).collect()) } } + +#[cfg(feature = "experimental_diversity_search")] +impl Knn for Arc> +where + DP: diskann::provider::DataProvider, + P: diskann::neighbor::AttributeValueProvider, + core_search::graph::DiverseKNN: core_search::Search< + Id = DP::InternalId, + Parameters = diskann::graph::search::Knn, + Output = core_search::graph::knn::Metrics, + >, +{ + fn search_all( + &self, + parameters: Vec>, + groundtruth: &dyn benchmark_core::recall::Rows, + recall_k: usize, + recall_n: usize, + ) -> anyhow::Result> { + let results = core_search::search_all( + self.clone(), + parameters.into_iter(), + core_search::graph::knn::Aggregator::new( + groundtruth, + recall_k, + recall_n, + GroundTruthMode::Fixed, + ), + )?; + + Ok(results.into_iter().map(SearchResults::new).collect()) + } +} diff --git a/diskann-benchmark/src/index/search/plugins.rs b/diskann-benchmark/src/index/search/plugins.rs index de050004f..90a42d911 100644 --- a/diskann-benchmark/src/index/search/plugins.rs +++ b/diskann-benchmark/src/index/search/plugins.rs @@ -178,6 +178,28 @@ impl DeterminantDiversity { } } +/// A search plugin for diversity-aware top-k search. +#[cfg(feature = "experimental_diversity_search")] +#[derive(Debug, Clone, Copy)] +pub(crate) struct DiverseSearch; + +#[cfg(feature = "experimental_diversity_search")] +impl DiverseSearch { + pub(crate) fn is_match(phase: &SearchPhase) -> bool { + phase.as_topk_diverse_search().is_ok() + } + + pub(crate) const fn as_str() -> &'static str { + "topk-diverse-search" + } + + pub(crate) fn get( + phase: &SearchPhase, + ) -> anyhow::Result<&crate::inputs::graph_index::TopkDiverseSearchPhase> { + Ok(phase.as_topk_diverse_search()?) + } +} + /// A search plugin for range search. #[derive(Debug, Clone, Copy)] pub(crate) struct Range; diff --git a/diskann-benchmark/src/inputs/graph_index.rs b/diskann-benchmark/src/inputs/graph_index.rs index 001fb5f8e..0d9ab90b0 100644 --- a/diskann-benchmark/src/inputs/graph_index.rs +++ b/diskann-benchmark/src/inputs/graph_index.rs @@ -401,6 +401,64 @@ impl Example for TopkDeterminantDiversityPhase { } } +#[cfg(feature = "experimental_diversity_search")] +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct TopkDiverseSearchPhase { + pub(crate) queries: InputFile, + pub(crate) groundtruth: InputFile, + /// Plaintext file with one integer attribute per line; line `N` is vector `N`'s attribute. + pub(crate) attributes: InputFile, + pub(crate) reps: NonZeroUsize, + pub(crate) num_threads: Vec, + pub(crate) runs: Vec, + /// The attribute dimension used for diversity (currently only a single attribute is used). + #[serde(default)] + pub(crate) diverse_attribute_id: usize, + /// The maximum number of results to keep per distinct attribute value. + pub(crate) diverse_results_k: usize, +} + +#[cfg(feature = "experimental_diversity_search")] +impl TopkDiverseSearchPhase { + pub(crate) fn max_k(&self) -> usize { + self.runs.iter().map(|run| run.recall_k).max().unwrap_or(0) + } + + pub(crate) fn validate(&mut self, checker: &mut Checker) -> Result<(), anyhow::Error> { + if self.diverse_results_k == 0 { + return Err(anyhow!("diverse_results_k must be greater than zero")); + } + self.queries.resolve(checker)?; + self.groundtruth.resolve(checker)?; + self.attributes.resolve(checker)?; + for (i, run) in self.runs.iter_mut().enumerate() { + run.validate(checker) + .with_context(|| format!("search run {}", i))?; + } + Ok(()) + } +} + +#[cfg(feature = "experimental_diversity_search")] +impl Example for TopkDiverseSearchPhase { + fn example() -> Self { + Self { + queries: InputFile::new("path/to/queries"), + groundtruth: InputFile::new("path/to/groundtruth"), + attributes: InputFile::new("path/to/attributes.txt"), + reps: NonZeroUsize::new(1).unwrap(), + num_threads: vec![NonZeroUsize::new(1).unwrap()], + runs: vec![GraphSearch { + search_n: 10, + search_l: vec![10, 20, 30, 40], + recall_k: 10, + }], + diverse_attribute_id: 0, + diverse_results_k: 1, + } + } +} + #[derive(Debug, Deserialize, Serialize)] #[serde(tag = "search-type", rename_all = "kebab-case")] pub(crate) enum SearchPhase { @@ -410,6 +468,8 @@ pub(crate) enum SearchPhase { TopkMultihopFilter(MultihopFilterSearchPhase), TopkInlineFilter(InlineFilterSearchPhase), TopkDeterminantDiversity(TopkDeterminantDiversityPhase), + #[cfg(feature = "experimental_diversity_search")] + TopkDiverseSearch(TopkDiverseSearchPhase), } #[derive(Debug, Error)] @@ -438,6 +498,8 @@ impl SearchPhase { Self::TopkMultihopFilter(_) => SearchPhaseKind::TopkMultihopFilter, Self::TopkInlineFilter(_) => SearchPhaseKind::TopkInlineFilter, Self::TopkDeterminantDiversity(_) => SearchPhaseKind::TopkDeterminantDiversity, + #[cfg(feature = "experimental_diversity_search")] + Self::TopkDiverseSearch(_) => SearchPhaseKind::TopkDiverseSearch, } } @@ -506,6 +568,19 @@ impl SearchPhase { )), } } + + #[cfg(feature = "experimental_diversity_search")] + pub(crate) fn as_topk_diverse_search( + &self, + ) -> Result<&TopkDiverseSearchPhase, WrongSearchPhaseKind> { + match self { + Self::TopkDiverseSearch(phase) => Ok(phase), + _ => Err(WrongSearchPhaseKind::new( + SearchPhaseKind::TopkDiverseSearch, + self.kind(), + )), + } + } } impl SearchPhase { @@ -517,6 +592,8 @@ impl SearchPhase { SearchPhase::TopkMultihopFilter(phase) => phase.validate(checker), SearchPhase::TopkInlineFilter(phase) => phase.validate(checker), SearchPhase::TopkDeterminantDiversity(phase) => phase.validate(checker), + #[cfg(feature = "experimental_diversity_search")] + SearchPhase::TopkDiverseSearch(phase) => phase.validate(checker), } } } @@ -529,6 +606,8 @@ pub(crate) enum SearchPhaseKind { TopkMultihopFilter, TopkInlineFilter, TopkDeterminantDiversity, + #[cfg(feature = "experimental_diversity_search")] + TopkDiverseSearch, } impl SearchPhaseKind { @@ -540,6 +619,8 @@ impl SearchPhaseKind { Self::TopkMultihopFilter => "topk-multihop-filter", Self::TopkInlineFilter => "topk-inline-filter", Self::TopkDeterminantDiversity => "topk-determinant-diversity", + #[cfg(feature = "experimental_diversity_search")] + Self::TopkDiverseSearch => "topk-diverse-search", } } } diff --git a/diskann-benchmark/src/utils/attributes.rs b/diskann-benchmark/src/utils/attributes.rs new file mode 100644 index 000000000..513515455 --- /dev/null +++ b/diskann-benchmark/src/utils/attributes.rs @@ -0,0 +1,77 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! A file-backed [`AttributeValueProvider`] for diversity-aware benchmarks. + +use std::path::Path; + +use anyhow::Context; +use diskann::{neighbor::AttributeValueProvider, provider::HasId}; + +/// The reserved attribute bucket assigned to graph navigation nodes (frozen start points) +/// that fall outside the range of loaded per-vector attributes. +/// +/// Greedy graph search seeds traversal from the index's start points. Those points must +/// therefore carry an attribute, otherwise the diverse queue would drop them and search +/// would never expand beyond the entry node. Assigning them a dedicated bucket keeps them +/// traversable without merging them into any real attribute group. +const NAVIGATION_BUCKET: u32 = u32::MAX; + +/// An attribute value provider backed by a plaintext attribute file. +/// +/// The file is expected to contain one unsigned integer per line, where the value on the +/// `N`-th line (0-indexed) is the diversity attribute of the `N`-th vector. This matches the +/// on-disk layout produced by the labelling tools used elsewhere in the pipeline. +/// +/// Ids outside the range of loaded attributes (for example the graph's frozen start points) +/// are mapped to a reserved [`NAVIGATION_BUCKET`] so that greedy search can still traverse +/// the graph through them. +#[derive(Debug, Clone)] +pub(crate) struct FileAttributeProvider { + attributes: Vec, +} + +impl FileAttributeProvider { + /// Load attributes from the plaintext file at `path`. + /// + /// # Errors + /// + /// Returns an error if the file cannot be read or if any line fails to parse as a `u32`. + pub(crate) fn load(path: &Path) -> anyhow::Result { + let contents = std::fs::read_to_string(path) + .with_context(|| format!("while reading attribute file {}", path.display()))?; + + let attributes = contents + .lines() + .map(str::trim) + .filter(|line| !line.is_empty()) + .enumerate() + .map(|(line, value)| { + value.parse::().with_context(|| { + format!("invalid attribute value {value:?} on line {}", line + 1) + }) + }) + .collect::>>()?; + + Ok(Self { attributes }) + } +} + +impl HasId for FileAttributeProvider { + type Id = u32; +} + +impl AttributeValueProvider for FileAttributeProvider { + type Value = u32; + + fn get(&self, id: Self::Id) -> Option { + Some( + self.attributes + .get(id as usize) + .copied() + .unwrap_or(NAVIGATION_BUCKET), + ) + } +} diff --git a/diskann-benchmark/src/utils/mod.rs b/diskann-benchmark/src/utils/mod.rs index 9a25d2cc0..7d10d60ab 100644 --- a/diskann-benchmark/src/utils/mod.rs +++ b/diskann-benchmark/src/utils/mod.rs @@ -15,6 +15,9 @@ pub(crate) mod recall; pub(crate) mod streaming; pub(crate) mod tokio; +#[cfg(feature = "experimental_diversity_search")] +pub(crate) mod attributes; + const DATA_TYPE_MISMATCH: FailureScore = FailureScore(1000); pub(crate) fn match_data_type(data_type: DataType) -> Result diff --git a/test_data/disk_index_search/disk_index_siftsmall_learn_256pts_attributes.txt b/test_data/disk_index_search/disk_index_siftsmall_learn_256pts_attributes.txt new file mode 100644 index 000000000..69632cf97 --- /dev/null +++ b/test_data/disk_index_search/disk_index_siftsmall_learn_256pts_attributes.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8477e541005d06914517da07ef48b6a7dc56b3494a68eaed7cda6c124ccbe49 +size 768