Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
97b36ef
finish up recall computation patch
May 14, 2026
07b3671
Merge branch 'main' of github.com:microsoft/DiskANN
May 15, 2026
eac3ffb
Merge branch 'main' of github.com:microsoft/DiskANN
May 19, 2026
17780f8
fix conflict
May 22, 2026
43eb517
fix conflict
May 22, 2026
1d3a52b
Merge branch 'main' of github.com:microsoft/DiskANN
May 25, 2026
17eac62
Merge branch 'main' of github.com:microsoft/DiskANN
May 26, 2026
0ab4baa
Merge branch 'main' of github.com:microsoft/DiskANN
May 27, 2026
4ddca60
Merge branch 'main' of github.com:microsoft/DiskANN
Jun 2, 2026
54ee01b
fix conflict
Jun 2, 2026
9e1743f
Merge branch 'main' of github.com:microsoft/DiskANN
Jun 5, 2026
93504e6
Merge branch 'main' of github.com:microsoft/DiskANN
Jun 8, 2026
b7c27ce
Merge branch 'main' of github.com:microsoft/DiskANN
Jun 11, 2026
1dafc55
Merge branch 'main' of github.com:microsoft/DiskANN
Jun 12, 2026
33285e2
Merge branch 'main' of github.com:microsoft/DiskANN
Jun 15, 2026
824bdb3
Merge branch 'main' of github.com:microsoft/DiskANN
Jun 16, 2026
826600f
Merge branch 'main' of github.com:microsoft/DiskANN
Jun 26, 2026
1bea9c5
Merge branch 'main' of github.com:microsoft/DiskANN
Jun 30, 2026
c864722
add range groundtruth calculator, rename vector_filters_file
Jun 30, 2026
bd05e0f
fmt, clippy, add missing file
Jun 30, 2026
1477ac4
Potential fix for pull request finding
magdalendobson Jun 30, 2026
4f6f80a
Potential fix for pull request finding
magdalendobson Jun 30, 2026
2044ba1
fix: return error for filter bitmap mismatch
Copilot Jun 30, 2026
df844be
standardize naming to kebab case
Jun 30, 2026
6827f9d
standardize variable names, one more kebab-case instance
Jun 30, 2026
eb70a6f
merge with main
Jun 30, 2026
0b53bd7
fix: remove invalid clap default for base-file-labels option
Copilot Jul 1, 2026
f8b6bb9
fix: remove invalid default for optional associated data arg
Copilot Jul 1, 2026
34c4af1
Fix optional query-file-labels clap default handling
Copilot Jul 1, 2026
1b60ce2
remove accidentally added file
Jul 1, 2026
a78ce37
Merge branch 'main' of github.com:microsoft/DiskANN into users/magdal…
Jul 1, 2026
7c5d71c
Merge branch 'users/magdalen/standardize_naming_in_tools' of github.c…
Jul 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions diskann-tools/src/bin/compute_groundtruth.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,40 +108,40 @@ fn main() -> CMDResult<()> {
#[derive(Debug, Parser)]
struct ComputeGroundTruthArgs {
/// data type <int8/uint8/float/fp16> (defaults to float)
#[arg(long = "data_type", default_value = "float")]
#[arg(long = "data-type", default_value = "float")]
pub data_type: DataType,

/// Distance function to use.
#[arg(long = "dist_fn", default_value = "l2")]
#[arg(long = "dist-fn", default_value = "l2")]
pub distance_function: Metric,

/// File containing the base vectors in binary format
#[arg(long = "base_file", short, required = true)]
#[arg(long = "base-file", short, required = true)]
pub base_file: String,

#[arg(long = "base_file_labels", default_value = None)]
#[arg(long = "base-file-labels", default_value = None)]
pub base_file_labels: Option<String>,

/// File containing the query vectors in binary format
#[arg(long = "query_file", short, required = true)]
#[arg(long = "query-file", short, required = true)]
pub query_file: String,

#[arg(long = "query_file_labels", default_value = None)]
#[arg(long = "query-file-labels", default_value = None)]
pub query_file_labels: Option<String>,
Comment thread
magdalendobson marked this conversation as resolved.

/// Path of the file to write the ground truth to in binary format. Please don't append .bin at the end if no filter_label or filter_label_file is provided. It will save the file with '.bin' at the end. Otherwise it will save the file as filename_label.bin.
#[arg(long = "gt_file", short, required = true)]
#[arg(long = "gt-file", short, required = true)]
pub ground_truth_file: String,

/// Filter bitmap file in the range ground truth format
#[arg(long = "filter_bitmap_file", short, default_value = None)]
#[arg(long = "filter-bitmap-file", short, default_value = None)]
pub filter_bitmap_file: Option<String>,

/// Number of ground truth nearest neighbors to compute
#[arg(long = "recall_at", short = 'K', default_value = "10")]
#[arg(long = "recall-at", short = 'K', default_value = "10")]
pub recall_at: u32,

/// File containing the associated data in binary format
#[arg(long = "associated_data_file", required = false, default_value = None)]
#[arg(long = "associated-data-file")]
pub associated_data_file: Option<String>,
Comment thread
magdalendobson marked this conversation as resolved.
}
16 changes: 8 additions & 8 deletions diskann-tools/src/bin/compute_multivec_groundtruth.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,36 +94,36 @@ fn main() -> CMDResult<()> {
#[derive(Debug, Parser)]
struct ComputeMultivecGroundTruthArgs {
/// data type <int8/uint8/float/fp16> (defaults to float)
#[arg(long = "data_type", default_value = "float")]
#[arg(long = "data-type", default_value = "float")]
pub data_type: DataType,

/// Distance function to use.
#[arg(long = "dist_fn", default_value = "l2")]
#[arg(long = "dist-fn", default_value = "l2")]
pub distance_function: Metric,

/// Whether to use average or min aggregation
#[arg(long = "aggregation", default_value = "average")]
pub aggregation: MultivecAggregationMethod,

/// File containing the base vectors in binary format
#[arg(long = "base_file", short, required = true)]
#[arg(long = "base-file", short, required = true)]
pub base_file: String,

#[arg(long = "base_file_labels", default_value = None)]
#[arg(long = "base-file-labels", default_value = None)]
pub base_file_labels: Option<String>,

/// File containing the query vectors in binary format
#[arg(long = "query_file", short, required = true)]
#[arg(long = "query-file", short, required = true)]
pub query_file: String,

#[arg(long = "query_file_labels", default_value = None)]
#[arg(long = "query-file-labels")]
pub query_file_labels: Option<String>,
Comment thread
magdalendobson marked this conversation as resolved.

/// Path of the file to write the ground truth to in binary format. Please don't append .bin at the end if no filter_label or filter_label_file is provided. It will save the file with '.bin' at the end. Otherwise it will save the file as filename_label.bin.
#[arg(long = "gt_file", short, required = true)]
#[arg(long = "gt-file", short, required = true)]
pub ground_truth_file: String,

/// Number of ground truth nearest neighbors to compute
#[arg(long = "recall_at", short = 'K', default_value = "10")]
#[arg(long = "recall-at", short = 'K', default_value = "10")]
pub recall_at: u32,
}
16 changes: 8 additions & 8 deletions diskann-tools/src/bin/compute_range_groundtruth.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,35 +93,35 @@ fn main() -> CMDResult<()> {
#[derive(Debug, Parser)]
struct ComputeRangeGroundTruthArgs {
/// data type <int8/uint8/float/fp16>
#[arg(long = "data_type", default_value = "float")]
#[arg(long = "data-type", default_value = "float")]
pub data_type: DataType,

/// Distance function to use.
#[arg(long = "dist_fn", default_value = "l2")]
#[arg(long = "dist-fn", default_value = "l2")]
pub distance_function: Metric,

/// File containing the base vectors in binary format
#[arg(long = "base_file", short, required = true)]
#[arg(long = "base-file", short, required = true)]
pub base_file: String,

/// Optional labels file for base vectors
#[arg(long = "base_file_labels", default_value = None)]
#[arg(long = "base-file-labels")]
pub base_file_labels: Option<String>,
Comment thread
magdalendobson marked this conversation as resolved.

/// File containing the query vectors in binary format
#[arg(long = "query_file", short, required = true)]
#[arg(long = "query-file", short, required = true)]
pub query_file: String,

/// Optional labels file for query vectors
#[arg(long = "query_file_labels", default_value = None)]
#[arg(long = "query-file-labels", default_value = None)]
pub query_file_labels: Option<String>,

/// Path of the file to write range ground truth to in binary format
#[arg(long = "gt_file", short, required = true)]
#[arg(long = "gt-file", short, required = true)]
pub ground_truth_file: String,

/// Filter bitmap file in range ground truth format
#[arg(long = "filter_bitmap_file", short, default_value = None)]
#[arg(long = "filter-bitmap-file", short, default_value = None)]
pub filter_bitmap_file: Option<String>,

/// Radius threshold used to include neighbors in range-groundtruth
Expand Down
6 changes: 3 additions & 3 deletions diskann-tools/src/bin/compute_specificities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ use std::process;
)]
struct Args {
/// File containing the base labels
#[arg(long = "base_label_file", short = 'b')]
#[arg(long = "base-file-labels", short = 'b')]
pub base_label_file: String,

/// File containing the query labels
#[arg(long = "query_label_file", short = 'q')]
#[arg(long = "query-file-labels", short = 'q')]
pub query_label_file: String,

/// Output file for specificities (optional)
#[arg(long = "specificity_output_file", short = 'o')]
#[arg(long = "specificity-output-file", short = 'o')]
pub specificity_output_file: Option<String>,
}

Expand Down
2 changes: 1 addition & 1 deletion diskann-tools/src/bin/gen_associated_data_from_range.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ fn main() -> CMDResult<()> {

#[derive(Debug, Parser)]
struct GenAssociatedDataFromRangeArgs {
#[arg(long = "associated_data_path")]
#[arg(long = "associated-data-file")]
pub associated_data_path: String,

#[arg(long = "start")]
Expand Down
4 changes: 2 additions & 2 deletions diskann-tools/src/bin/generate_minmax.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ use rand::{rngs::StdRng, SeedableRng};
#[command(author, version, about, long_about = None)]
struct Args {
/// Input binary file path containing vector data
#[arg(short, long)]
#[arg(short, long = "input-file")]
input: String,

/// Output binary file path for quantized vectors
#[arg(short, long)]
#[arg(short, long = "output-file")]
output: String,

/// Number of bits for quantization (1, 2, 4, or 8)
Expand Down
14 changes: 7 additions & 7 deletions diskann-tools/src/bin/generate_pq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,30 +52,30 @@ fn main() -> Result<(), CMDToolError> {
#[derive(Debug, Parser)]
struct BuildPQArgs {
/// data type <int8/uint8/float/fp16> (defaults to float)
#[arg(long = "data_type", default_value = "float")]
#[arg(long = "data-type", default_value = "float")]
pub data_type: DataType,

/// Distance function to use.
#[arg(long = "dist_fn", default_value = "l2")]
#[arg(long = "dist-fn", default_value = "l2")]
pub dist_fn: Metric,

/// Path to the data file. The file should be in the format specified by the `data_type` argument.
#[arg(long = "data_path", short, required = true)]
#[arg(long = "data-file", short, required = true)]
pub data_path: String,

/// Path to the index file. The index will be saved to this prefixed name.
#[arg(long = "index_path_prefix", short, required = true)]
#[arg(long = "index-path-prefix", short, required = true)]
pub index_path_prefix: String,

/// Number of threads to use.
#[arg(long = "num_threads", short = 'T')]
#[arg(long = "num-threads", short = 'T')]
pub num_threads: Option<usize>,

/// Ratio of PQ training set size to data size
#[arg(long = "p_val", short = 'p', default_value = "0.1")]
#[arg(long = "p-val", short = 'p', default_value = "0.1")]
pub p_val: f64,

/// Number of PQ bytes
#[arg(long = "pq_bytes", short, default_value = "10")]
#[arg(long = "pq-bytes", short, default_value = "10")]
pub pq_bytes: usize,
}
2 changes: 1 addition & 1 deletion diskann-tools/src/bin/generate_synthetic_labels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use tracing::{error, info};
#[derive(Debug, Parser)]
struct GenerateSyntheticLabelsArgs {
/// Filename for saving the label file
#[arg(long = "output_file", required = true)]
#[arg(long = "output-file", required = true)]
pub output_file: String,

/// Number of vectors
Expand Down
4 changes: 2 additions & 2 deletions diskann-tools/src/bin/random_data_generator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ use diskann_tools::utils::{write_random_data, CMDResult, CMDToolError, DataType}
#[derive(Debug, Parser)]
struct RandomDataGeneratorArgs {
/// data type <int8/uint8/float/fp16> (required)
#[arg(long = "data_type", required = true)]
#[arg(long = "data-type", required = true)]
pub data_type: DataType,

/// File name for saving the random vectors
#[arg(long = "output_file", required = true)]
#[arg(long = "output-file", required = true)]
pub output_file: String,

/// Dimensionality of the vector
Expand Down
12 changes: 6 additions & 6 deletions diskann-tools/src/bin/relative_contrast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,26 +93,26 @@ fn main() -> CMDResult<()> {
#[derive(Debug, Parser)]
struct RelativeContrastArgs {
/// Data type <int8/uint8/float/fp16>
#[arg(long = "data_type", default_value = "fp16")]
#[arg(long = "data-type", default_value = "fp16")]
pub data_type: DataType,

/// Vector data file path
#[arg(long = "data_file", short, required = true)]
#[arg(long = "data-file", short, required = true)]
pub data_file: String,

/// Query file in binary format
#[arg(long = "query_file", short, required = true)]
#[arg(long = "query-file", short, required = true)]
pub query_file: String,

/// Ground truth file for the queryset
#[arg(long = "gt_file", required = true)]
#[arg(long = "gt-file", required = true)]
pub gt_file: String,

/// Number of neighbors to use from ground truth
#[arg(long = "recall_at", short = 'K', default_value = "10")]
#[arg(long = "recall-at", short = 'K', default_value = "10")]
pub recall_at: usize,

/// Number of random distances to average per query
#[arg(long = "search_list", short = 'L', default_value = "10")]
#[arg(long = "search-list", short = 'L', default_value = "10")]
pub search_list: usize,
}
6 changes: 5 additions & 1 deletion diskann-tools/src/bin/subsample_bin.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,23 @@ use diskann_utils::io::Metadata;
#[command(name = "subsample_bin", about = "Subsample vectors from a binary file")]
struct Args {
/// Data type of the vectors, one of: float, int8, uint8, fp16
#[arg(value_enum)]
#[arg(long = "data-type", value_enum)]
data_type: DataType,

/// Input base binary file
#[arg(long = "base-bin-file")]
Comment thread
magdalendobson marked this conversation as resolved.
base_bin_file: PathBuf,

/// Output file for sampled vectors
#[arg(long = "sampled-output-file")]
sampled_output_file: PathBuf,

/// Sampling probability between 0 and 1, for example 0.1
#[arg(long = "sampling-probability")]
sampling_probability: f64,

/// Optional random seed for reproducible sampling
#[arg(long = "random-seed")]
random_seed: Option<u64>,
}

Expand Down
Loading