Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -378,3 +378,6 @@ gperftools

# Rust
rust/target

# DiskANN unit-test scratch artifacts (generated by tests/unified_index_tests.cpp)
unified_index_test_*
348 changes: 348 additions & 0 deletions docs/unified_index_format.md

Large diffs are not rendered by default.

9 changes: 8 additions & 1 deletion include/filter_match_proxy.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,18 @@ namespace diskann
const std::vector<LabelT>& filter_labels,
LabelT unv_label);

// Ctor variant that owns its per-query scratch buffer internally.
// Used by the unified-index path (see unified_label_data_bitmask::make_match_proxy).
bitmask_filter_match(simple_bitmask_buf& bitmask_filters,
const std::vector<LabelT>& filter_labels,
LabelT unv_label);

virtual bool contain_filtered_label(uint32_t id) override;

private:
simple_bitmask_buf& _bitmask_filters;
std::vector<std::uint64_t>& _query_bitmask_buf;
std::vector<std::uint64_t> _owned_query_bitmask_buf; // populated only by the 3-arg ctor
std::vector<std::uint64_t>& _query_bitmask_buf; // refs either external or _owned
simple_bitmask_full_val _bitmask_full_val;
};

Expand Down
10 changes: 9 additions & 1 deletion include/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,17 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
DISKANN_DLLEXPORT void load(const IndexLoadParams& load_params);

DISKANN_DLLEXPORT void load(const char *index_file, uint32_t num_threads, uint32_t search_l, LabelFormatType label_format_type = LabelFormatType::String);

#endif

// Unified single-file format. See docs/unified_index_format.md.
DISKANN_DLLEXPORT void save_unified(const char *filename);
// Variant of save_unified that also emits a PQ region. Pass empty
// buffers to skip PQ (equivalent to the no-arg overload). Used by
// unified_index_builder.
DISKANN_DLLEXPORT void save_unified(const char *filename, const std::vector<uint8_t> &pq_pivots_bytes,
const std::vector<uint8_t> &pq_codes_bytes);

// get some private variables
DISKANN_DLLEXPORT size_t get_num_points();
DISKANN_DLLEXPORT size_t get_max_points();
Expand Down
11 changes: 11 additions & 0 deletions include/integer_label_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,17 @@ class integer_label_vector

bool initialize_from_file(const std::string &label_file, size_t &numpoints);

bool initialize_from_buffers(const size_t *offsets, size_t num_points,
const uint32_t *labels, size_t total_labels);

// Zero-copy load path: caller pre-sizes both buffers, writes into the raw
// pointers, and the integer_label_vector is ready to use. The two-step
// form lets the caller skip the intermediate vector<uint8_t> + assign()
// copies that initialize_from_buffers incurs.
void resize_for_load(size_t num_points, size_t total_labels);
size_t *mutable_offset_data(); // size: num_points + 1 entries (size_t each)
uint32_t *mutable_label_data(); // size: total_labels entries (uint32_t each)

bool write_to_file(const std::string &label_file) const;

template <typename LabelT>
Expand Down
12 changes: 11 additions & 1 deletion include/label_bitmask.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#include <cstdint>
#include <vector>

#include "windows_customizations.h"

namespace diskann
{

Expand Down Expand Up @@ -45,7 +47,15 @@ struct simple_bitmask_buf

};

class simple_bitmask
// NOTE: simple_bitmask stays DISKANN_DLLEXPORT even though the unit tests now
// link the static diskann_s lib (where DISKANN_DLLEXPORT is a no-op) and no
// longer need it exported. It is kept because ColorInfoVector's inline
// constructor (include/color_info.h, pulled in widely via neighbor.h) odr-uses
// simple_bitmask's out-of-line methods (ctor, get_bitmask_size), so any DLL
// consumer that instantiates it must import them. TODO: once that inline
// dependency is removed or proven unused by every DLL consumer, drop this
// export too -- simple_bitmask is otherwise an internal helper.
class DISKANN_DLLEXPORT simple_bitmask
{
public:
simple_bitmask(std::uint64_t* bitsets, std::uint64_t bitmask_size);
Expand Down
8 changes: 8 additions & 0 deletions include/pq.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@ class FixedChunkPQTable
void load_pq_centroid_bin(const char *pq_table_file, size_t num_chunks);
#endif

// In-memory variant of load_pq_centroid_bin. Parses the same on-disk
// pq_pivots blob format (outer bin -> 4 or 5 sub-bins for offsets,
// pivot table, centroid, [old per-chunk dim], chunk offsets), but reads
// straight from a caller-supplied buffer -- no temp file, no disk IO.
// Does NOT support OPQ rotation matrix (unified-format PQ is always
// standard PQ).
void load_pq_centroid_bin_from_memory(const uint8_t *blob, size_t blob_len, size_t num_chunks);

uint32_t get_num_chunks();

void preprocess_query(float *query_vec);
Expand Down
2 changes: 2 additions & 0 deletions include/pq_flash_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ template <typename T, typename LabelT = uint32_t> class PQFlashIndex
LabelFormatType label_format_type = LabelFormatType::String);
#endif

// (load_unified removed; use diskann::make_unified_index_ssd(reader, ctx) — see include/unified_index.h.)

DISKANN_DLLEXPORT void load_cache_list(std::vector<uint32_t> &node_list);

DISKANN_DLLEXPORT void cache_bfs_levels(uint64_t num_nodes_to_cache, std::vector<uint32_t> &node_list,
Expand Down
96 changes: 96 additions & 0 deletions include/unified_index.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#pragma once

#include <cstdint>
#include <functional>
#include <memory>
#include <optional>
#include <string>
#include <vector>

#include "aligned_file_reader.h"
#include "distance.h"
#include "percentile_stats.h"
#include "unified_index_format.h"
#include "windows_customizations.h"

namespace diskann
{

struct QueryStats;
struct DebugTraversalInfo;

// Load-time options handed to unified_index::load.
//   path               - location of the unified container file.
//   num_threads,
//   search_l           - used by the memory implementation to size its
//                        per-thread scratch.
//   num_nodes_to_cache - non-zero requests SSD static-cache priming; the
//                        memory implementation treats it as a no-op.
struct UnifiedLoadContext
{
    std::string path;
    uint32_t num_threads{1};
    uint32_t search_l{100};
    uint64_t num_nodes_to_cache{0};
};

// One search call's worth of state, used as both input and output. The
// caller populates the input fields and provides the output buffers;
// search() then writes results (and telemetry, when requested) straight into
// them. search() itself performs no allocation.
struct UnifiedSearchContext
{
    // ---- Inputs ----

    // Query vector; the caller supplies it as a `const T*`.
    const void *query{nullptr};
    size_t K{10};
    uint32_t L{100};

    // User-facing label strings to filter on. Must be non-empty when the
    // loaded index has labels and empty when it does not. The index maps the
    // strings to its internal integer labels according to its encoding.
    std::vector<std::string> filter_labels;

    // The following three inputs are SSD-only.
    std::optional<uint32_t> beam_width;
    std::optional<uint32_t> io_limit;
    std::function<float(const std::uint8_t *, size_t)> rerank_fn;

    // ---- Outputs (caller-allocated, each of length >= K) ----
    uint64_t *indices{nullptr};
    float *distances{nullptr};

    // ---- Optional telemetry sinks (nullptr disables telemetry) ----
    QueryStats *stats{nullptr};
    DebugTraversalInfo *debug_info{nullptr};
};

// Non-templated public interface returned by the factory. Users program
// against this; the templated `unified_index_base<T>` implements it.
// Every operation below is pure virtual, so this class cannot be
// instantiated on its own.
class unified_index
{
public:
// Virtual so an implementation can be destroyed through a unified_index pointer.
virtual ~unified_index() = default;

// Loads the index using the options in `ctx` (see UnifiedLoadContext).
virtual void load(const UnifiedLoadContext &ctx) = 0;
// Runs one query. `ctx` carries the inputs and receives the outputs
// (see UnifiedSearchContext), hence the non-const reference.
virtual void search(UnifiedSearchContext &ctx) = 0;

// Read-only accessors describing the index.
virtual const UnifiedIndexHeader &header() const = 0;
virtual uint64_t num_points() const = 0;
virtual uint64_t dim() const = 0;
virtual uint64_t aligned_dim() const = 0;
virtual diskann::Metric metric() const = 0;
virtual DataTypeTag data_type() const = 0;
virtual bool has_labels() const = 0;

// Resident memory / cardinality accounting for the loaded index, mirroring
// Index::get_table_stats() and PQFlashIndex::get_table_stats().
// Returned by value.
virtual TableStats get_table_stats() const = 0;
};

// Factory: open a unified file fully in memory. Peeks the 4 KiB header,
// dispatches on `data_type`, instantiates the right templated implementation,
// calls load(ctx), returns the owning pointer as the non-templated interface.
// NOTE(review): unlike the public entry points shown in index.h and
// pq_flash_index.h, this declaration carries no DISKANN_DLLEXPORT -- confirm
// whether DLL consumers are expected to call it.
std::unique_ptr<unified_index> make_unified_index_memory(const UnifiedLoadContext &ctx);

// Factory: open a unified file in disk-resident (SSD) mode. The supplied
// AlignedFileReader is handed to the constructed unified_index_ssd<T>.
// `reader` is taken by value as a shared_ptr.
// NOTE(review): also not marked DISKANN_DLLEXPORT -- confirm, as above.
std::unique_ptr<unified_index> make_unified_index_ssd(
std::shared_ptr<AlignedFileReader> reader, const UnifiedLoadContext &ctx);

} // namespace diskann
109 changes: 109 additions & 0 deletions include/unified_index_base.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#pragma once

#include <cstdint>
#include <memory>
#include <string>

#include "distance.h"
#include "unified_index.h"
#include "unified_index_format.h"
#include "unified_label_data.h"
#include "unified_node_store.h"
#include "windows_customizations.h"

namespace diskann
{

class UnifiedIndexReader;

// Templated implementation of the non-templated `unified_index` interface.
// Holds the parsed header, the metric, the label data (built by
// make_unified_label_data), and the node store (a unified_node_store_memory<T>
// or unified_node_store_ssd<T>, plugged in by the derived class's
// `load_storage`).
//
// Derived classes supply the storage-specific pieces through the protected
// pure-virtual hooks (load_storage, search_impl, fill_storage_stats).
template <typename T> class unified_index_base : public unified_index
{
  public:
    // `metric` is stored and later reported by metric().
    explicit unified_index_base(diskann::Metric metric);
    ~unified_index_base() override;

    void load(const UnifiedLoadContext &ctx) override;
    void search(UnifiedSearchContext &ctx) override;

    // The accessors below report fields of the stored header / members.
    const UnifiedIndexHeader &header() const override
    {
        return _header;
    }
    uint64_t num_points() const override
    {
        return _header.npts;
    }
    uint64_t dim() const override
    {
        return _header.dim;
    }
    uint64_t aligned_dim() const override
    {
        return _header.aligned_dim;
    }
    diskann::Metric metric() const override
    {
        return _metric;
    }
    // Derived from the template argument, not from the header.
    DataTypeTag data_type() const override
    {
        return data_type_tag_of<T>();
    }
    // False when no label data object exists, otherwise defers to it.
    bool has_labels() const override
    {
        return _labels && _labels->has_labels();
    }
    // Returns a copy of the cached stats.
    TableStats get_table_stats() const override
    {
        return _table_stats;
    }

    // Templated read-only accessors for in-process callers that *do* know T
    // (unit tests, the index's own search loop). Not on the public interface.
    // Each returns a non-owning pointer, which is null while the
    // corresponding member is unset.
    const unified_label_data_base *labels() const
    {
        return _labels.get();
    }
    const unified_node_store_base<T> *nodes() const
    {
        return _store.get();
    }
    unified_node_store_base<T> *nodes()
    {
        return _store.get();
    }

  protected:
    // Derived class is responsible for instantiating the right _store subclass
    // and calling its load(). It may inspect ctx for SSD-only knobs like
    // ctx.num_nodes_to_cache.
    virtual void load_storage(UnifiedIndexReader &r, const UnifiedLoadContext &ctx) = 0;
    virtual void search_impl(UnifiedSearchContext &ctx) = 0;

    // Fill the storage-specific resident-memory fields (node_mem_usage,
    // graph_mem_usage) of `stats`. Memory reports resident coords/graph; SSD
    // reports the resident PQ codes (graph lives on disk). Called by load()
    // after load_storage() so the store is populated.
    virtual void fill_storage_stats(TableStats &stats) const = 0;

    void validate_header(const UnifiedIndexHeader &h) const;
    void validate_search_context(const UnifiedSearchContext &ctx) const;

    UnifiedIndexHeader _header{};
    diskann::Metric _metric;
    std::unique_ptr<unified_label_data_base> _labels;   // nullptr when header has no labels
    std::unique_ptr<unified_node_store_base<T>> _store; // built by derived load_storage()
    std::string _index_path;
    // Value-initialized (like _header) so that get_table_stats() returns
    // well-defined values even if it is called before the stats are filled.
    // NOTE(review): TableStats is declared in percentile_stats.h, not visible
    // here; if it already has default member initializers this is a no-op.
    TableStats _table_stats{};
};

} // namespace diskann
74 changes: 74 additions & 0 deletions include/unified_index_builder.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#pragma once

#include <cstdint>
#include <memory>
#include <string>

#include "distance.h"
#include "unified_index_format.h"
#include "windows_customizations.h"

namespace diskann
{

// Complete parameter set for building a unified-format index file.
//
// A single runtime-typed struct (not a template): `data_type` picks the
// concrete `Index<T>` instantiated internally, and the coordinates are read
// from `data_file_path`, which is in `.bin` format (the legacy DiskANN file
// layout).
struct UnifiedBuildContext
{
    // --- Input data ---
    // .bin file containing N points x dim coordinates of type `data_type`.
    std::string data_file_path;
    DataTypeTag data_type{DataTypeTag::Float};
    diskann::Metric metric{diskann::Metric::L2};

    // --- Graph build parameters (Vamana) ---
    uint32_t R{64};          // maximum degree
    uint32_t L{100};         // search list size used while building
    float alpha{1.2f};       // pruning alpha
    uint32_t num_threads{0}; // 0 means use omp_get_num_procs()

    // --- PQ parameters ---
    // pq_dim == 0        : PQ is skipped. The result is a memory-only unified
    //                      file, which the SSD load path rejects.
    // 0 < pq_dim < dim   : PQ is trained with `pq_dim` chunks on a sampled
    //                      subset; pivots + codes are emitted into the file.
    // pq_dim >= dim      : PQ is trained with `dim` chunks (chunk size 1,
    //                      full precision per dimension). The value is clamped
    //                      so that the SSD load path -- which requires HAS_PQ --
    //                      can always load the produced file.
    uint32_t pq_dim{0};
    // Fraction of points sampled for pivot training; the implementation
    // clamps this value.
    double pq_sampling_rate{0.1};

    // --- Optional filtered-index inputs ---
    std::string label_file;      // per-point labels (.txt); empty means unfiltered
    std::string universal_label; // string treated as "any label"
    bool use_integer_labels{false};

    // --- Output ---
    std::string output_path; // where the unified container file is written
};

// Builds a unified-format index file end-to-end: trains the Vamana graph from
// the input data file, optionally trains PQ on a sampled subset, then writes
// graph + medoids + (optional) PQ + (optional) labels into the unified
// container at `ctx.output_path`.
//
// Class shape (instead of free function) leaves room for future stateful build
// modes (incremental build, multi-pass, etc.). For now `build()` is the only
// method.
class unified_index_builder
{
public:
// Constructor and destructor are declared here and defined out of line;
// the class declares no data members.
unified_index_builder();
~unified_index_builder();

// Runs the whole build described by `ctx` (see UnifiedBuildContext).
// Throws ANNException on failure (file open, mismatched dims, build crash,
// PQ training error, etc.). Returns successfully when the unified file is
// fully written and closed.
void build(const UnifiedBuildContext &ctx);
};

} // namespace diskann
Loading
Loading