From 52997da2051966e63c2ae480be3f33e7a4e7921d Mon Sep 17 00:00:00 2001 From: Julian Miller Date: Tue, 26 May 2026 08:32:53 +0200 Subject: [PATCH 1/6] Add HNSW layered hierarchy --- cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu | 11 +- .../ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h | 30 +- cpp/include/cuvs/neighbors/hnsw.hpp | 18 +- cpp/src/neighbors/detail/hnsw.hpp | 1125 ++++++++++++++++- cpp/tests/neighbors/ann_hnsw_ace.cuh | 257 ++++ .../ann_hnsw_ace/test_float_uint32_t.cu | 10 + .../ann_hnsw_ace/test_half_uint32_t.cu | 20 + .../ann_hnsw_ace/test_int8_t_uint32_t.cu | 20 + .../ann_hnsw_ace/test_uint8_t_uint32_t.cu | 20 + examples/cpp/CMakeLists.txt | 2 + examples/cpp/src/hnsw_ace_layered_example.cu | 278 ++++ 11 files changed, 1723 insertions(+), 68 deletions(-) create mode 100644 examples/cpp/src/hnsw_ace_layered_example.cu diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu index 26028b6d98..b535ced763 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu @@ -1,9 +1,10 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #include "../common/ann_types.hpp" +#include "../common/conf.hpp" #include "cuvs_ann_bench_param_parser.h" #include "cuvs_cagra_hnswlib_wrapper.h" @@ -27,6 +28,9 @@ auto parse_build_param(const nlohmann::json& conf) -> hnsw_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::CPU; } else if (conf.at("hierarchy") == "gpu") { hnsw_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::GPU; + } else if (conf.at("hierarchy") == "gpu_layered_on_disk" || + conf.at("hierarchy") == "gpu_layered" || conf.at("hierarchy") == "layered") { + hnsw_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK; } else { THROW("Invalid value for hierarchy: %s", conf.at("hierarchy").get().c_str()); } @@ -36,6 +40,11 @@ auto parse_build_param(const nlohmann::json& conf) -> if (conf.contains("ef_construction")) { hnsw_params.ef_construction = conf.at("ef_construction"); } + if (conf.contains("dataset_path")) { + hnsw_params.dataset_path = conf.at("dataset_path"); + } else if (hnsw_params.hierarchy == cuvs::neighbors::hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK) { + hnsw_params.dataset_path = configuration::singleton().get_dataset_conf().base_file; + } if (conf.contains("num_threads")) { hnsw_params.num_threads = conf.at("num_threads"); } // Reuse the CAGRA wrapper params parser diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h index 2f0c54e1bd..0104ccdb3e 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -9,10 +9,26 @@ #include #include +#include #include namespace cuvs::bench { +inline void copy_file_overwrite(const std::filesystem::path& src, const std::filesystem::path& dst) +{ + std::error_code ec; + if (src == dst || + (std::filesystem::exists(dst, ec) && std::filesystem::equivalent(src, dst, ec))) { + return; + } + if (!dst.parent_path().empty()) { std::filesystem::create_directories(dst.parent_path()); } + + std::filesystem::copy_file(src, dst, std::filesystem::copy_options::overwrite_existing, ec); + const auto src_str = src.string(); + const auto dst_str = dst.string(); + RAFT_EXPECTS(!ec, "Failed to copy '%s' to '%s'.", src_str.c_str(), dst_str.c_str()); +} + template class cuvs_cagra_hnswlib : public algo, public algo_gpu { public: @@ -130,6 +146,18 @@ void cuvs_cagra_hnswlib::set_search_param(const search_param_base& para template void cuvs_cagra_hnswlib::save(const std::string& file) const { + if (build_param_.hnsw_index_params.hierarchy == + cuvs::neighbors::hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK) { + const auto src_artifact = std::filesystem::path(hnsw_index_->file_path()); + RAFT_EXPECTS(!src_artifact.empty(), "Layered HNSW artifact path is not available."); + RAFT_EXPECTS(std::filesystem::exists(src_artifact), + "Layered HNSW artifact '%s' does not exist.", + src_artifact.c_str()); + + copy_file_overwrite(src_artifact, std::filesystem::path(file)); + return; + } + if (cagra_ace_build_) { std::string index_filename = hnsw_index_->file_path(); RAFT_EXPECTS(!index_filename.empty(), "HNSW index file path is not available."); diff --git a/cpp/include/cuvs/neighbors/hnsw.hpp b/cpp/include/cuvs/neighbors/hnsw.hpp index fb726fed71..5800e05a96 100644 --- a/cpp/include/cuvs/neighbors/hnsw.hpp +++ b/cpp/include/cuvs/neighbors/hnsw.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -41,9 +42,10 @@ namespace graph_build_params = cuvs::neighbors::graph_build_params; * NOTE: When the value is `NONE`, the HNSW index is built as a base-layer-only index. */ enum class HnswHierarchy { - NONE, // base-layer-only index - CPU, // full index with CPU-built hierarchy - GPU // full index with GPU-built hierarchy + NONE, // base-layer-only index + CPU, // full index with CPU-built hierarchy + GPU, // full index with GPU-built hierarchy + GPU_LAYERED_ON_DISK // GPU-built hierarchy stored as layered on-disk topology }; struct index_params : cuvs::neighbors::index_params { @@ -64,6 +66,16 @@ struct index_params : cuvs::neighbors::index_params { */ size_t M = 32; + /** Local dataset path used by layered HNSW deserialization. + * + * When `hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK`, the index artifact stores graph + * topology only. `deserialize` loads vectors from this local dataset path to reconstruct an + * in-memory HNSW index. + * Currently supported local dataset formats are `.npy` and ANN benchmark `*.bin` files with a + * `[uint32 rows, uint32 cols]` header. + */ + std::string dataset_path; + /** Parameters to fine tune GPU graph building. By default we select the parameters based on * dataset shape and HNSW build parameters. You can override these parameters to fine tune the * graph building process as described in the CAGRA build docs. diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp index 4914a0fa1b..44dc4e688e 100644 --- a/cpp/src/neighbors/detail/hnsw.hpp +++ b/cpp/src/neighbors/detail/hnsw.hpp @@ -23,15 +23,28 @@ #include #include +#include +#include +#include +#include +#include +#include #include #include #include +#include #include #include +#include +#include #include +#include +#include #include #include #include +#include +#include namespace cuvs::neighbors::hnsw::detail { @@ -166,6 +179,9 @@ struct index_impl : index { RAFT_LOG_INFO("Loading HNSW index from disk: %s", filepath.c_str()); try { + RAFT_EXPECTS(this->hierarchy() != HnswHierarchy::GPU_LAYERED_ON_DISK, + "Layered HNSW indexes must be loaded with hnsw::deserialize so a local dataset " + "can be provided through index_params.dataset_path."); appr_alg_ = std::make_unique::type>>( space_.get(), filepath); if (this->hierarchy() == HnswHierarchy::NONE) { appr_alg_->base_layer_only = true; } @@ -341,6 +357,796 @@ void all_neighbors_graph(raft::resources const& res, } } +struct hnsw_level_plan { + size_t n_rows = 0; + std::vector hist; + // Bucket-end offsets after order construction. Level L starts at offsets[L - 1]. + std::vector offsets; + std::vector order; + std::vector order_bw; + std::vector levels; + + [[nodiscard]] auto max_level() const -> int + { + return hist.empty() ? 0 : static_cast(hist.size() - 1); + } + + [[nodiscard]] auto promoted_count() const -> size_t + { + return hist.empty() ? 0 : n_rows - hist[0]; + } +}; + +inline auto hnsw_upper_layer_degree(size_t row_count, size_t M) -> size_t +{ + return row_count > M ? M : row_count > 0 ? row_count - 1 : 0; +} + +inline auto elapsed_ms_since(std::chrono::steady_clock::time_point start) -> int64_t +{ + return std::chrono::duration_cast(std::chrono::steady_clock::now() - + start) + .count(); +} + +inline auto to_gib(size_t bytes) -> double +{ + return static_cast(bytes) / (1024.0 * 1024.0 * 1024.0); +} + +inline auto throughput_gib_per_s(size_t bytes, int64_t elapsed_ms) -> double +{ + return elapsed_ms > 0 ? to_gib(bytes) / (elapsed_ms / 1000.0) : 0.0; +} + +inline auto make_hnsw_level_plan_from_levels(size_t n_rows, + std::vector&& levels, + bool build_reverse_order, + const char* log_prefix = nullptr) -> hnsw_level_plan +{ + hnsw_level_plan plan; + plan.n_rows = n_rows; + plan.levels = std::move(levels); + RAFT_EXPECTS(n_rows > 0, "HNSW hierarchy requires at least one row"); + RAFT_EXPECTS(plan.levels.size() == n_rows, + "HNSW level count (%zu) must match row count (%zu)", + plan.levels.size(), + n_rows); + + for (auto level : plan.levels) { + while (static_cast(level) >= plan.hist.size()) { + plan.hist.push_back(0); + } + plan.hist[level]++; + } + + plan.offsets.resize(plan.hist.size() + 1, 0); + for (size_t i = 0; i < plan.hist.size() - 1; ++i) { + plan.offsets[i + 1] = plan.offsets[i] + plan.hist[i]; + if (log_prefix != nullptr) { + RAFT_LOG_INFO("%s%zu : %zu", log_prefix, i + 1, n_rows - plan.offsets[i + 1]); + } + } + + plan.order.resize(n_rows); + if (build_reverse_order) { plan.order_bw.resize(n_rows); } + for (size_t i = 0; i < n_rows; ++i) { + const auto level = static_cast(plan.levels[i]); + if (build_reverse_order) { plan.order_bw[i] = plan.offsets[level]; } + plan.order[plan.offsets[level]++] = i; + } + + return plan; +} + +template +auto make_random_hnsw_level_plan(size_t n_rows, HnswAlgo& appr_algo, const char* log_prefix) + -> hnsw_level_plan +{ + std::vector levels(n_rows); + for (int64_t i = 0; i < static_cast(n_rows); ++i) { + const auto pt_level = appr_algo.getRandomLevel(appr_algo.mult_); + RAFT_EXPECTS(pt_level <= std::numeric_limits::max(), + "HNSW serialization only supports levels up to %u", + static_cast(std::numeric_limits::max())); + levels[i] = static_cast(pt_level); + } + + return make_hnsw_level_plan_from_levels(n_rows, std::move(levels), true, log_prefix); +} + +template +void build_hnsw_upper_layer_graphs( + raft::resources const& res, + raft::host_matrix_view promoted_dataset, + const hnsw_level_plan& plan, + size_t M, + cuvs::distance::DistanceType metric, + Callback&& callback) +{ + static_assert(std::is_same_v, + "HNSW upper-layer graph construction expects uint32_t neighbor ids"); + + const auto dim = static_cast(promoted_dataset.extent(1)); + for (size_t pt_level = 1; pt_level < plan.hist.size(); ++pt_level) { + const auto start_idx = plan.offsets[pt_level - 1]; + const auto removed_rows = start_idx - plan.offsets[0]; + const auto row_count = plan.n_rows - start_idx; + const auto neighbor_size = hnsw_upper_layer_degree(row_count, M); + + RAFT_LOG_INFO("Compute hierarchy neighbors level %zu", pt_level); + auto host_neighbors = raft::make_host_matrix( + static_cast(row_count), static_cast(neighbor_size)); + + if (neighbor_size > 0) { + auto layer_dataset_view = raft::make_host_matrix_view( + promoted_dataset.data_handle() + removed_rows * dim, + static_cast(row_count), + promoted_dataset.extent(1)); + all_neighbors_graph(res, layer_dataset_view, host_neighbors.view(), metric); + } + + callback(pt_level, start_idx, host_neighbors); + } +} + +struct layered_hnsw_layer_info { + size_t level = 0; + size_t row_count = 0; + size_t degree = 0; + size_t node_offset = 0; + size_t link_offset = 0; +}; + +struct layered_hnsw_file_metadata { + size_t n_rows = 0; + size_t dim = 0; + size_t M = 0; + size_t maxM = 0; + size_t maxM0 = 0; + size_t ef_construction = 0; + size_t base_degree = 0; + size_t levels_bytes = 0; + size_t base_link_row_bytes = 0; + size_t base_links_bytes = 0; + size_t upper_nodes_count = 0; + size_t upper_nodes_bytes = 0; + size_t upper_link_row_bytes = 0; + size_t upper_links_bytes = 0; + double mult = 0.0; + int maxlevel = 0; + int enterpoint_node = 0; + std::vector layers; +}; + +struct layered_hnsw_file_header { + char magic[32]; + uint32_t version; + uint32_t reserved; + uint64_t metadata_size; + uint64_t metadata_offset; +}; + +constexpr const char* layered_hnsw_magic = "CUVS_HNSW_LAYERED"; +constexpr uint32_t layered_hnsw_version = 1; +constexpr size_t layered_hnsw_alignment = 64; + +inline auto align_up(size_t value, size_t alignment) -> size_t +{ + return ((value + alignment - 1) / alignment) * alignment; +} + +template +auto layered_dtype_name() -> const char* +{ + if constexpr (std::is_same_v) { + return "float32"; + } else if constexpr (std::is_same_v) { + return "float16"; + } else if constexpr (std::is_same_v) { + return "uint8"; + } else if constexpr (std::is_same_v) { + return "int8"; + } else { + return "unknown"; + } +} + +inline auto metric_name(cuvs::distance::DistanceType metric) -> const char* +{ + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: return "L2Expanded"; + case cuvs::distance::DistanceType::InnerProduct: return "InnerProduct"; + default: return "Unknown"; + } +} + +template +auto make_layered_hnsw_metadata_json(const layered_hnsw_file_metadata& metadata, + cuvs::distance::DistanceType metric) -> std::string +{ + std::stringstream os; + os << "{\n"; + os << " \"format\": \"cuvs_hnsw_layered\",\n"; + os << " \"version\": 1,\n"; + os << " \"n_rows\": " << metadata.n_rows << ",\n"; + os << " \"dim\": " << metadata.dim << ",\n"; + os << " \"dtype\": \"" << layered_dtype_name() << "\",\n"; + os << " \"metric\": \"" << metric_name(metric) << "\",\n"; + os << " \"row_order\": \"original_id\",\n"; + os << " \"M\": " << metadata.M << ",\n"; + os << " \"maxM\": " << metadata.maxM << ",\n"; + os << " \"maxM0\": " << metadata.maxM0 << ",\n"; + os << " \"ef_construction\": " << metadata.ef_construction << ",\n"; + os << " \"mult\": " << metadata.mult << ",\n"; + os << " \"maxlevel\": " << metadata.maxlevel << ",\n"; + os << " \"enterpoint_node\": " << metadata.enterpoint_node << ",\n"; + os << " \"base_degree\": " << metadata.base_degree << ",\n"; + os << " \"levels_bytes\": " << metadata.levels_bytes << ",\n"; + os << " \"base_link_row_bytes\": " << metadata.base_link_row_bytes << ",\n"; + os << " \"base_links_bytes\": " << metadata.base_links_bytes << ",\n"; + os << " \"upper_nodes_count\": " << metadata.upper_nodes_count << ",\n"; + os << " \"upper_nodes_bytes\": " << metadata.upper_nodes_bytes << ",\n"; + os << " \"upper_link_row_bytes\": " << metadata.upper_link_row_bytes << ",\n"; + os << " \"upper_links_bytes\": " << metadata.upper_links_bytes << ",\n"; + os << " \"layers\": [\n"; + for (size_t i = 0; i < metadata.layers.size(); ++i) { + const auto& layer = metadata.layers[i]; + os << " {\"level\": " << layer.level << ", \"row_count\": " << layer.row_count + << ", \"degree\": " << layer.degree << ", \"node_offset\": " << layer.node_offset + << ", \"link_offset\": " << layer.link_offset << "}"; + os << (i + 1 == metadata.layers.size() ? "\n" : ",\n"); + } + os << " ]\n"; + os << "}\n"; + return os.str(); +} + +inline auto json_find_key(const std::string& json, const std::string& key, size_t start = 0) + -> size_t +{ + const auto pos = json.find("\"" + key + "\"", start); + RAFT_EXPECTS( + pos != std::string::npos, "Cannot find key '%s' in layered HNSW metadata", key.c_str()); + return pos; +} + +inline auto json_parse_size(const std::string& json, const std::string& key) -> size_t +{ + auto pos = json_find_key(json, key); + auto colon = json.find(':', pos); + RAFT_EXPECTS(colon != std::string::npos, "Malformed JSON near key '%s'", key.c_str()); + auto begin = json.find_first_of("0123456789", colon + 1); + auto end = json.find_first_not_of("0123456789", begin); + RAFT_EXPECTS( + begin != std::string::npos, "Malformed integer JSON value for key '%s'", key.c_str()); + return static_cast(std::stoull(json.substr(begin, end - begin))); +} + +inline auto json_parse_double(const std::string& json, const std::string& key) -> double +{ + auto pos = json_find_key(json, key); + auto colon = json.find(':', pos); + RAFT_EXPECTS(colon != std::string::npos, "Malformed JSON near key '%s'", key.c_str()); + auto begin = json.find_first_of("0123456789-.", colon + 1); + auto end = json.find_first_not_of("0123456789-.eE+", begin); + RAFT_EXPECTS(begin != std::string::npos, "Malformed double JSON value for key '%s'", key.c_str()); + return std::stod(json.substr(begin, end - begin)); +} + +inline auto json_parse_layer_field(const std::string& object, const std::string& key) -> size_t +{ + return json_parse_size(object, key); +} + +inline auto parse_layered_hnsw_metadata(const std::string& json) -> layered_hnsw_file_metadata +{ + layered_hnsw_file_metadata metadata; + metadata.n_rows = json_parse_size(json, "n_rows"); + metadata.dim = json_parse_size(json, "dim"); + metadata.M = json_parse_size(json, "M"); + metadata.maxM = json_parse_size(json, "maxM"); + metadata.maxM0 = json_parse_size(json, "maxM0"); + metadata.ef_construction = json_parse_size(json, "ef_construction"); + metadata.mult = json_parse_double(json, "mult"); + metadata.maxlevel = static_cast(json_parse_size(json, "maxlevel")); + metadata.enterpoint_node = static_cast(json_parse_size(json, "enterpoint_node")); + metadata.base_degree = json_parse_size(json, "base_degree"); + metadata.levels_bytes = json_parse_size(json, "levels_bytes"); + metadata.base_link_row_bytes = json_parse_size(json, "base_link_row_bytes"); + metadata.base_links_bytes = json_parse_size(json, "base_links_bytes"); + metadata.upper_nodes_count = json_parse_size(json, "upper_nodes_count"); + metadata.upper_nodes_bytes = json_parse_size(json, "upper_nodes_bytes"); + metadata.upper_link_row_bytes = json_parse_size(json, "upper_link_row_bytes"); + metadata.upper_links_bytes = json_parse_size(json, "upper_links_bytes"); + + auto layers_pos = json_find_key(json, "layers"); + auto array_open = json.find('[', layers_pos); + auto array_end = json.find(']', array_open); + RAFT_EXPECTS(array_open != std::string::npos && array_end != std::string::npos, + "Malformed layers array in layered HNSW metadata"); + + size_t pos = array_open; + while (true) { + auto object_open = json.find('{', pos); + if (object_open == std::string::npos || object_open > array_end) { break; } + auto object_end = json.find('}', object_open); + RAFT_EXPECTS(object_end != std::string::npos && object_end < array_end, + "Malformed layer object in layered HNSW metadata"); + const auto object = json.substr(object_open, object_end - object_open + 1); + metadata.layers.push_back({json_parse_layer_field(object, "level"), + json_parse_layer_field(object, "row_count"), + json_parse_layer_field(object, "degree"), + json_parse_layer_field(object, "node_offset"), + json_parse_layer_field(object, "link_offset")}); + pos = object_end + 1; + } + return metadata; +} + +struct npy_file { + cuvs::util::file_descriptor fd; + size_t header_size = 0; + std::vector shape; +}; + +inline auto open_npy_file(const std::string& path) -> npy_file +{ + std::ifstream stream(path, std::ios::binary); + RAFT_EXPECTS(stream.good(), "Failed to open numpy file: %s", path.c_str()); + auto header = raft::detail::numpy_serializer::read_header(stream); + auto header_size = static_cast(stream.tellg()); + auto fd = cuvs::util::file_descriptor(path, O_RDONLY); + return {std::move(fd), header_size, header.shape}; +} + +inline auto ends_with(const std::string& value, const std::string& suffix) -> bool +{ + return value.size() >= suffix.size() && + value.compare(value.size() - suffix.size(), suffix.size(), suffix) == 0; +} + +inline void copy_file_overwrite(const std::filesystem::path& src, const std::filesystem::path& dst) +{ + std::error_code ec; + if (src == dst || + (std::filesystem::exists(dst, ec) && std::filesystem::equivalent(src, dst, ec))) { + return; + } + if (!dst.parent_path().empty()) { std::filesystem::create_directories(dst.parent_path()); } + std::filesystem::copy_file(src, dst, std::filesystem::copy_options::overwrite_existing); +} + +template +inline auto open_layered_dataset_file(const std::string& path) -> npy_file +{ + if (std::filesystem::path(path).extension() == ".npy") { return open_npy_file(path); } + + std::ifstream stream(path, std::ios::binary); + RAFT_EXPECTS(stream.good(), "Failed to open dataset file: %s", path.c_str()); + std::array header{}; + stream.read(reinterpret_cast(header.data()), sizeof(uint32_t) * header.size()); + RAFT_EXPECTS(stream.gcount() == static_cast(sizeof(uint32_t) * header.size()), + "Failed to read ANN benchmark binary dataset header: %s", + path.c_str()); + + const auto ext = std::filesystem::path(path).extension().string(); + if constexpr (std::is_same_v) { + RAFT_EXPECTS(ext == ".fbin" && !ends_with(path, ".fp16.fbin"), + "Expected a .fbin dataset for float layered HNSW load: %s", + path.c_str()); + } else if constexpr (std::is_same_v) { + RAFT_EXPECTS(ext == ".f16bin" || ends_with(path, ".fp16.fbin"), + "Expected a .f16bin or .fp16.fbin dataset for half layered HNSW load: %s", + path.c_str()); + } else if constexpr (std::is_same_v) { + RAFT_EXPECTS( + ext == ".u8bin", "Expected a .u8bin dataset for uint8 layered HNSW load: %s", path.c_str()); + } else if constexpr (std::is_same_v) { + RAFT_EXPECTS( + ext == ".i8bin", "Expected a .i8bin dataset for int8 layered HNSW load: %s", path.c_str()); + } + + auto fd = cuvs::util::file_descriptor(path, O_RDONLY); + return {std::move(fd), sizeof(uint32_t) * header.size(), {header[0], header[1]}}; +} + +template +void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index& index_, + const cuvs::util::file_descriptor& output_fd, + size_t output_offset, + size_t base_link_row_bytes, + size_t maxM0) +{ + const auto total_start_time = std::chrono::steady_clock::now(); + const auto& graph_fd_opt = index_.graph_fd(); + const auto& mapping_fd_opt = index_.mapping_fd(); + RAFT_EXPECTS(graph_fd_opt.has_value() && graph_fd_opt->is_valid(), + "Graph file descriptor is not available"); + RAFT_EXPECTS(mapping_fd_opt.has_value() && mapping_fd_opt->is_valid(), + "Mapping file descriptor is not available"); + + const auto graph_path = graph_fd_opt->get_path(); + const auto mapping_path = mapping_fd_opt->get_path(); + RAFT_EXPECTS(!graph_path.empty(), "Unable to get path from graph file descriptor"); + RAFT_EXPECTS(!mapping_path.empty(), "Unable to get path from mapping file descriptor"); + + auto graph_npy = open_npy_file(graph_path); + auto mapping_npy = open_npy_file(mapping_path); + + RAFT_EXPECTS(graph_npy.shape.size() == 2, "Graph file should be 2D"); + RAFT_EXPECTS(mapping_npy.shape.size() == 1, "Mapping file should be 1D"); + const auto n_rows = graph_npy.shape[0]; + const auto degree = graph_npy.shape[1]; + RAFT_EXPECTS(mapping_npy.shape[0] == n_rows, + "Mapping elements (%zu) != graph rows (%zu)", + mapping_npy.shape[0], + n_rows); + RAFT_EXPECTS(n_rows == static_cast(index_.size()), + "Graph rows (%zu) != index size (%zu)", + n_rows, + static_cast(index_.size())); + RAFT_EXPECTS(degree == static_cast(index_.graph_degree()), + "Graph degree (%zu) != index graph degree (%zu)", + degree, + static_cast(index_.graph_degree())); + RAFT_EXPECTS(degree > 0, "Graph degree must be nonzero"); + RAFT_EXPECTS( + degree <= maxM0, "Base graph degree (%zu) must not exceed HNSW maxM0 (%zu)", degree, maxM0); + RAFT_EXPECTS(base_link_row_bytes >= sizeof(hnswlib::linklistsizeint) + maxM0 * sizeof(IdxT), + "Base link row size is too small"); + + RAFT_LOG_INFO("HNSW remap: loading ACE reordered-to-original mapping (%zu rows)", n_rows); + const auto mapping_start_time = std::chrono::steady_clock::now(); + const auto mapping_bytes = n_rows * sizeof(IdxT); + std::vector reordered_to_original(n_rows); + std::vector original_to_reordered(n_rows); + cuvs::util::read_large_file( + mapping_npy.fd, reordered_to_original.data(), mapping_bytes, mapping_npy.header_size); + for (size_t reordered_id = 0; reordered_id < n_rows; ++reordered_id) { + const auto original_id = static_cast(reordered_to_original[reordered_id]); + RAFT_EXPECTS(original_id < n_rows, + "Invalid original id %zu in ACE dataset mapping at row %zu", + original_id, + reordered_id); + original_to_reordered[original_id] = static_cast(reordered_id); + } + const auto mapping_elapsed_ms = elapsed_ms_since(mapping_start_time); + RAFT_LOG_INFO("HNSW remap: mapping loaded in %ld ms (%.2f GiB, %.2f GiB/s)", + mapping_elapsed_ms, + to_gib(mapping_bytes), + throughput_gib_per_s(mapping_bytes, mapping_elapsed_ms)); + + const size_t graph_row_bytes = degree * sizeof(IdxT); + const size_t target_batch_bytes = 64 * 1024 * 1024; + const size_t batch_size = std::max(1, target_batch_bytes / base_link_row_bytes); + const size_t max_gap_rows = std::max(1, (128 * 1024) / graph_row_bytes); + const size_t max_span_rows = std::max(1, (8 * 1024 * 1024) / graph_row_bytes); + + RAFT_LOG_INFO( + "HNSW remap: n_rows=%zu degree=%zu maxM0=%zu batch_size=%zu max_gap=%zu rows max_span=%zu rows", + n_rows, + degree, + maxM0, + batch_size, + max_gap_rows, + max_span_rows); + + auto span_buffer = raft::make_host_matrix(static_cast(max_span_rows), + static_cast(degree)); + std::vector output_buffer(batch_size * base_link_row_bytes); + std::vector source_rows(batch_size); + std::vector order(batch_size); + + const auto graph_start_time = std::chrono::steady_clock::now(); + size_t spans_read = 0; + size_t bytes_read = 0; + size_t bytes_written = 0; + + for (size_t batch_start = 0; batch_start < n_rows; batch_start += batch_size) { + const auto current_batch_size = std::min(batch_size, n_rows - batch_start); + std::fill( + output_buffer.begin(), output_buffer.begin() + current_batch_size * base_link_row_bytes, 0); + for (size_t i = 0; i < current_batch_size; ++i) { + source_rows[i] = original_to_reordered[batch_start + i]; + order[i] = i; + } + std::sort(order.begin(), order.begin() + current_batch_size, [&](size_t lhs, size_t rhs) { + return source_rows[lhs] < source_rows[rhs]; + }); + + size_t sorted_pos = 0; + while (sorted_pos < current_batch_size) { + const auto span_start = static_cast(source_rows[order[sorted_pos]]); + auto span_end = span_start; + auto span_next = sorted_pos + 1; + while (span_next < current_batch_size) { + const auto next_row = static_cast(source_rows[order[span_next]]); + if (next_row - span_start + 1 > max_span_rows) { break; } + if (next_row > span_end + max_gap_rows + 1) { break; } + span_end = next_row; + ++span_next; + } + + const auto span_rows = span_end - span_start + 1; + const auto span_bytes = span_rows * graph_row_bytes; + cuvs::util::read_large_file(graph_npy.fd, + span_buffer.data_handle(), + span_bytes, + graph_npy.header_size + span_start * graph_row_bytes); + ++spans_read; + bytes_read += span_bytes; + + for (size_t i = sorted_pos; i < span_next; ++i) { + const auto dst_row = order[i]; + const auto source_row = static_cast(source_rows[dst_row]); + auto* src = span_buffer.data_handle() + (source_row - span_start) * degree; + auto* dst_row_ptr = output_buffer.data() + dst_row * base_link_row_bytes; + hnswlib::linklistsizeint list_count = static_cast(degree); + std::memcpy(dst_row_ptr, &list_count, sizeof(list_count)); + auto* dst = reinterpret_cast(dst_row_ptr + sizeof(hnswlib::linklistsizeint)); + for (size_t j = 0; j < degree; ++j) { + const auto neighbor = static_cast(src[j]); + RAFT_EXPECTS(neighbor < n_rows, + "Invalid reordered neighbor id %zu in ACE graph row %zu", + neighbor, + source_row); + dst[j] = reordered_to_original[neighbor]; + } + } + sorted_pos = span_next; + } + + const auto batch_bytes = current_batch_size * base_link_row_bytes; + cuvs::util::write_large_file(output_fd, + output_buffer.data(), + batch_bytes, + output_offset + batch_start * base_link_row_bytes); + bytes_written += batch_bytes; + } + + const auto graph_elapsed_ms = elapsed_ms_since(graph_start_time); + const auto total_elapsed_ms = elapsed_ms_since(total_start_time); + const auto read_gib = to_gib(bytes_read); + const auto written_gib = to_gib(bytes_written); + const auto ideal_read_bytes = n_rows * graph_row_bytes; + const auto amplification = + ideal_read_bytes == 0 ? 0.0 : static_cast(bytes_read) / ideal_read_bytes; + RAFT_LOG_INFO( + "HNSW remap: base links written in %ld ms: read %.2f GiB at %.2f GiB/s across %zu spans " + "(%.2fx amplification), wrote %.2f GiB at %.2f GiB/s", + graph_elapsed_ms, + read_gib, + throughput_gib_per_s(bytes_read, graph_elapsed_ms), + spans_read, + amplification, + written_gib, + throughput_gib_per_s(bytes_written, graph_elapsed_ms)); + RAFT_LOG_INFO("HNSW remap: completed in %ld ms total", total_elapsed_ms); +} + +template +auto serialize_to_layered_hnsw_from_disk( + raft::resources const& res, + const cuvs::neighbors::hnsw::index_params& params, + const cuvs::neighbors::cagra::index& index_, + raft::host_matrix_view dataset) -> std::string +{ + raft::common::nvtx::range fun_scope("hnsw::serialize_layered"); + const auto total_start_time = std::chrono::steady_clock::now(); + + RAFT_EXPECTS(index_.graph_fd().has_value() && index_.mapping_fd().has_value(), + "Layered HNSW serialization requires a disk-backed ACE graph and mapping."); + RAFT_EXPECTS(static_cast(dataset.extent(0)) == static_cast(index_.size()), + "Dataset rows (%zu) must match index size (%zu)", + static_cast(dataset.extent(0)), + static_cast(index_.size())); + RAFT_EXPECTS(static_cast(dataset.extent(1)) == static_cast(index_.dim()), + "Dataset cols (%zu) must match index dimensions (%zu)", + static_cast(dataset.extent(1)), + static_cast(index_.dim())); + + const auto graph_path = index_.graph_fd()->get_path(); + RAFT_EXPECTS(!graph_path.empty(), "Unable to get path from graph file descriptor"); + const auto ace_dir = std::filesystem::path(graph_path).parent_path(); + const auto artifact_file = ace_dir / "hnsw_index.cuvs"; + std::filesystem::create_directories(ace_dir); + + auto n_rows = static_cast(index_.size()); + auto dim = static_cast(index_.dim()); + auto graph_degree_int = static_cast(index_.graph_degree()); + + auto hnsw_index = + std::make_unique>(index_.dim(), index_.metric(), params.hierarchy); + auto appr_algo = std::make_unique::type>>( + hnsw_index->get_space(), 1, (graph_degree_int + 1) / 2, params.ef_construction); + + RAFT_LOG_INFO("Layered HNSW: generating hierarchy levels"); + const auto hierarchy_start_time = std::chrono::steady_clock::now(); + auto hierarchy = make_random_hnsw_level_plan(n_rows, *appr_algo, "Layered HNSW: Level "); + const auto hierarchy_elapsed_ms = elapsed_ms_since(hierarchy_start_time); + RAFT_LOG_INFO("Layered HNSW: hierarchy levels generated in %ld ms (max_level=%d, promoted=%zu)", + hierarchy_elapsed_ms, + hierarchy.max_level(), + hierarchy.promoted_count()); + + layered_hnsw_file_metadata metadata; + metadata.n_rows = n_rows; + metadata.dim = dim; + metadata.M = appr_algo->M_; + metadata.maxM = appr_algo->maxM_; + metadata.maxM0 = appr_algo->maxM0_; + metadata.ef_construction = appr_algo->ef_construction_; + metadata.mult = appr_algo->mult_; + metadata.maxlevel = hierarchy.max_level(); + metadata.enterpoint_node = static_cast(hierarchy.order.back()); + metadata.base_degree = static_cast(graph_degree_int); + metadata.levels_bytes = n_rows * sizeof(uint8_t); + metadata.base_link_row_bytes = appr_algo->size_links_level0_; + metadata.base_links_bytes = n_rows * metadata.base_link_row_bytes; + metadata.upper_link_row_bytes = appr_algo->size_links_per_element_; + size_t next_upper_node_offset = 0; + size_t next_upper_link_offset = 0; + for (size_t pt_level = 1; pt_level < hierarchy.hist.size(); pt_level++) { + auto start_idx = hierarchy.offsets[pt_level - 1]; + auto row_count = n_rows - start_idx; + auto layer_degree = hnsw_upper_layer_degree(row_count, appr_algo->M_); + metadata.layers.push_back( + {pt_level, row_count, layer_degree, next_upper_node_offset, next_upper_link_offset}); + next_upper_node_offset += row_count; + next_upper_link_offset += row_count; + } + metadata.upper_nodes_count = next_upper_node_offset; + metadata.upper_nodes_bytes = metadata.upper_nodes_count * sizeof(IdxT); + metadata.upper_links_bytes = next_upper_link_offset * metadata.upper_link_row_bytes; + + const auto metadata_json = make_layered_hnsw_metadata_json(metadata, index_.metric()); + layered_hnsw_file_header header{}; + std::strncpy(header.magic, layered_hnsw_magic, sizeof(header.magic) - 1); + header.version = layered_hnsw_version; + header.metadata_size = metadata_json.size(); + header.metadata_offset = sizeof(layered_hnsw_file_header); + + const auto payload_offset = + align_up(header.metadata_offset + header.metadata_size, layered_hnsw_alignment); + const auto levels_offset = payload_offset; + const auto base_links_offset = levels_offset + metadata.levels_bytes; + const auto upper_nodes_offset = base_links_offset + metadata.base_links_bytes; + const auto upper_links_offset = upper_nodes_offset + metadata.upper_nodes_bytes; + const auto final_file_size = upper_links_offset + metadata.upper_links_bytes; + + cuvs::util::file_descriptor artifact_fd(artifact_file.string(), O_CREAT | O_RDWR | O_TRUNC, 0644); + const auto fallocate_result = posix_fallocate(artifact_fd.get(), 0, final_file_size); + RAFT_EXPECTS(fallocate_result == 0, + "Failed to pre-allocate layered HNSW artifact %s: %s", + artifact_file.string().c_str(), + std::strerror(fallocate_result)); + cuvs::util::write_large_file(artifact_fd, &header, sizeof(header), 0); + cuvs::util::write_large_file( + artifact_fd, metadata_json.data(), metadata_json.size(), header.metadata_offset); + + const auto levels_start_time = std::chrono::steady_clock::now(); + cuvs::util::write_large_file( + artifact_fd, hierarchy.levels.data(), metadata.levels_bytes, levels_offset); + const auto levels_elapsed_ms = elapsed_ms_since(levels_start_time); + RAFT_LOG_INFO("Layered HNSW: wrote levels section in %ld ms (%.2f GiB, %.2f GiB/s)", + levels_elapsed_ms, + to_gib(metadata.levels_bytes), + throughput_gib_per_s(metadata.levels_bytes, levels_elapsed_ms)); + + RAFT_LOG_INFO("Layered HNSW: writing hnswlib-ready base links"); + const auto layer0_start_time = std::chrono::steady_clock::now(); + write_layered_base_links_from_disk( + index_, artifact_fd, base_links_offset, metadata.base_link_row_bytes, metadata.maxM0); + const auto layer0_elapsed_ms = elapsed_ms_since(layer0_start_time); + RAFT_LOG_INFO("Layered HNSW: base links written in %ld ms (%.2f GiB, %.2f GiB/s effective)", + layer0_elapsed_ms, + to_gib(metadata.base_links_bytes), + throughput_gib_per_s(metadata.base_links_bytes, layer0_elapsed_ms)); + + size_t upper_graph_bytes_written = 0; + if (hierarchy.hist.size() > 1) { + RAFT_LOG_INFO("Layered HNSW: gathering promoted vectors"); + const auto gather_start_time = std::chrono::steady_clock::now(); + auto host_query_set = + raft::make_host_matrix(static_cast(hierarchy.promoted_count()), dim); +#pragma omp parallel for + for (int64_t i = 0; i < static_cast(n_rows); i++) { + if (hierarchy.levels[i] > 0) { + const auto query_row = hierarchy.order_bw[i] - hierarchy.hist[0]; + auto* dst = host_query_set.data_handle() + query_row * dim; + std::copy(&dataset(i, 0), &dataset(i, 0) + dim, dst); + } + } + const auto gather_elapsed_ms = elapsed_ms_since(gather_start_time); + const auto gathered_bytes = hierarchy.promoted_count() * dim * sizeof(T); + RAFT_LOG_INFO("Layered HNSW: gathered promoted vectors in %ld ms (%.2f GiB copied, %.2f GiB/s)", + gather_elapsed_ms, + to_gib(gathered_bytes), + throughput_gib_per_s(gathered_bytes, gather_elapsed_ms)); + + auto promoted_dataset = raft::make_host_matrix_view( + host_query_set.data_handle(), host_query_set.extent(0), host_query_set.extent(1)); + const auto upper_layers_start_time = std::chrono::steady_clock::now(); + build_hnsw_upper_layer_graphs( + res, + promoted_dataset, + hierarchy, + appr_algo->M_, + index_.metric(), + [&](size_t pt_level, size_t start_idx, auto& host_neighbors) { + const auto& layer = metadata.layers[pt_level - 1]; + RAFT_LOG_INFO("Layered HNSW: writing upper layer %zu (%zu rows, degree %zu)", + layer.level, + layer.row_count, + layer.degree); + const auto layer_write_start_time = std::chrono::steady_clock::now(); + const size_t target_batch_bytes = 64 * 1024 * 1024; + const size_t row_bytes = sizeof(IdxT) + metadata.upper_link_row_bytes; + const size_t batch_size = std::max(1, target_batch_bytes / row_bytes); + std::vector node_buffer(batch_size); + std::vector link_buffer(batch_size * metadata.upper_link_row_bytes); + for (size_t batch_start = 0; batch_start < layer.row_count; batch_start += batch_size) { + const auto current_batch_size = std::min(batch_size, layer.row_count - batch_start); + std::fill(link_buffer.begin(), + link_buffer.begin() + current_batch_size * metadata.upper_link_row_bytes, + 0); +#pragma omp parallel for + for (int64_t batch_idx = 0; batch_idx < static_cast(current_batch_size); + ++batch_idx) { + const auto row = batch_start + static_cast(batch_idx); + node_buffer[batch_idx] = static_cast(hierarchy.order[start_idx + row]); + auto* link_row = link_buffer.data() + batch_idx * metadata.upper_link_row_bytes; + hnswlib::linklistsizeint list_count = + static_cast(layer.degree); + std::memcpy(link_row, &list_count, sizeof(list_count)); + auto* dst = reinterpret_cast(link_row + sizeof(hnswlib::linklistsizeint)); + if (layer.degree > 0) { + auto* src = host_neighbors.data_handle() + row * layer.degree; + for (size_t j = 0; j < layer.degree; ++j) { + dst[j] = static_cast(hierarchy.order[src[j] + start_idx]); + } + } + } + cuvs::util::write_large_file( + artifact_fd, + node_buffer.data(), + current_batch_size * sizeof(IdxT), + upper_nodes_offset + (layer.node_offset + batch_start) * sizeof(IdxT)); + cuvs::util::write_large_file( + artifact_fd, + link_buffer.data(), + current_batch_size * metadata.upper_link_row_bytes, + upper_links_offset + (layer.link_offset + batch_start) * metadata.upper_link_row_bytes); + } + const auto layer_bytes = layer.row_count * (sizeof(IdxT) + metadata.upper_link_row_bytes); + upper_graph_bytes_written += layer_bytes; + const auto layer_write_elapsed_ms = elapsed_ms_since(layer_write_start_time); + RAFT_LOG_INFO("Layered HNSW: upper layer %zu written in %ld ms (%.2f GiB, %.2f GiB/s)", + layer.level, + layer_write_elapsed_ms, + to_gib(layer_bytes), + throughput_gib_per_s(layer_bytes, layer_write_elapsed_ms)); + }); + const auto upper_layers_elapsed_ms = elapsed_ms_since(upper_layers_start_time); + RAFT_LOG_INFO( + "Layered HNSW: upper layers generated and written in %ld ms (%.2f GiB written, %.2f " + "GiB/s effective)", + upper_layers_elapsed_ms, + to_gib(upper_graph_bytes_written), + throughput_gib_per_s(upper_graph_bytes_written, upper_layers_elapsed_ms)); + } + + const auto total_elapsed_ms = elapsed_ms_since(total_start_time); + RAFT_LOG_INFO( + "Layered HNSW index written to: %s in %ld ms (artifact %.2f GiB, %.2f GiB/s effective)", + artifact_file.string().c_str(), + total_elapsed_ms, + to_gib(final_file_size), + throughput_gib_per_s(final_file_size, total_elapsed_ms)); + return artifact_file.string(); +} + template void serialize_to_hnswlib_from_disk(raft::resources const& res, std::ostream& os_raw, @@ -501,44 +1307,12 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res, hnsw_index->get_space(), 1, (graph_degree_int + 1) / 2, params.ef_construction); bool create_hierarchy = params.hierarchy != HnswHierarchy::NONE; - - // create hierarchy order - // sort the points by levels - // roll dice & build histogram - std::vector hist; - std::vector order(n_rows); - std::vector order_bw(n_rows); - std::vector levels(n_rows); - std::vector offsets; - - if (create_hierarchy) { - RAFT_LOG_INFO("Sort points by levels"); - for (int64_t i = 0; i < n_rows; i++) { - auto pt_level = appr_algo->getRandomLevel(appr_algo->mult_); - while (pt_level >= static_cast(hist.size())) - hist.push_back(0); - hist[pt_level]++; - levels[i] = pt_level; - } - - // accumulate - offsets.resize(hist.size() + 1, 0); - for (size_t i = 0; i < hist.size() - 1; i++) { - offsets[i + 1] = offsets[i] + hist[i]; - RAFT_LOG_INFO("Level %zu : %zu", i + 1, size_t(n_rows) - offsets[i + 1]); - } - - // fw/bw indices - for (int64_t i = 0; i < n_rows; i++) { - auto pt_level = levels[i]; - order_bw[i] = offsets[pt_level]; - order[offsets[pt_level]++] = i; - } - } + hnsw_level_plan hierarchy; + if (create_hierarchy) { hierarchy = make_random_hnsw_level_plan(n_rows, *appr_algo, "Level "); } // set last point of the highest level as the entry point - appr_algo->enterpoint_node_ = create_hierarchy ? order.back() : n_rows / 2; - appr_algo->maxlevel_ = create_hierarchy ? hist.size() - 1 : 1; + appr_algo->enterpoint_node_ = create_hierarchy ? hierarchy.order.back() : n_rows / 2; + appr_algo->maxlevel_ = create_hierarchy ? hierarchy.max_level() : 1; // write header information RAFT_LOG_DEBUG("Writing HNSW header: offsetLevel0=%zu, n_rows=%zu, size_data_per_element=%zu", @@ -582,7 +1356,7 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res, // host queries auto host_query_set = - raft::make_host_matrix(create_hierarchy ? n_rows - hist[0] : 0, dim); + raft::make_host_matrix(create_hierarchy ? hierarchy.promoted_count() : 0, dim); int64_t d_report_offset = n_rows / 10; // Report progress in 10% steps. int64_t next_report_offset = d_report_offset; @@ -690,11 +1464,11 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res, const T* data_row = &dataset_buffer(batch_idx, 0); os.write(reinterpret_cast(data_row), sizeof(T) * dim); - if (create_hierarchy && levels[i] > 0) { + if (create_hierarchy && hierarchy.levels[i] > 0) { // position in query: order_bw[i]-hist[0] - std::copy(data_row, - data_row + dim, - reinterpret_cast(&host_query_set(order_bw[i] - hist[0], 0))); + auto* dst = host_query_set.data_handle() + + (hierarchy.order_bw[i] - hierarchy.hist[0]) * static_cast(dim); + std::copy(data_row, data_row + dim, dst); } // assign original label @@ -732,22 +1506,15 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res, // trigger knn builds for all levels std::vector> host_neighbors; if (create_hierarchy) { - for (size_t pt_level = 1; pt_level < hist.size(); pt_level++) { - auto num_pts = n_rows - offsets[pt_level - 1]; - auto neighbor_size = num_pts > appr_algo->M_ ? appr_algo->M_ : num_pts - 1; - host_neighbors.emplace_back(raft::make_host_matrix(num_pts, neighbor_size)); - } - for (size_t pt_level = 1; pt_level < hist.size(); pt_level++) { - RAFT_LOG_INFO("Compute hierarchy neighbors level %zu", pt_level); - auto removed_rows = offsets[pt_level - 1] - offsets[0]; - raft::host_matrix_view sub_query_view( - host_query_set.data_handle() + removed_rows * dim, - host_query_set.extent(0) - removed_rows, - dim); - auto neighbor_view = host_neighbors[pt_level - 1].view(); - all_neighbors_graph( - res, raft::make_const_mdspan(sub_query_view), neighbor_view, index_.metric()); - } + host_neighbors.reserve(hierarchy.hist.size() - 1); + build_hnsw_upper_layer_graphs( + res, + raft::make_host_matrix_view( + host_query_set.data_handle(), host_query_set.extent(0), host_query_set.extent(1)), + hierarchy, + appr_algo->M_, + index_.metric(), + [&](size_t, size_t, auto& neighbors) { host_neighbors.emplace_back(std::move(neighbors)); }); } if (create_hierarchy) { @@ -758,7 +1525,7 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res, start_clock = std::chrono::system_clock::now(); for (int64_t i = 0; i < n_rows; i++) { - size_t cur_level = create_hierarchy ? levels[i] : 0; + size_t cur_level = create_hierarchy ? hierarchy.levels[i] : 0; unsigned int linkListSize = create_hierarchy && cur_level > 0 ? appr_algo->size_links_per_element_ * cur_level : 0; os.write(reinterpret_cast(&linkListSize), sizeof(int)); @@ -766,14 +1533,16 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res, if (linkListSize) { for (size_t pt_level = 1; pt_level <= cur_level; pt_level++) { auto neighbor_view = host_neighbors[pt_level - 1].view(); - auto my_row = order_bw[i] - offsets[pt_level - 1]; + auto my_row = hierarchy.order_bw[i] - hierarchy.offsets[pt_level - 1]; - IdxT* neighbors = &neighbor_view(my_row, 0); unsigned int extent = neighbor_view.extent(1); os.write(reinterpret_cast(&extent), sizeof(int)); - for (unsigned int j = 0; j < extent; j++) { - const IdxT converted = order[neighbors[j] + offsets[pt_level - 1]]; - os.write(reinterpret_cast(&converted), sizeof(IdxT)); + if (extent > 0) { + IdxT* neighbors = &neighbor_view(my_row, 0); + for (unsigned int j = 0; j < extent; j++) { + const IdxT converted = hierarchy.order[neighbors[j] + hierarchy.offsets[pt_level - 1]]; + os.write(reinterpret_cast(&converted), sizeof(IdxT)); + } } auto remainder = appr_algo->M_ - neighbor_view.extent(1); for (size_t j = 0; j < remainder; j++) { @@ -1088,6 +1857,18 @@ std::unique_ptr> from_cagra( std::filesystem::exists(index_directory) && std::filesystem::is_directory(index_directory), "Directory '%s' does not exist", index_directory.c_str()); + if (params.hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK) { + RAFT_EXPECTS(dataset.has_value(), + "Layered HNSW serialization requires the original-order dataset."); + auto artifact_path = + serialize_to_layered_hnsw_from_disk(res, params, cagra_index, dataset.value()); + + auto hnsw_index = + std::make_unique>(cagra_index.dim(), cagra_index.metric(), params.hierarchy); + hnsw_index->set_file_descriptor(cuvs::util::file_descriptor(artifact_path, O_RDONLY)); + return hnsw_index; + } + std::string index_filename = (std::filesystem::path(index_directory) / "hnsw_index.bin").string(); @@ -1118,6 +1899,8 @@ std::unique_ptr> from_cagra( return from_cagra(res, params, cagra_index, dataset); } else if (params.hierarchy == HnswHierarchy::GPU) { return from_cagra(res, params, cagra_index, dataset); + } else if (params.hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK) { + RAFT_FAIL("GPU_LAYERED_ON_DISK requires disk-backed ACE build artifacts."); } else { RAFT_FAIL("Unsupported hierarchy type"); } @@ -1228,6 +2011,13 @@ void serialize(raft::resources const& res, const std::string& filename, const in "Disk-based index file does not exist: %s", source_path.c_str()); + if (idx_impl->hierarchy() == HnswHierarchy::GPU_LAYERED_ON_DISK) { + copy_file_overwrite(source_path, filename); + RAFT_LOG_INFO( + "Copied layered HNSW index from %s to %s", source_path.c_str(), filename.c_str()); + return; + } + // Copy the file to the new location std::filesystem::copy_file( source_path, filename, std::filesystem::copy_options::overwrite_existing); @@ -1241,6 +2031,202 @@ void serialize(raft::resources const& res, const std::string& filename, const in hnswlib_index->saveIndex(filename); } +template +auto deserialize_layered_hnsw(raft::resources const& res, + const index_params& params, + const std::string& artifact_path, + int dim, + cuvs::distance::DistanceType metric) -> std::unique_ptr> +{ + common::nvtx::range fun_scope("hnsw::deserialize_layered"); + cuvs::util::file_descriptor artifact_fd(artifact_path, O_RDONLY); + layered_hnsw_file_header header{}; + cuvs::util::read_large_file(artifact_fd, &header, sizeof(header), 0); + RAFT_EXPECTS(std::strncmp(header.magic, layered_hnsw_magic, sizeof(header.magic)) == 0, + "Invalid layered HNSW artifact magic: %s", + artifact_path.c_str()); + RAFT_EXPECTS(header.version == layered_hnsw_version, + "Unsupported layered HNSW artifact version %u", + header.version); + RAFT_EXPECTS(header.metadata_offset >= sizeof(layered_hnsw_file_header), + "Invalid layered HNSW metadata offset"); + RAFT_EXPECTS(header.metadata_size > 0, "Invalid layered HNSW metadata size"); + + std::string metadata_json(header.metadata_size, '\0'); + cuvs::util::read_large_file( + artifact_fd, metadata_json.data(), metadata_json.size(), header.metadata_offset); + const auto metadata = parse_layered_hnsw_metadata(metadata_json); + + RAFT_EXPECTS(metadata.n_rows > 0, "Layered HNSW artifact must contain at least one row"); + RAFT_EXPECTS(static_cast(dim) == metadata.dim, + "Layered HNSW artifact dim (%zu) does not match requested dim (%d)", + metadata.dim, + dim); + RAFT_EXPECTS(metadata.layers.size() == static_cast(metadata.maxlevel), + "Layered HNSW artifact has %zu upper layers, expected %d", + metadata.layers.size(), + metadata.maxlevel); + + RAFT_EXPECTS(!params.dataset_path.empty(), + "Layered HNSW deserialization requires index_params.dataset_path"); + + const auto payload_offset = + align_up(header.metadata_offset + header.metadata_size, layered_hnsw_alignment); + const auto levels_offset = payload_offset; + const auto base_links_offset = levels_offset + metadata.levels_bytes; + const auto upper_nodes_offset = base_links_offset + metadata.base_links_bytes; + const auto upper_links_offset = upper_nodes_offset + metadata.upper_nodes_bytes; + const auto expected_file_size = upper_links_offset + metadata.upper_links_bytes; + const auto artifact_size = static_cast(std::filesystem::file_size(artifact_path)); + RAFT_EXPECTS(artifact_size >= expected_file_size, + "Layered HNSW artifact is truncated: expected at least %zu bytes, got %zu", + expected_file_size, + artifact_size); + + auto dataset_file = open_layered_dataset_file(params.dataset_path); + + RAFT_EXPECTS(dataset_file.shape.size() == 2 && dataset_file.shape[0] == metadata.n_rows && + dataset_file.shape[1] == metadata.dim, + "Layered HNSW dataset shape mismatch"); + RAFT_EXPECTS(metadata.levels_bytes == metadata.n_rows * sizeof(uint8_t), + "Layered HNSW levels section size mismatch"); + RAFT_EXPECTS(metadata.base_links_bytes == metadata.n_rows * metadata.base_link_row_bytes, + "Layered HNSW base links section size mismatch"); + RAFT_EXPECTS(metadata.upper_nodes_bytes == metadata.upper_nodes_count * sizeof(uint32_t), + "Layered HNSW upper node section size mismatch"); + RAFT_EXPECTS( + metadata.upper_links_bytes == metadata.upper_nodes_count * metadata.upper_link_row_bytes, + "Layered HNSW upper link section size mismatch"); + RAFT_EXPECTS(metadata.base_degree <= metadata.maxM0, + "Layered HNSW base degree (%zu) exceeds maxM0 (%zu)", + metadata.base_degree, + metadata.maxM0); + + std::vector levels_u8(metadata.n_rows); + cuvs::util::read_large_file(artifact_fd, levels_u8.data(), metadata.levels_bytes, levels_offset); + const auto max_level_in_levels = *std::max_element(levels_u8.begin(), levels_u8.end()); + RAFT_EXPECTS(static_cast(max_level_in_levels) == metadata.maxlevel, + "Layered HNSW levels max level (%d) does not match artifact maxlevel (%d)", + static_cast(max_level_in_levels), + metadata.maxlevel); + + auto hnsw_index = std::make_unique>(dim, metric, params.hierarchy); + auto appr_algo = std::make_unique::type>>( + hnsw_index->get_space(), metadata.n_rows, metadata.M, metadata.ef_construction); + appr_algo->cur_element_count = metadata.n_rows; + appr_algo->maxlevel_ = metadata.maxlevel; + appr_algo->enterpoint_node_ = metadata.enterpoint_node; + std::fill_n(appr_algo->linkLists_, metadata.n_rows, nullptr); + RAFT_EXPECTS(appr_algo->size_links_level0_ == metadata.base_link_row_bytes, + "Layered HNSW base link row size mismatch"); + RAFT_EXPECTS(appr_algo->size_links_per_element_ == metadata.upper_link_row_bytes, + "Layered HNSW upper link row size mismatch"); + RAFT_EXPECTS(appr_algo->maxM0_ == metadata.maxM0 && appr_algo->maxM_ == metadata.maxM, + "Layered HNSW M parameter mismatch"); + + auto num_threads = + params.num_threads == 0 ? cuvs::core::omp::get_max_threads() : params.num_threads; + + RAFT_LOG_INFO("Layered HNSW: initializing in-memory hnswlib index from local dataset"); + const size_t target_batch_bytes = 64 * 1024 * 1024; + const size_t row_bytes = metadata.dim * sizeof(T) + metadata.base_link_row_bytes; + const size_t batch_size = std::max(1, target_batch_bytes / row_bytes); + auto dataset_buffer = + raft::make_host_matrix(static_cast(batch_size), metadata.dim); + std::vector base_link_buffer(batch_size * metadata.base_link_row_bytes); + + for (size_t batch_start = 0; batch_start < metadata.n_rows; batch_start += batch_size) { + const auto current_batch_size = std::min(batch_size, metadata.n_rows - batch_start); + cuvs::util::read_large_file(dataset_file.fd, + dataset_buffer.data_handle(), + current_batch_size * metadata.dim * sizeof(T), + dataset_file.header_size + batch_start * metadata.dim * sizeof(T)); + cuvs::util::read_large_file(artifact_fd, + base_link_buffer.data(), + current_batch_size * metadata.base_link_row_bytes, + base_links_offset + batch_start * metadata.base_link_row_bytes); + + bool link_list_allocation_failed = false; +#pragma omp parallel for num_threads(num_threads) reduction(|| : link_list_allocation_failed) + for (int64_t batch_idx = 0; batch_idx < static_cast(current_batch_size); ++batch_idx) { + const auto i = batch_start + static_cast(batch_idx); + auto level = static_cast(levels_u8[i]); + appr_algo->element_levels_[i] = level; + auto ll0 = appr_algo->get_linklist0(i); + memcpy(ll0, + base_link_buffer.data() + batch_idx * metadata.base_link_row_bytes, + metadata.base_link_row_bytes); + memcpy(appr_algo->getDataByInternalId(i), + dataset_buffer.data_handle() + batch_idx * metadata.dim, + appr_algo->data_size_); + *appr_algo->getExternalLabeLp(i) = static_cast(i); + if (level > 0) { + const auto link_list_size = appr_algo->size_links_per_element_ * level + 1; + appr_algo->linkLists_[i] = static_cast(malloc(link_list_size)); + if (appr_algo->linkLists_[i] == nullptr) { + link_list_allocation_failed = true; + continue; + } + memset(appr_algo->linkLists_[i], 0, link_list_size); + } + } + if (link_list_allocation_failed) { + for (size_t i = 0; i < metadata.n_rows; ++i) { + free(appr_algo->linkLists_[i]); + appr_algo->linkLists_[i] = nullptr; + } + throw std::runtime_error("Not enough memory to allocate HNSW upper linklists"); + } + } + + RAFT_LOG_INFO("Layered HNSW: loading upper-layer topology"); + for (const auto& layer : metadata.layers) { + RAFT_LOG_INFO("Layered HNSW: loading layer %zu (%zu rows, degree %zu)", + layer.level, + layer.row_count, + layer.degree); + const auto layer_batch_size = std::min(batch_size, layer.row_count); + std::vector node_buffer(layer_batch_size); + std::vector link_buffer(layer_batch_size * metadata.upper_link_row_bytes); + for (size_t batch_start = 0; batch_start < layer.row_count; batch_start += layer_batch_size) { + const auto current_batch_size = std::min(layer_batch_size, layer.row_count - batch_start); + cuvs::util::read_large_file( + artifact_fd, + node_buffer.data(), + current_batch_size * sizeof(uint32_t), + upper_nodes_offset + (layer.node_offset + batch_start) * sizeof(uint32_t)); + cuvs::util::read_large_file( + artifact_fd, + link_buffer.data(), + current_batch_size * metadata.upper_link_row_bytes, + upper_links_offset + (layer.link_offset + batch_start) * metadata.upper_link_row_bytes); + for (int64_t batch_idx = 0; batch_idx < static_cast(current_batch_size); + ++batch_idx) { + const auto node_id = static_cast(node_buffer[batch_idx]); + RAFT_EXPECTS(node_id < metadata.n_rows, + "Invalid upper-layer node id %zu in layered HNSW artifact", + node_id); + RAFT_EXPECTS(layer.level <= static_cast(levels_u8[node_id]), + "Layered HNSW artifact references node %zu at invalid level %zu", + node_id, + layer.level); + } +#pragma omp parallel for num_threads(num_threads) + for (int64_t batch_idx = 0; batch_idx < static_cast(current_batch_size); + ++batch_idx) { + const auto node_id = static_cast(node_buffer[batch_idx]); + auto ll = appr_algo->get_linklist(node_id, layer.level); + memcpy(ll, + link_buffer.data() + batch_idx * metadata.upper_link_row_bytes, + metadata.upper_link_row_bytes); + } + } + } + + hnsw_index->set_index(std::move(appr_algo)); + return hnsw_index; +} + template void deserialize(raft::resources const& res, const index_params& params, @@ -1249,6 +2235,12 @@ void deserialize(raft::resources const& res, cuvs::distance::DistanceType metric, index** idx) { + if (params.hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK) { + auto hnsw_index = deserialize_layered_hnsw(res, params, filename, dim, metric); + *idx = hnsw_index.release(); + return; + } + try { auto hnsw_index = std::make_unique>(dim, metric, params.hierarchy); auto appr_algo = std::make_unique::type>>( @@ -1286,6 +2278,13 @@ std::unique_ptr> build(raft::resources const& res, ? std::get(params.graph_build_params) : graph_build_params::ace_params{}; + if (params.hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK) { + RAFT_EXPECTS(ace_params.use_disk, + "GPU_LAYERED_ON_DISK requires ACE disk mode (ace_params.use_disk = true)"); + RAFT_EXPECTS(!ace_params.build_dir.empty(), + "GPU_LAYERED_ON_DISK requires ace_params.build_dir to be set"); + } + // Create CAGRA index parameters from HNSW parameters cuvs::neighbors::cagra::index_params cagra_params; cagra_params.metric = params.metric; diff --git a/cpp/tests/neighbors/ann_hnsw_ace.cuh b/cpp/tests/neighbors/ann_hnsw_ace.cuh index 78ae54d9fc..46caef7ae1 100644 --- a/cpp/tests/neighbors/ann_hnsw_ace.cuh +++ b/cpp/tests/neighbors/ann_hnsw_ace.cuh @@ -7,9 +7,16 @@ #include "ann_cagra.cuh" #include +#include +#include +#include #include +#include +#include #include +#include +#include namespace cuvs::neighbors::hnsw { @@ -259,6 +266,238 @@ class AnnHnswAceTest : public ::testing::TestWithParam { std::filesystem::remove_all(temp_dir); } + void testHnswAceLayeredBuildDeserializeSearch() + { + size_t queries_size = ps.n_queries * ps.k; + std::vector indexes_naive(queries_size); + std::vector distances_naive(queries_size); + + { + rmm::device_uvector distances_naive_dev(queries_size, stream_); + rmm::device_uvector indexes_naive_dev(queries_size, stream_); + + cuvs::neighbors::naive_knn(handle_, + distances_naive_dev.data(), + indexes_naive_dev.data(), + search_queries.data(), + database_dev.data(), + ps.n_queries, + ps.n_rows, + ps.dim, + ps.k, + ps.metric); + raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); + raft::update_host(indexes_naive.data(), indexes_naive_dev.data(), queries_size, stream_); + raft::resource::sync_stream(handle_); + } + + std::string temp_dir = std::string("/tmp/cuvs_hnsw_ace_layered_test_") + + std::to_string(std::time(nullptr)) + "_" + + std::to_string(reinterpret_cast(this)); + std::filesystem::create_directories(temp_dir); + struct temp_dir_cleanup { + std::string path; + ~temp_dir_cleanup() + { + std::error_code ec; + std::filesystem::remove_all(path, ec); + } + } cleanup{temp_dir}; + + auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); + raft::copy(database_host.data_handle(), database_dev.data(), ps.n_rows * ps.dim, stream_); + auto queries_host = raft::make_host_matrix(ps.n_queries, ps.dim); + raft::copy(queries_host.data_handle(), search_queries.data(), ps.n_queries * ps.dim, stream_); + raft::resource::sync_stream(handle_); + + const auto dataset_file = (std::filesystem::path(temp_dir) / "dataset.npy").string(); + auto [dataset_fd, dataset_header_size] = cuvs::util::create_numpy_file( + dataset_file, {static_cast(ps.n_rows), static_cast(ps.dim)}); + cuvs::util::write_large_file(dataset_fd, + database_host.data_handle(), + static_cast(ps.n_rows) * ps.dim * sizeof(DataT), + dataset_header_size); + + hnsw::index_params hnsw_params; + hnsw_params.metric = ps.metric; + hnsw_params.hierarchy = hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK; + hnsw_params.M = 32; + hnsw_params.ef_construction = ps.ef_construction; + hnsw_params.dataset_path = dataset_file; + + auto ace_params = graph_build_params::ace_params(); + ace_params.npartitions = ps.npartitions; + ace_params.build_dir = temp_dir; + ace_params.use_disk = true; + ace_params.max_host_memory_gb = ps.max_host_memory_gb; + ace_params.max_gpu_memory_gb = ps.max_gpu_memory_gb; + hnsw_params.graph_build_params = ace_params; + + auto hnsw_index = + hnsw::build(handle_, hnsw_params, raft::make_const_mdspan(database_host.view())); + ASSERT_NE(hnsw_index, nullptr); + + const auto artifact_path = hnsw_index->file_path(); + ASSERT_FALSE(artifact_path.empty()); + ASSERT_TRUE(std::filesystem::is_regular_file(artifact_path)); + EXPECT_EQ(std::filesystem::path(artifact_path).filename().string(), "hnsw_index.cuvs"); + size_t cuvs_artifact_count = 0; + for (const auto& entry : std::filesystem::directory_iterator(temp_dir)) { + if (entry.path().extension() == ".cuvs") { ++cuvs_artifact_count; } + } + EXPECT_EQ(cuvs_artifact_count, 1); + EXPECT_FALSE(std::filesystem::exists(std::filesystem::path(temp_dir) / "layered_hnsw")); + + auto indexes_hnsw_host = raft::make_host_matrix(ps.n_queries, ps.k); + auto distances_hnsw_host = raft::make_host_matrix(ps.n_queries, ps.k); + + hnsw::search_params search_params; + search_params.ef = std::max(ps.ef_construction, ps.k * 2); + search_params.num_threads = 1; + + EXPECT_THROW(hnsw::search(handle_, + search_params, + *hnsw_index, + queries_host.view(), + indexes_hnsw_host.view(), + distances_hnsw_host.view()), + std::exception); + + hnsw::index* deserialized_index = nullptr; + hnsw::deserialize(handle_, hnsw_params, artifact_path, ps.dim, ps.metric, &deserialized_index); + ASSERT_NE(deserialized_index, nullptr); + std::unique_ptr> deserialized_guard(deserialized_index); + + hnsw::search(handle_, + search_params, + *deserialized_guard, + queries_host.view(), + indexes_hnsw_host.view(), + distances_hnsw_host.view()); + + std::vector indexes_hnsw_converted(queries_size); + std::vector distances_hnsw(queries_size); + for (size_t i = 0; i < queries_size; i++) { + indexes_hnsw_converted[i] = static_cast(indexes_hnsw_host.data_handle()[i]); + distances_hnsw[i] = distances_hnsw_host.data_handle()[i]; + } + + EXPECT_TRUE(cuvs::neighbors::eval_neighbours(indexes_naive, + indexes_hnsw_converted, + distances_naive, + distances_hnsw, + ps.n_queries, + ps.k, + 0.003, + ps.min_recall)) + << "Layered HNSW deserialize and search failed recall check"; + + const auto copied_artifact = + (std::filesystem::path(temp_dir) / "copied_layered" / "hnsw_index.cuvs").string(); + hnsw::serialize(handle_, copied_artifact, *hnsw_index); + EXPECT_TRUE(std::filesystem::is_regular_file(copied_artifact)); + size_t copied_file_count = 0; + for (const auto& entry : std::filesystem::directory_iterator( + std::filesystem::path(copied_artifact).parent_path())) { + if (entry.is_regular_file()) { ++copied_file_count; } + } + EXPECT_EQ(copied_file_count, 1); + + hnsw::index* copied_index = nullptr; + hnsw::deserialize(handle_, hnsw_params, copied_artifact, ps.dim, ps.metric, &copied_index); + ASSERT_NE(copied_index, nullptr); + std::unique_ptr> copied_guard(copied_index); + + hnsw::search(handle_, + search_params, + *copied_guard, + queries_host.view(), + indexes_hnsw_host.view(), + distances_hnsw_host.view()); + + for (size_t i = 0; i < queries_size; i++) { + indexes_hnsw_converted[i] = static_cast(indexes_hnsw_host.data_handle()[i]); + distances_hnsw[i] = distances_hnsw_host.data_handle()[i]; + } + EXPECT_TRUE(cuvs::neighbors::eval_neighbours(indexes_naive, + indexes_hnsw_converted, + distances_naive, + distances_hnsw, + ps.n_queries, + ps.k, + 0.003, + ps.min_recall)) + << "Copied layered HNSW artifact deserialize and search failed recall check"; + + const auto bad_artifact = + (std::filesystem::path(temp_dir) / "bad_layered" / "bad_magic.cuvs").string(); + std::filesystem::create_directories(std::filesystem::path(bad_artifact).parent_path()); + { + std::ofstream bad_file(bad_artifact, std::ios::binary); + std::array bad_bytes{}; + bad_file.write(bad_bytes.data(), bad_bytes.size()); + } + hnsw::index* bad_index = nullptr; + EXPECT_THROW( + hnsw::deserialize(handle_, hnsw_params, bad_artifact, ps.dim, ps.metric, &bad_index), + std::exception); + + const auto bad_version_artifact = + (std::filesystem::path(temp_dir) / "bad_layered" / "bad_version.cuvs").string(); + std::filesystem::copy_file( + copied_artifact, bad_version_artifact, std::filesystem::copy_options::overwrite_existing); + constexpr std::streamoff layered_header_version_offset = 32; + { + std::fstream bad_version_file(bad_version_artifact, + std::ios::in | std::ios::out | std::ios::binary); + const uint32_t bad_version = 999; + bad_version_file.seekp(layered_header_version_offset); + bad_version_file.write(reinterpret_cast(&bad_version), sizeof(bad_version)); + } + hnsw::index* bad_version_index = nullptr; + EXPECT_THROW( + hnsw::deserialize( + handle_, hnsw_params, bad_version_artifact, ps.dim, ps.metric, &bad_version_index), + std::exception); + + auto missing_dataset_params = hnsw_params; + missing_dataset_params.dataset_path.clear(); + hnsw::index* missing_dataset_index = nullptr; + EXPECT_THROW(hnsw::deserialize(handle_, + missing_dataset_params, + copied_artifact, + ps.dim, + ps.metric, + &missing_dataset_index), + std::exception); + + const auto truncated_artifact = + (std::filesystem::path(temp_dir) / "bad_layered" / "truncated.cuvs").string(); + std::filesystem::copy_file( + copied_artifact, truncated_artifact, std::filesystem::copy_options::overwrite_existing); + std::filesystem::resize_file(truncated_artifact, 128); + hnsw::index* truncated_index = nullptr; + EXPECT_THROW(hnsw::deserialize( + handle_, hnsw_params, truncated_artifact, ps.dim, ps.metric, &truncated_index), + std::exception); + + const auto wrong_dataset_file = + (std::filesystem::path(temp_dir) / "bad_layered" / "wrong_dataset.npy").string(); + auto [wrong_dataset_fd, wrong_dataset_header_size] = cuvs::util::create_numpy_file( + wrong_dataset_file, {static_cast(ps.n_rows - 1), static_cast(ps.dim)}); + cuvs::util::write_large_file(wrong_dataset_fd, + database_host.data_handle(), + static_cast(ps.n_rows - 1) * ps.dim * sizeof(DataT), + wrong_dataset_header_size); + auto wrong_dataset_params = hnsw_params; + wrong_dataset_params.dataset_path = wrong_dataset_file; + hnsw::index* wrong_dataset_index = nullptr; + EXPECT_THROW( + hnsw::deserialize( + handle_, wrong_dataset_params, copied_artifact, ps.dim, ps.metric, &wrong_dataset_index), + std::exception); + } + void SetUp() override { database_dev.resize(((size_t)ps.n_rows) * ps.dim, stream_); @@ -323,8 +562,26 @@ inline std::vector generate_hnsw_ace_memory_fallback_inputs() }; } +inline std::vector generate_hnsw_ace_layered_inputs() +{ + return { + {10, // n_queries + 5000, // n_rows + 64, // dim + 10, // k + 2, // npartitions + 100, // ef_construction + true, // use_disk + cuvs::distance::DistanceType::L2Expanded, + 0.9, // min_recall + 0.0, // max_host_memory_gb + 0.0} // max_gpu_memory_gb + }; +} + const std::vector hnsw_ace_inputs = generate_hnsw_ace_inputs(); const std::vector hnsw_ace_memory_fallback_inputs = generate_hnsw_ace_memory_fallback_inputs(); +const std::vector hnsw_ace_layered_inputs = generate_hnsw_ace_layered_inputs(); } // namespace cuvs::neighbors::hnsw diff --git a/cpp/tests/neighbors/ann_hnsw_ace/test_float_uint32_t.cu b/cpp/tests/neighbors/ann_hnsw_ace/test_float_uint32_t.cu index 57b16d74e9..97a6255300 100644 --- a/cpp/tests/neighbors/ann_hnsw_ace/test_float_uint32_t.cu +++ b/cpp/tests/neighbors/ann_hnsw_ace/test_float_uint32_t.cu @@ -23,4 +23,14 @@ INSTANTIATE_TEST_CASE_P(AnnHnswAceMemoryFallbackTest, AnnHnswAceMemoryFallbackTest_float, ::testing::ValuesIn(hnsw_ace_memory_fallback_inputs)); +typedef AnnHnswAceTest AnnHnswAceLayeredTest_float; +TEST_P(AnnHnswAceLayeredTest_float, AnnHnswAceLayeredBuildDeserializeSearch) +{ + this->testHnswAceLayeredBuildDeserializeSearch(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceLayeredTest, + AnnHnswAceLayeredTest_float, + ::testing::ValuesIn(hnsw_ace_layered_inputs)); + } // namespace cuvs::neighbors::hnsw diff --git a/cpp/tests/neighbors/ann_hnsw_ace/test_half_uint32_t.cu b/cpp/tests/neighbors/ann_hnsw_ace/test_half_uint32_t.cu index 38f75b2afa..c77391ba28 100644 --- a/cpp/tests/neighbors/ann_hnsw_ace/test_half_uint32_t.cu +++ b/cpp/tests/neighbors/ann_hnsw_ace/test_half_uint32_t.cu @@ -12,4 +12,24 @@ TEST_P(AnnHnswAceTest_half, AnnHnswAceBuild) { this->testHnswAceBuild(); } INSTANTIATE_TEST_CASE_P(AnnHnswAceTest, AnnHnswAceTest_half, ::testing::ValuesIn(hnsw_ace_inputs)); +typedef AnnHnswAceTest AnnHnswAceMemoryFallbackTest_half; +TEST_P(AnnHnswAceMemoryFallbackTest_half, AnnHnswAceMemoryLimitFallback) +{ + this->testHnswAceMemoryLimitFallback(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceMemoryFallbackTest, + AnnHnswAceMemoryFallbackTest_half, + ::testing::ValuesIn(hnsw_ace_memory_fallback_inputs)); + +typedef AnnHnswAceTest AnnHnswAceLayeredTest_half; +TEST_P(AnnHnswAceLayeredTest_half, AnnHnswAceLayeredBuildDeserializeSearch) +{ + this->testHnswAceLayeredBuildDeserializeSearch(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceLayeredTest, + AnnHnswAceLayeredTest_half, + ::testing::ValuesIn(hnsw_ace_layered_inputs)); + } // namespace cuvs::neighbors::hnsw diff --git a/cpp/tests/neighbors/ann_hnsw_ace/test_int8_t_uint32_t.cu b/cpp/tests/neighbors/ann_hnsw_ace/test_int8_t_uint32_t.cu index 279df6555f..4ee6ecc5e5 100644 --- a/cpp/tests/neighbors/ann_hnsw_ace/test_int8_t_uint32_t.cu +++ b/cpp/tests/neighbors/ann_hnsw_ace/test_int8_t_uint32_t.cu @@ -14,4 +14,24 @@ INSTANTIATE_TEST_CASE_P(AnnHnswAceTest, AnnHnswAceTest_int8_t, ::testing::ValuesIn(hnsw_ace_inputs)); +typedef AnnHnswAceTest AnnHnswAceMemoryFallbackTest_int8_t; +TEST_P(AnnHnswAceMemoryFallbackTest_int8_t, AnnHnswAceMemoryLimitFallback) +{ + this->testHnswAceMemoryLimitFallback(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceMemoryFallbackTest, + AnnHnswAceMemoryFallbackTest_int8_t, + ::testing::ValuesIn(hnsw_ace_memory_fallback_inputs)); + +typedef AnnHnswAceTest AnnHnswAceLayeredTest_int8_t; +TEST_P(AnnHnswAceLayeredTest_int8_t, AnnHnswAceLayeredBuildDeserializeSearch) +{ + this->testHnswAceLayeredBuildDeserializeSearch(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceLayeredTest, + AnnHnswAceLayeredTest_int8_t, + ::testing::ValuesIn(hnsw_ace_layered_inputs)); + } // namespace cuvs::neighbors::hnsw diff --git a/cpp/tests/neighbors/ann_hnsw_ace/test_uint8_t_uint32_t.cu b/cpp/tests/neighbors/ann_hnsw_ace/test_uint8_t_uint32_t.cu index 7e68dc4b17..1ef791df8e 100644 --- a/cpp/tests/neighbors/ann_hnsw_ace/test_uint8_t_uint32_t.cu +++ b/cpp/tests/neighbors/ann_hnsw_ace/test_uint8_t_uint32_t.cu @@ -14,4 +14,24 @@ INSTANTIATE_TEST_CASE_P(AnnHnswAceTest, AnnHnswAceTest_uint8_t, ::testing::ValuesIn(hnsw_ace_inputs)); +typedef AnnHnswAceTest AnnHnswAceMemoryFallbackTest_uint8_t; +TEST_P(AnnHnswAceMemoryFallbackTest_uint8_t, AnnHnswAceMemoryLimitFallback) +{ + this->testHnswAceMemoryLimitFallback(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceMemoryFallbackTest, + AnnHnswAceMemoryFallbackTest_uint8_t, + ::testing::ValuesIn(hnsw_ace_memory_fallback_inputs)); + +typedef AnnHnswAceTest AnnHnswAceLayeredTest_uint8_t; +TEST_P(AnnHnswAceLayeredTest_uint8_t, AnnHnswAceLayeredBuildDeserializeSearch) +{ + this->testHnswAceLayeredBuildDeserializeSearch(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceLayeredTest, + AnnHnswAceLayeredTest_uint8_t, + ::testing::ValuesIn(hnsw_ace_layered_inputs)); + } // namespace cuvs::neighbors::hnsw diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index 034b0b3d96..930f898266 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -35,6 +35,7 @@ add_executable(CAGRA_HNSW_ACE_EXAMPLE src/cagra_hnsw_ace_example.cu) add_executable(CAGRA_PERSISTENT_EXAMPLE src/cagra_persistent_example.cu) add_executable(DYNAMIC_BATCHING_EXAMPLE src/dynamic_batching_example.cu) add_executable(HNSW_ACE_EXAMPLE src/hnsw_ace_example.cu) +add_executable(HNSW_ACE_LAYERED_EXAMPLE src/hnsw_ace_layered_example.cu) add_executable(IVF_FLAT_EXAMPLE src/ivf_flat_example.cu) add_executable(IVF_PQ_EXAMPLE src/ivf_pq_example.cu) add_executable(VAMANA_EXAMPLE src/vamana_example.cu) @@ -52,6 +53,7 @@ target_link_libraries( DYNAMIC_BATCHING_EXAMPLE PRIVATE cuvs::cuvs $ Threads::Threads ) target_link_libraries(HNSW_ACE_EXAMPLE PRIVATE cuvs::cuvs $) +target_link_libraries(HNSW_ACE_LAYERED_EXAMPLE PRIVATE cuvs::cuvs $) target_link_libraries(IVF_PQ_EXAMPLE PRIVATE cuvs::cuvs $) target_link_libraries(IVF_FLAT_EXAMPLE PRIVATE cuvs::cuvs $) target_link_libraries(VAMANA_EXAMPLE PRIVATE cuvs::cuvs $) diff --git a/examples/cpp/src/hnsw_ace_layered_example.cu b/examples/cpp/src/hnsw_ace_layered_example.cu new file mode 100644 index 0000000000..02f735ef37 --- /dev/null +++ b/examples/cpp/src/hnsw_ace_layered_example.cu @@ -0,0 +1,278 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Large graphs exceeding the memory capacity can be built using the Augmented Core Extraction (ACE) +// algorithm, which partitions the dataset. The resulting HNSW index is too large to fit in memory +// as well. Thus, the index needs to be transferred to a search server with enough memory. +// +// HNSWHierarchy::GPU_LAYERED_ON_DISK is a special hierarchy that builds a layered HNSW index on +// disk. It emits one topology-only artifact, hnsw_index.cuvs. The dataset remains separate and does +// not need to be transferred to the search server, which typically has the dataset locally. +// +// This example demonstrates how to build a layered HNSW index with ACE: +// +// 1. Optionally quantize the dataset and queries to int8. +// 2. Build a single-file layered HNSW artifact with ACE using hnsw::build. +// 3. Deserialize the layered HNSW artifact using hnsw::deserialize. +// 4. Search the in-memory HNSW index. +// +// Layered-on-disk layout: +// +// index_dir/hnsw_index.cuvs +// fixed header + metadata JSON +// levels: uint8 [N], max HNSW level for each original row id +// base links: hnswlib-ready [count + uint32 neighbors padded to maxM0] +// upper nodes + upper links: hnswlib-ready upper-layer topology +// +// The transferred index artifact is topology-only. The dataset is loaded locally during +// deserialization from hnsw::index_params::dataset_path. The loader supports .npy and ANN benchmark +// *.bin datasets; this example writes a local dataset .npy only to make the demo self-contained. +// +// Layer 0 rows and neighbor IDs are original dataset row IDs. Upper layers are generated with the +// same level/order/KNN logic as serialize_to_hnswlib_from_disk, then stored as hnswlib-ready link +// rows so deserialization does no graph remapping or link padding on the search node. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "common.cuh" + +// When 1, scalar-quantize the float dataset to int8. +#define HNSW_ACE_LAYERED_USE_QUANTIZATION 1 + +namespace { + +constexpr const char* kBuildDir = "/tmp/hnsw_ace_layered"; + +template +std::string write_local_dataset(raft::host_matrix_view dataset, + const std::string& path) +{ + auto [fd, header_size] = cuvs::util::create_numpy_file( + path, {static_cast(dataset.extent(0)), static_cast(dataset.extent(1))}); + cuvs::util::write_large_file( + fd, dataset.data_handle(), dataset.extent(0) * dataset.extent(1) * sizeof(T), header_size); + return path; +} + +template +struct quantized_pair { + raft::host_matrix dataset; + raft::host_matrix queries; +}; + +quantized_pair quantize_dataset(raft::device_resources const& dev_resources, + raft::host_matrix_view dataset_float, + raft::host_matrix_view queries_float) +{ + std::cout << " quantize_dataset: training scalar quantizer (float -> int8)" << std::endl; + cuvs::preprocessing::quantize::scalar::params qp; + auto quantizer = cuvs::preprocessing::quantize::scalar::train(dev_resources, qp, dataset_float); + + auto dataset_i8 = + raft::make_host_matrix(dataset_float.extent(0), dataset_float.extent(1)); + cuvs::preprocessing::quantize::scalar::transform( + dev_resources, quantizer, dataset_float, dataset_i8.view()); + + auto queries_i8 = + raft::make_host_matrix(queries_float.extent(0), queries_float.extent(1)); + cuvs::preprocessing::quantize::scalar::transform( + dev_resources, quantizer, queries_float, queries_i8.view()); + + return {std::move(dataset_i8), std::move(queries_i8)}; +} + +auto make_hnsw_ace_params(const std::string& build_dir, const std::string& dataset_path) + -> cuvs::neighbors::hnsw::index_params +{ + using namespace cuvs::neighbors; + + hnsw::index_params hnsw_params; + hnsw_params.metric = cuvs::distance::DistanceType::L2Expanded; + hnsw_params.hierarchy = hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK; + hnsw_params.M = 32; + hnsw_params.dataset_path = dataset_path; // Override this path on the search server. + hnsw_params.ef_construction = 120; + + auto ace_params = hnsw::graph_build_params::ace_params(); + ace_params.npartitions = 4; + ace_params.use_disk = true; + ace_params.build_dir = build_dir; + hnsw_params.graph_build_params = ace_params; + + return hnsw_params; +} + +template +auto hnsw_build(raft::device_resources const& dev_resources, + const cuvs::neighbors::hnsw::index_params& hnsw_params, + raft::host_matrix_view dataset) -> std::string +{ + using namespace cuvs::neighbors; + + auto hnsw_index = hnsw::build(dev_resources, hnsw_params, dataset); + const auto artifact_path = hnsw_index->file_path(); + if (artifact_path.empty()) { + throw std::runtime_error("Expected layered HNSW build to return an artifact path."); + } + std::cout << " hnsw_build: layered artifact written to " << artifact_path << std::endl; + return artifact_path; +} + +template +auto hnsw_deserialize(raft::device_resources const& dev_resources, + const cuvs::neighbors::hnsw::index_params& hnsw_params, + const std::string& artifact_path, + int64_t dim) -> std::unique_ptr> +{ + using namespace cuvs::neighbors; + + hnsw::index* deserialized_index = nullptr; + // Set params.dataset_path to the local dataset path to load the dataset from disk. + // hnsw_params.dataset_path = "/tmp/dataset.npy"; + hnsw::deserialize( + dev_resources, hnsw_params, artifact_path, dim, hnsw_params.metric, &deserialized_index); + return std::unique_ptr>(deserialized_index); +} + +template +void hnsw_search(raft::device_resources const& dev_resources, + const cuvs::neighbors::hnsw::index& hnsw_index, + raft::host_matrix_view queries, + int64_t topk = 12) +{ + using namespace cuvs::neighbors; + + const int64_t n_queries = queries.extent(0); + auto indices_hnsw_host = raft::make_host_matrix(n_queries, topk); + auto distances_hnsw_host = raft::make_host_matrix(n_queries, topk); + + hnsw::search_params search_params; + search_params.ef = std::max(200, static_cast(topk) * 2); + search_params.num_threads = 1; + + hnsw::search(dev_resources, + search_params, + hnsw_index, + queries, + indices_hnsw_host.view(), + distances_hnsw_host.view()); + + auto neighbors = raft::make_device_matrix(dev_resources, n_queries, topk); + auto distances = raft::make_device_matrix(dev_resources, n_queries, topk); + auto neighbors_host = raft::make_host_matrix(n_queries, topk); + for (int64_t i = 0; i < n_queries; ++i) { + for (int64_t j = 0; j < topk; ++j) { + neighbors_host(i, j) = static_cast(indices_hnsw_host(i, j)); + } + } + + raft::copy(neighbors.data_handle(), + neighbors_host.data_handle(), + n_queries * topk, + raft::resource::get_cuda_stream(dev_resources)); + raft::copy(distances.data_handle(), + distances_hnsw_host.data_handle(), + n_queries * topk, + raft::resource::get_cuda_stream(dev_resources)); + raft::resource::sync_stream(dev_resources); + + print_results(dev_resources, neighbors.view(), distances.view()); +} + +} // namespace + +int main() +{ + raft::device_resources dev_resources; + + rmm::mr::pool_memory_resource pool_mr(rmm::mr::get_current_device_resource_ref(), + 1024 * 1024 * 1024ull); + rmm::mr::set_current_device_resource(pool_mr); + +#if HNSW_ACE_LAYERED_USE_QUANTIZATION + std::cout << "[stage 1] Generate and quantize dataset (float -> int8)" << std::endl; +#else + std::cout << "[stage 1] Generate dataset (float)" << std::endl; +#endif + + int64_t n_samples = 10000; + int64_t n_dim = 90; + int64_t n_queries = 10; + auto dataset = raft::make_device_matrix(dev_resources, n_samples, n_dim); + auto queries = raft::make_device_matrix(dev_resources, n_queries, n_dim); + generate_dataset(dev_resources, dataset.view(), queries.view()); + + auto dataset_host = raft::make_host_matrix(n_samples, n_dim); + auto queries_host = raft::make_host_matrix(n_queries, n_dim); + raft::copy(dataset_host.data_handle(), + dataset.data_handle(), + dataset.extent(0) * dataset.extent(1), + raft::resource::get_cuda_stream(dev_resources)); + raft::copy(queries_host.data_handle(), + queries.data_handle(), + queries.extent(0) * queries.extent(1), + raft::resource::get_cuda_stream(dev_resources)); + raft::resource::sync_stream(dev_resources); + + auto dataset_host_view = raft::make_host_matrix_view( + dataset_host.data_handle(), n_samples, n_dim); + auto queries_host_view = raft::make_host_matrix_view( + queries_host.data_handle(), n_queries, n_dim); + + std::filesystem::create_directories(kBuildDir); + +#if HNSW_ACE_LAYERED_USE_QUANTIZATION + auto q = quantize_dataset(dev_resources, dataset_host_view, queries_host_view); + auto dataset_i8_view = raft::make_host_matrix_view( + q.dataset.data_handle(), n_samples, n_dim); + auto queries_i8_view = raft::make_host_matrix_view( + q.queries.data_handle(), n_queries, n_dim); + auto dataset_path = write_local_dataset(dataset_i8_view, std::string{kBuildDir} + "/dataset.npy"); + auto hnsw_params = make_hnsw_ace_params(kBuildDir, dataset_path); + + std::cout << "[stage 2] Build layered HNSW index with ACE" << std::endl; + auto artifact_path = hnsw_build(dev_resources, hnsw_params, dataset_i8_view); + + std::cout << "[stage 3] Deserialize layered HNSW index" << std::endl; + auto hnsw_index = hnsw_deserialize(dev_resources, hnsw_params, artifact_path, n_dim); + + std::cout << "[stage 4] Search HNSW index" << std::endl; + hnsw_search(dev_resources, *hnsw_index, queries_i8_view); +#else + auto dataset_path = + write_local_dataset(dataset_host_view, std::string{kBuildDir} + "/dataset.npy"); + auto hnsw_params = make_hnsw_ace_params(kBuildDir, dataset_path); + + std::cout << "[stage 2] Build layered HNSW index with ACE" << std::endl; + auto artifact_path = hnsw_build(dev_resources, hnsw_params, dataset_host_view); + + std::cout << "[stage 3] Deserialize layered HNSW index" << std::endl; + auto hnsw_index = hnsw_deserialize(dev_resources, hnsw_params, artifact_path, n_dim); + + std::cout << "[stage 4] Search HNSW index" << std::endl; + hnsw_search(dev_resources, *hnsw_index, queries_host_view); +#endif + + return 0; +} From a11d9ee6ae0ee1d00f666fc90b40df2de09b64e8 Mon Sep 17 00:00:00 2001 From: Julian Miller Date: Wed, 27 May 2026 10:01:25 +0200 Subject: [PATCH 2/6] Improve deserialization logging --- cpp/src/neighbors/detail/hnsw.hpp | 101 ++++++++++++++++++++++++++++-- 1 file changed, 97 insertions(+), 4 deletions(-) diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp index 44dc4e688e..c5b86076c9 100644 --- a/cpp/src/neighbors/detail/hnsw.hpp +++ b/cpp/src/neighbors/detail/hnsw.hpp @@ -389,6 +389,11 @@ inline auto elapsed_ms_since(std::chrono::steady_clock::time_point start) -> int .count(); } +inline auto elapsed_ms(std::chrono::steady_clock::duration elapsed) -> double +{ + return std::chrono::duration(elapsed).count(); +} + inline auto to_gib(size_t bytes) -> double { return static_cast(bytes) / (1024.0 * 1024.0 * 1024.0); @@ -399,6 +404,13 @@ inline auto throughput_gib_per_s(size_t bytes, int64_t elapsed_ms) -> double return elapsed_ms > 0 ? to_gib(bytes) / (elapsed_ms / 1000.0) : 0.0; } +inline auto throughput_gib_per_s(size_t bytes, std::chrono::steady_clock::duration elapsed) + -> double +{ + const auto elapsed_s = std::chrono::duration(elapsed).count(); + return elapsed_s > 0.0 ? to_gib(bytes) / elapsed_s : 0.0; +} + inline auto make_hnsw_level_plan_from_levels(size_t n_rows, std::vector&& levels, bool build_reverse_order, @@ -2039,6 +2051,8 @@ auto deserialize_layered_hnsw(raft::resources const& res, cuvs::distance::DistanceType metric) -> std::unique_ptr> { common::nvtx::range fun_scope("hnsw::deserialize_layered"); + const auto total_start_time = std::chrono::steady_clock::now(); + const auto metadata_start_time = std::chrono::steady_clock::now(); cuvs::util::file_descriptor artifact_fd(artifact_path, O_RDONLY); layered_hnsw_file_header header{}; cuvs::util::read_large_file(artifact_fd, &header, sizeof(header), 0); @@ -2055,7 +2069,8 @@ auto deserialize_layered_hnsw(raft::resources const& res, std::string metadata_json(header.metadata_size, '\0'); cuvs::util::read_large_file( artifact_fd, metadata_json.data(), metadata_json.size(), header.metadata_offset); - const auto metadata = parse_layered_hnsw_metadata(metadata_json); + const auto metadata = parse_layered_hnsw_metadata(metadata_json); + const auto metadata_elapsed_ms = elapsed_ms_since(metadata_start_time); RAFT_EXPECTS(metadata.n_rows > 0, "Layered HNSW artifact must contain at least one row"); RAFT_EXPECTS(static_cast(dim) == metadata.dim, @@ -2082,8 +2097,15 @@ auto deserialize_layered_hnsw(raft::resources const& res, "Layered HNSW artifact is truncated: expected at least %zu bytes, got %zu", expected_file_size, artifact_size); + RAFT_LOG_INFO("Layered HNSW load: metadata read in %ld ms (rows=%zu dim=%zu artifact=%.2f GiB)", + metadata_elapsed_ms, + metadata.n_rows, + metadata.dim, + to_gib(artifact_size)); - auto dataset_file = open_layered_dataset_file(params.dataset_path); + const auto dataset_open_start_time = std::chrono::steady_clock::now(); + auto dataset_file = open_layered_dataset_file(params.dataset_path); + const auto dataset_open_elapsed_ms = elapsed_ms_since(dataset_open_start_time); RAFT_EXPECTS(dataset_file.shape.size() == 2 && dataset_file.shape[0] == metadata.n_rows && dataset_file.shape[1] == metadata.dim, @@ -2102,6 +2124,11 @@ auto deserialize_layered_hnsw(raft::resources const& res, metadata.base_degree, metadata.maxM0); + RAFT_LOG_INFO("Layered HNSW load: dataset header validated in %ld ms (%s)", + dataset_open_elapsed_ms, + params.dataset_path.c_str()); + + const auto levels_start_time = std::chrono::steady_clock::now(); std::vector levels_u8(metadata.n_rows); cuvs::util::read_large_file(artifact_fd, levels_u8.data(), metadata.levels_bytes, levels_offset); const auto max_level_in_levels = *std::max_element(levels_u8.begin(), levels_u8.end()); @@ -2109,9 +2136,14 @@ auto deserialize_layered_hnsw(raft::resources const& res, "Layered HNSW levels max level (%d) does not match artifact maxlevel (%d)", static_cast(max_level_in_levels), metadata.maxlevel); + const auto levels_elapsed_ms = elapsed_ms_since(levels_start_time); + RAFT_LOG_INFO("Layered HNSW load: levels read in %ld ms (%.2f MiB)", + levels_elapsed_ms, + static_cast(metadata.levels_bytes) / (1024.0 * 1024.0)); - auto hnsw_index = std::make_unique>(dim, metric, params.hierarchy); - auto appr_algo = std::make_unique::type>>( + const auto allocation_start_time = std::chrono::steady_clock::now(); + auto hnsw_index = std::make_unique>(dim, metric, params.hierarchy); + auto appr_algo = std::make_unique::type>>( hnsw_index->get_space(), metadata.n_rows, metadata.M, metadata.ef_construction); appr_algo->cur_element_count = metadata.n_rows; appr_algo->maxlevel_ = metadata.maxlevel; @@ -2123,6 +2155,12 @@ auto deserialize_layered_hnsw(raft::resources const& res, "Layered HNSW upper link row size mismatch"); RAFT_EXPECTS(appr_algo->maxM0_ == metadata.maxM0 && appr_algo->maxM_ == metadata.maxM, "Layered HNSW M parameter mismatch"); + const auto allocation_elapsed_ms = elapsed_ms_since(allocation_start_time); + RAFT_LOG_INFO( + "Layered HNSW load: allocated hnswlib storage in %ld ms (base=%.2f GiB upper=%.2f GiB)", + allocation_elapsed_ms, + to_gib(metadata.n_rows * appr_algo->size_data_per_element_), + to_gib(metadata.upper_nodes_count * metadata.upper_link_row_bytes)); auto num_threads = params.num_threads == 0 ? cuvs::core::omp::get_max_threads() : params.num_threads; @@ -2135,18 +2173,31 @@ auto deserialize_layered_hnsw(raft::resources const& res, raft::make_host_matrix(static_cast(batch_size), metadata.dim); std::vector base_link_buffer(batch_size * metadata.base_link_row_bytes); + const auto base_start_time = std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration dataset_read_time{}; + std::chrono::steady_clock::duration base_link_read_time{}; + std::chrono::steady_clock::duration base_copy_time{}; + size_t dataset_bytes_read = 0; + size_t base_link_bytes_read = 0; for (size_t batch_start = 0; batch_start < metadata.n_rows; batch_start += batch_size) { const auto current_batch_size = std::min(batch_size, metadata.n_rows - batch_start); + auto batch_timer = std::chrono::steady_clock::now(); cuvs::util::read_large_file(dataset_file.fd, dataset_buffer.data_handle(), current_batch_size * metadata.dim * sizeof(T), dataset_file.header_size + batch_start * metadata.dim * sizeof(T)); + dataset_read_time += std::chrono::steady_clock::now() - batch_timer; + dataset_bytes_read += current_batch_size * metadata.dim * sizeof(T); + batch_timer = std::chrono::steady_clock::now(); cuvs::util::read_large_file(artifact_fd, base_link_buffer.data(), current_batch_size * metadata.base_link_row_bytes, base_links_offset + batch_start * metadata.base_link_row_bytes); + base_link_read_time += std::chrono::steady_clock::now() - batch_timer; + base_link_bytes_read += current_batch_size * metadata.base_link_row_bytes; bool link_list_allocation_failed = false; + batch_timer = std::chrono::steady_clock::now(); #pragma omp parallel for num_threads(num_threads) reduction(|| : link_list_allocation_failed) for (int64_t batch_idx = 0; batch_idx < static_cast(current_batch_size); ++batch_idx) { const auto i = batch_start + static_cast(batch_idx); @@ -2177,10 +2228,30 @@ auto deserialize_layered_hnsw(raft::resources const& res, } throw std::runtime_error("Not enough memory to allocate HNSW upper linklists"); } + base_copy_time += std::chrono::steady_clock::now() - batch_timer; } + const auto base_elapsed_ms = elapsed_ms_since(base_start_time); + RAFT_LOG_INFO( + "Layered HNSW load: base layer initialized in %ld ms " + "(dataset read %.2f ms %.2f GiB %.2f GiB/s, links read %.2f ms %.2f GiB %.2f GiB/s, " + "copy %.2f ms)", + base_elapsed_ms, + elapsed_ms(dataset_read_time), + to_gib(dataset_bytes_read), + throughput_gib_per_s(dataset_bytes_read, dataset_read_time), + elapsed_ms(base_link_read_time), + to_gib(base_link_bytes_read), + throughput_gib_per_s(base_link_bytes_read, base_link_read_time), + elapsed_ms(base_copy_time)); RAFT_LOG_INFO("Layered HNSW: loading upper-layer topology"); + const auto upper_start_time = std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration upper_read_time{}; + std::chrono::steady_clock::duration upper_validate_time{}; + std::chrono::steady_clock::duration upper_copy_time{}; + size_t upper_bytes_read = 0; for (const auto& layer : metadata.layers) { + const auto layer_start_time = std::chrono::steady_clock::now(); RAFT_LOG_INFO("Layered HNSW: loading layer %zu (%zu rows, degree %zu)", layer.level, layer.row_count, @@ -2190,6 +2261,7 @@ auto deserialize_layered_hnsw(raft::resources const& res, std::vector link_buffer(layer_batch_size * metadata.upper_link_row_bytes); for (size_t batch_start = 0; batch_start < layer.row_count; batch_start += layer_batch_size) { const auto current_batch_size = std::min(layer_batch_size, layer.row_count - batch_start); + auto batch_timer = std::chrono::steady_clock::now(); cuvs::util::read_large_file( artifact_fd, node_buffer.data(), @@ -2200,6 +2272,9 @@ auto deserialize_layered_hnsw(raft::resources const& res, link_buffer.data(), current_batch_size * metadata.upper_link_row_bytes, upper_links_offset + (layer.link_offset + batch_start) * metadata.upper_link_row_bytes); + upper_read_time += std::chrono::steady_clock::now() - batch_timer; + upper_bytes_read += current_batch_size * (sizeof(uint32_t) + metadata.upper_link_row_bytes); + batch_timer = std::chrono::steady_clock::now(); for (int64_t batch_idx = 0; batch_idx < static_cast(current_batch_size); ++batch_idx) { const auto node_id = static_cast(node_buffer[batch_idx]); @@ -2211,6 +2286,8 @@ auto deserialize_layered_hnsw(raft::resources const& res, node_id, layer.level); } + upper_validate_time += std::chrono::steady_clock::now() - batch_timer; + batch_timer = std::chrono::steady_clock::now(); #pragma omp parallel for num_threads(num_threads) for (int64_t batch_idx = 0; batch_idx < static_cast(current_batch_size); ++batch_idx) { @@ -2220,10 +2297,26 @@ auto deserialize_layered_hnsw(raft::resources const& res, link_buffer.data() + batch_idx * metadata.upper_link_row_bytes, metadata.upper_link_row_bytes); } + upper_copy_time += std::chrono::steady_clock::now() - batch_timer; } + RAFT_LOG_INFO("Layered HNSW load: layer %zu loaded in %ld ms", + layer.level, + elapsed_ms_since(layer_start_time)); } + const auto upper_elapsed_ms = elapsed_ms_since(upper_start_time); + RAFT_LOG_INFO( + "Layered HNSW load: upper layers loaded in %ld ms " + "(read %.2f ms %.2f GiB %.2f GiB/s, validate %.2f ms, copy %.2f ms)", + upper_elapsed_ms, + elapsed_ms(upper_read_time), + to_gib(upper_bytes_read), + throughput_gib_per_s(upper_bytes_read, upper_read_time), + elapsed_ms(upper_validate_time), + elapsed_ms(upper_copy_time)); hnsw_index->set_index(std::move(appr_algo)); + RAFT_LOG_INFO("Layered HNSW load: total deserialize completed in %ld ms", + elapsed_ms_since(total_start_time)); return hnsw_index; } From a95b0e0c0014982f469e62e757a349798eaedb2b Mon Sep 17 00:00:00 2001 From: Julian Miller Date: Wed, 27 May 2026 11:58:21 +0200 Subject: [PATCH 3/6] Use ace prefix in benchmarking consistently --- cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu index b535ced763..d4791dde11 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu @@ -64,16 +64,18 @@ auto parse_build_param(const nlohmann::json& conf) -> cuvs::neighbors::cagra::hnsw_heuristic_type::SAME_GRAPH_FOOTPRINT, dist_type); ps.metric = dist_type; - // Parse ACE parameters if provided - if (conf.contains("npartitions") || conf.contains("build_dir") || - conf.contains("ef_construction") || conf.contains("use_disk")) { + // Parse ACE parameters if provided. + auto ace_conf = collect_conf_with_prefix(conf, "ace_"); + if (!ace_conf.empty()) { auto ace_params = cuvs::neighbors::cagra::graph_build_params::ace_params(); - if (conf.contains("npartitions")) { ace_params.npartitions = conf.at("npartitions"); } - if (conf.contains("build_dir")) { ace_params.build_dir = conf.at("build_dir"); } - if (conf.contains("ef_construction")) { - ace_params.ef_construction = conf.at("ef_construction"); + if (ace_conf.contains("npartitions")) { + ace_params.npartitions = ace_conf.at("npartitions"); } - if (conf.contains("use_disk")) { ace_params.use_disk = conf.at("use_disk"); } + if (ace_conf.contains("build_dir")) { ace_params.build_dir = ace_conf.at("build_dir"); } + if (ace_conf.contains("ef_construction")) { + ace_params.ef_construction = ace_conf.at("ef_construction"); + } + if (ace_conf.contains("use_disk")) { ace_params.use_disk = ace_conf.at("use_disk"); } ps.graph_build_params = ace_params; } // NB: above, we only provide the defaults. Below we parse the explicit parameters as usual. From 47e2d858a410030913bd587ed2c11661694ad95d Mon Sep 17 00:00:00 2001 From: Julian Miller Date: Wed, 27 May 2026 11:59:10 +0200 Subject: [PATCH 4/6] Validate metadata before allocating --- cpp/src/neighbors/detail/hnsw.hpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp index c5b86076c9..d048d322a2 100644 --- a/cpp/src/neighbors/detail/hnsw.hpp +++ b/cpp/src/neighbors/detail/hnsw.hpp @@ -2065,6 +2065,14 @@ auto deserialize_layered_hnsw(raft::resources const& res, RAFT_EXPECTS(header.metadata_offset >= sizeof(layered_hnsw_file_header), "Invalid layered HNSW metadata offset"); RAFT_EXPECTS(header.metadata_size > 0, "Invalid layered HNSW metadata size"); + const auto artifact_size = static_cast(std::filesystem::file_size(artifact_path)); + RAFT_EXPECTS(header.metadata_offset <= artifact_size && + header.metadata_size <= artifact_size - header.metadata_offset, + "Layered HNSW metadata range is outside artifact: offset=%zu size=%zu " + "artifact=%zu", + static_cast(header.metadata_offset), + static_cast(header.metadata_size), + artifact_size); std::string metadata_json(header.metadata_size, '\0'); cuvs::util::read_large_file( @@ -2092,7 +2100,6 @@ auto deserialize_layered_hnsw(raft::resources const& res, const auto upper_nodes_offset = base_links_offset + metadata.base_links_bytes; const auto upper_links_offset = upper_nodes_offset + metadata.upper_nodes_bytes; const auto expected_file_size = upper_links_offset + metadata.upper_links_bytes; - const auto artifact_size = static_cast(std::filesystem::file_size(artifact_path)); RAFT_EXPECTS(artifact_size >= expected_file_size, "Layered HNSW artifact is truncated: expected at least %zu bytes, got %zu", expected_file_size, From 21ce339abadb11c2bcbeca3c389614877f4e72d1 Mon Sep 17 00:00:00 2001 From: Julian Miller Date: Mon, 1 Jun 2026 10:12:45 +0200 Subject: [PATCH 5/6] Store layered base topology by original node ID - Add base node IDs for sequential access. - Scattered writes happen only in deserialization step using host memory. --- cpp/src/neighbors/detail/hnsw.hpp | 351 ++++++++++--------- examples/cpp/src/hnsw_ace_layered_example.cu | 8 +- 2 files changed, 185 insertions(+), 174 deletions(-) diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp index d048d322a2..d090de07f8 100644 --- a/cpp/src/neighbors/detail/hnsw.hpp +++ b/cpp/src/neighbors/detail/hnsw.hpp @@ -40,8 +40,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -519,6 +519,7 @@ struct layered_hnsw_file_metadata { size_t ef_construction = 0; size_t base_degree = 0; size_t levels_bytes = 0; + size_t base_nodes_bytes = 0; size_t base_link_row_bytes = 0; size_t base_links_bytes = 0; size_t upper_nodes_count = 0; @@ -580,12 +581,13 @@ auto make_layered_hnsw_metadata_json(const layered_hnsw_file_metadata& metadata, std::stringstream os; os << "{\n"; os << " \"format\": \"cuvs_hnsw_layered\",\n"; - os << " \"version\": 1,\n"; + os << " \"version\": " << layered_hnsw_version << ",\n"; os << " \"n_rows\": " << metadata.n_rows << ",\n"; os << " \"dim\": " << metadata.dim << ",\n"; os << " \"dtype\": \"" << layered_dtype_name() << "\",\n"; os << " \"metric\": \"" << metric_name(metric) << "\",\n"; os << " \"row_order\": \"original_id\",\n"; + os << " \"base_link_order\": \"base_nodes\",\n"; os << " \"M\": " << metadata.M << ",\n"; os << " \"maxM\": " << metadata.maxM << ",\n"; os << " \"maxM0\": " << metadata.maxM0 << ",\n"; @@ -595,6 +597,7 @@ auto make_layered_hnsw_metadata_json(const layered_hnsw_file_metadata& metadata, os << " \"enterpoint_node\": " << metadata.enterpoint_node << ",\n"; os << " \"base_degree\": " << metadata.base_degree << ",\n"; os << " \"levels_bytes\": " << metadata.levels_bytes << ",\n"; + os << " \"base_nodes_bytes\": " << metadata.base_nodes_bytes << ",\n"; os << " \"base_link_row_bytes\": " << metadata.base_link_row_bytes << ",\n"; os << " \"base_links_bytes\": " << metadata.base_links_bytes << ",\n"; os << " \"upper_nodes_count\": " << metadata.upper_nodes_count << ",\n"; @@ -665,6 +668,7 @@ inline auto parse_layered_hnsw_metadata(const std::string& json) -> layered_hnsw metadata.enterpoint_node = static_cast(json_parse_size(json, "enterpoint_node")); metadata.base_degree = json_parse_size(json, "base_degree"); metadata.levels_bytes = json_parse_size(json, "levels_bytes"); + metadata.base_nodes_bytes = json_parse_size(json, "base_nodes_bytes"); metadata.base_link_row_bytes = json_parse_size(json, "base_link_row_bytes"); metadata.base_links_bytes = json_parse_size(json, "base_links_bytes"); metadata.upper_nodes_count = json_parse_size(json, "upper_nodes_count"); @@ -766,13 +770,15 @@ inline auto open_layered_dataset_file(const std::string& path) -> npy_file template void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index& index_, const cuvs::util::file_descriptor& output_fd, - size_t output_offset, + size_t base_nodes_offset, + size_t base_links_offset, size_t base_link_row_bytes, size_t maxM0) { - const auto total_start_time = std::chrono::steady_clock::now(); - const auto& graph_fd_opt = index_.graph_fd(); - const auto& mapping_fd_opt = index_.mapping_fd(); + static_assert(std::is_same_v, + "Layered HNSW artifacts store topology ids as uint32_t"); + const auto& graph_fd_opt = index_.graph_fd(); + const auto& mapping_fd_opt = index_.mapping_fd(); RAFT_EXPECTS(graph_fd_opt.has_value() && graph_fd_opt->is_valid(), "Graph file descriptor is not available"); RAFT_EXPECTS(mapping_fd_opt.has_value() && mapping_fd_opt->is_valid(), @@ -812,7 +818,6 @@ void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index reordered_to_original(n_rows); - std::vector original_to_reordered(n_rows); cuvs::util::read_large_file( mapping_npy.fd, reordered_to_original.data(), mapping_bytes, mapping_npy.header_size); for (size_t reordered_id = 0; reordered_id < n_rows; ++reordered_id) { @@ -821,7 +826,6 @@ void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index(reordered_id); } const auto mapping_elapsed_ms = elapsed_ms_since(mapping_start_time); RAFT_LOG_INFO("HNSW remap: mapping loaded in %ld ms (%.2f GiB, %.2f GiB/s)", @@ -829,112 +833,80 @@ void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index(1, target_batch_bytes / base_link_row_bytes); - const size_t max_gap_rows = std::max(1, (128 * 1024) / graph_row_bytes); - const size_t max_span_rows = std::max(1, (8 * 1024 * 1024) / graph_row_bytes); - - RAFT_LOG_INFO( - "HNSW remap: n_rows=%zu degree=%zu maxM0=%zu batch_size=%zu max_gap=%zu rows max_span=%zu rows", - n_rows, - degree, - maxM0, - batch_size, - max_gap_rows, - max_span_rows); - - auto span_buffer = raft::make_host_matrix(static_cast(max_span_rows), - static_cast(degree)); - std::vector output_buffer(batch_size * base_link_row_bytes); - std::vector source_rows(batch_size); - std::vector order(batch_size); - - const auto graph_start_time = std::chrono::steady_clock::now(); - size_t spans_read = 0; - size_t bytes_read = 0; - size_t bytes_written = 0; - - for (size_t batch_start = 0; batch_start < n_rows; batch_start += batch_size) { - const auto current_batch_size = std::min(batch_size, n_rows - batch_start); - std::fill( - output_buffer.begin(), output_buffer.begin() + current_batch_size * base_link_row_bytes, 0); - for (size_t i = 0; i < current_batch_size; ++i) { - source_rows[i] = original_to_reordered[batch_start + i]; - order[i] = i; - } - std::sort(order.begin(), order.begin() + current_batch_size, [&](size_t lhs, size_t rhs) { - return source_rows[lhs] < source_rows[rhs]; - }); - - size_t sorted_pos = 0; - while (sorted_pos < current_batch_size) { - const auto span_start = static_cast(source_rows[order[sorted_pos]]); - auto span_end = span_start; - auto span_next = sorted_pos + 1; - while (span_next < current_batch_size) { - const auto next_row = static_cast(source_rows[order[span_next]]); - if (next_row - span_start + 1 > max_span_rows) { break; } - if (next_row > span_end + max_gap_rows + 1) { break; } - span_end = next_row; - ++span_next; - } - - const auto span_rows = span_end - span_start + 1; - const auto span_bytes = span_rows * graph_row_bytes; - cuvs::util::read_large_file(graph_npy.fd, - span_buffer.data_handle(), - span_bytes, - graph_npy.header_size + span_start * graph_row_bytes); - ++spans_read; - bytes_read += span_bytes; - - for (size_t i = sorted_pos; i < span_next; ++i) { - const auto dst_row = order[i]; - const auto source_row = static_cast(source_rows[dst_row]); - auto* src = span_buffer.data_handle() + (source_row - span_start) * degree; - auto* dst_row_ptr = output_buffer.data() + dst_row * base_link_row_bytes; - hnswlib::linklistsizeint list_count = static_cast(degree); - std::memcpy(dst_row_ptr, &list_count, sizeof(list_count)); - auto* dst = reinterpret_cast(dst_row_ptr + sizeof(hnswlib::linklistsizeint)); - for (size_t j = 0; j < degree; ++j) { - const auto neighbor = static_cast(src[j]); - RAFT_EXPECTS(neighbor < n_rows, - "Invalid reordered neighbor id %zu in ACE graph row %zu", - neighbor, - source_row); - dst[j] = reordered_to_original[neighbor]; + const auto total_start_time = std::chrono::steady_clock::now(); + const auto graph_row_bytes = degree * sizeof(IdxT); + const auto base_node_row_bytes = sizeof(IdxT); + const auto base_topology_row_size = graph_row_bytes + base_node_row_bytes + base_link_row_bytes; + const size_t target_batch_bytes = 64 * 1024 * 1024; + const size_t batch_size = std::max(1, target_batch_bytes / base_topology_row_size); + auto graph_buffer = raft::make_host_matrix(static_cast(batch_size), + static_cast(degree)); + std::vector base_node_buffer(batch_size); + std::vector base_link_buffer(batch_size * base_link_row_bytes); + + size_t graph_bytes_read = 0; + size_t node_bytes_written = 0; + size_t link_bytes_written = 0; + for (size_t source_start = 0; source_start < n_rows; source_start += batch_size) { + const auto current_batch_size = std::min(batch_size, n_rows - source_start); + const auto batch_bytes = current_batch_size * graph_row_bytes; + cuvs::util::read_large_file(graph_npy.fd, + graph_buffer.data_handle(), + batch_bytes, + graph_npy.header_size + source_start * graph_row_bytes); + graph_bytes_read += batch_bytes; + + bool invalid_neighbor = false; +#pragma omp parallel for reduction(|| : invalid_neighbor) + for (int64_t batch_idx = 0; batch_idx < static_cast(current_batch_size); ++batch_idx) { + const auto source_row = source_start + static_cast(batch_idx); + const auto original_id = static_cast(reordered_to_original[source_row]); + base_node_buffer[batch_idx] = static_cast(original_id); + auto* dst_row_ptr = base_link_buffer.data() + batch_idx * base_link_row_bytes; + hnswlib::linklistsizeint list_count = static_cast(degree); + std::memcpy(dst_row_ptr, &list_count, sizeof(list_count)); + auto* dst = reinterpret_cast(dst_row_ptr + sizeof(hnswlib::linklistsizeint)); + auto* src = graph_buffer.data_handle() + batch_idx * degree; + for (size_t j = 0; j < degree; ++j) { + const auto neighbor = static_cast(src[j]); + if (neighbor >= n_rows) { + invalid_neighbor = true; + continue; } + dst[j] = reordered_to_original[neighbor]; + } + const auto written_bytes = sizeof(hnswlib::linklistsizeint) + degree * sizeof(IdxT); + if (written_bytes < base_link_row_bytes) { + std::memset(dst_row_ptr + written_bytes, 0, base_link_row_bytes - written_bytes); } - sorted_pos = span_next; } + RAFT_EXPECTS(!invalid_neighbor, "Invalid reordered neighbor id in ACE graph"); - const auto batch_bytes = current_batch_size * base_link_row_bytes; + const auto current_node_bytes = current_batch_size * sizeof(IdxT); + const auto current_link_bytes = current_batch_size * base_link_row_bytes; + cuvs::util::write_large_file(output_fd, + base_node_buffer.data(), + current_node_bytes, + base_nodes_offset + source_start * sizeof(IdxT)); cuvs::util::write_large_file(output_fd, - output_buffer.data(), - batch_bytes, - output_offset + batch_start * base_link_row_bytes); - bytes_written += batch_bytes; + base_link_buffer.data(), + current_link_bytes, + base_links_offset + source_start * base_link_row_bytes); + node_bytes_written += current_node_bytes; + link_bytes_written += current_link_bytes; } - const auto graph_elapsed_ms = elapsed_ms_since(graph_start_time); const auto total_elapsed_ms = elapsed_ms_since(total_start_time); - const auto read_gib = to_gib(bytes_read); - const auto written_gib = to_gib(bytes_written); - const auto ideal_read_bytes = n_rows * graph_row_bytes; - const auto amplification = - ideal_read_bytes == 0 ? 0.0 : static_cast(bytes_read) / ideal_read_bytes; RAFT_LOG_INFO( - "HNSW remap: base links written in %ld ms: read %.2f GiB at %.2f GiB/s across %zu spans " - "(%.2fx amplification), wrote %.2f GiB at %.2f GiB/s", - graph_elapsed_ms, - read_gib, - throughput_gib_per_s(bytes_read, graph_elapsed_ms), - spans_read, - amplification, - written_gib, - throughput_gib_per_s(bytes_written, graph_elapsed_ms)); - RAFT_LOG_INFO("HNSW remap: completed in %ld ms total", total_elapsed_ms); + "HNSW remap: base topology written in %ld ms (graph_read=%.2f GiB %.2f GiB/s, " + "artifact_write=%.2f GiB %.2f GiB/s, nodes=%.2f GiB links=%.2f GiB)", + total_elapsed_ms, + to_gib(graph_bytes_read), + throughput_gib_per_s(graph_bytes_read, total_elapsed_ms), + to_gib(node_bytes_written + link_bytes_written), + throughput_gib_per_s(node_bytes_written + link_bytes_written, total_elapsed_ms), + to_gib(node_bytes_written), + to_gib(link_bytes_written)); } template @@ -975,7 +947,7 @@ auto serialize_to_layered_hnsw_from_disk( RAFT_LOG_INFO("Layered HNSW: generating hierarchy levels"); const auto hierarchy_start_time = std::chrono::steady_clock::now(); - auto hierarchy = make_random_hnsw_level_plan(n_rows, *appr_algo, "Layered HNSW: Level "); + auto hierarchy = make_random_hnsw_level_plan(n_rows, *appr_algo, nullptr); const auto hierarchy_elapsed_ms = elapsed_ms_since(hierarchy_start_time); RAFT_LOG_INFO("Layered HNSW: hierarchy levels generated in %ld ms (max_level=%d, promoted=%zu)", hierarchy_elapsed_ms, @@ -994,6 +966,7 @@ auto serialize_to_layered_hnsw_from_disk( metadata.enterpoint_node = static_cast(hierarchy.order.back()); metadata.base_degree = static_cast(graph_degree_int); metadata.levels_bytes = n_rows * sizeof(uint8_t); + metadata.base_nodes_bytes = n_rows * sizeof(IdxT); metadata.base_link_row_bytes = appr_algo->size_links_level0_; metadata.base_links_bytes = n_rows * metadata.base_link_row_bytes; metadata.upper_link_row_bytes = appr_algo->size_links_per_element_; @@ -1022,7 +995,8 @@ auto serialize_to_layered_hnsw_from_disk( const auto payload_offset = align_up(header.metadata_offset + header.metadata_size, layered_hnsw_alignment); const auto levels_offset = payload_offset; - const auto base_links_offset = levels_offset + metadata.levels_bytes; + const auto base_nodes_offset = levels_offset + metadata.levels_bytes; + const auto base_links_offset = base_nodes_offset + metadata.base_nodes_bytes; const auto upper_nodes_offset = base_links_offset + metadata.base_links_bytes; const auto upper_links_offset = upper_nodes_offset + metadata.upper_nodes_bytes; const auto final_file_size = upper_links_offset + metadata.upper_links_bytes; @@ -1048,8 +1022,12 @@ auto serialize_to_layered_hnsw_from_disk( RAFT_LOG_INFO("Layered HNSW: writing hnswlib-ready base links"); const auto layer0_start_time = std::chrono::steady_clock::now(); - write_layered_base_links_from_disk( - index_, artifact_fd, base_links_offset, metadata.base_link_row_bytes, metadata.maxM0); + write_layered_base_links_from_disk(index_, + artifact_fd, + base_nodes_offset, + base_links_offset, + metadata.base_link_row_bytes, + metadata.maxM0); const auto layer0_elapsed_ms = elapsed_ms_since(layer0_start_time); RAFT_LOG_INFO("Layered HNSW: base links written in %ld ms (%.2f GiB, %.2f GiB/s effective)", layer0_elapsed_ms, @@ -2081,6 +2059,7 @@ auto deserialize_layered_hnsw(raft::resources const& res, const auto metadata_elapsed_ms = elapsed_ms_since(metadata_start_time); RAFT_EXPECTS(metadata.n_rows > 0, "Layered HNSW artifact must contain at least one row"); + RAFT_EXPECTS(metadata.dim > 0, "Layered HNSW artifact must contain at least one dimension"); RAFT_EXPECTS(static_cast(dim) == metadata.dim, "Layered HNSW artifact dim (%zu) does not match requested dim (%d)", metadata.dim, @@ -2096,7 +2075,8 @@ auto deserialize_layered_hnsw(raft::resources const& res, const auto payload_offset = align_up(header.metadata_offset + header.metadata_size, layered_hnsw_alignment); const auto levels_offset = payload_offset; - const auto base_links_offset = levels_offset + metadata.levels_bytes; + const auto base_nodes_offset = levels_offset + metadata.levels_bytes; + const auto base_links_offset = base_nodes_offset + metadata.base_nodes_bytes; const auto upper_nodes_offset = base_links_offset + metadata.base_links_bytes; const auto upper_links_offset = upper_nodes_offset + metadata.upper_nodes_bytes; const auto expected_file_size = upper_links_offset + metadata.upper_links_bytes; @@ -2119,6 +2099,8 @@ auto deserialize_layered_hnsw(raft::resources const& res, "Layered HNSW dataset shape mismatch"); RAFT_EXPECTS(metadata.levels_bytes == metadata.n_rows * sizeof(uint8_t), "Layered HNSW levels section size mismatch"); + RAFT_EXPECTS(metadata.base_nodes_bytes == metadata.n_rows * sizeof(uint32_t), + "Layered HNSW base node section size mismatch"); RAFT_EXPECTS(metadata.base_links_bytes == metadata.n_rows * metadata.base_link_row_bytes, "Layered HNSW base links section size mismatch"); RAFT_EXPECTS(metadata.upper_nodes_bytes == metadata.upper_nodes_count * sizeof(uint32_t), @@ -2172,36 +2154,26 @@ auto deserialize_layered_hnsw(raft::resources const& res, auto num_threads = params.num_threads == 0 ? cuvs::core::omp::get_max_threads() : params.num_threads; - RAFT_LOG_INFO("Layered HNSW: initializing in-memory hnswlib index from local dataset"); const size_t target_batch_bytes = 64 * 1024 * 1024; - const size_t row_bytes = metadata.dim * sizeof(T) + metadata.base_link_row_bytes; - const size_t batch_size = std::max(1, target_batch_bytes / row_bytes); + const size_t dataset_row_bytes = metadata.dim * sizeof(T); + const size_t batch_size = std::max(1, target_batch_bytes / dataset_row_bytes); auto dataset_buffer = raft::make_host_matrix(static_cast(batch_size), metadata.dim); - std::vector base_link_buffer(batch_size * metadata.base_link_row_bytes); const auto base_start_time = std::chrono::steady_clock::now(); std::chrono::steady_clock::duration dataset_read_time{}; - std::chrono::steady_clock::duration base_link_read_time{}; std::chrono::steady_clock::duration base_copy_time{}; - size_t dataset_bytes_read = 0; - size_t base_link_bytes_read = 0; + size_t dataset_bytes_read = 0; for (size_t batch_start = 0; batch_start < metadata.n_rows; batch_start += batch_size) { - const auto current_batch_size = std::min(batch_size, metadata.n_rows - batch_start); - auto batch_timer = std::chrono::steady_clock::now(); + const auto current_batch_size = std::min(batch_size, metadata.n_rows - batch_start); + const auto current_dataset_bytes = current_batch_size * metadata.dim * sizeof(T); + auto batch_timer = std::chrono::steady_clock::now(); cuvs::util::read_large_file(dataset_file.fd, dataset_buffer.data_handle(), - current_batch_size * metadata.dim * sizeof(T), + current_dataset_bytes, dataset_file.header_size + batch_start * metadata.dim * sizeof(T)); dataset_read_time += std::chrono::steady_clock::now() - batch_timer; - dataset_bytes_read += current_batch_size * metadata.dim * sizeof(T); - batch_timer = std::chrono::steady_clock::now(); - cuvs::util::read_large_file(artifact_fd, - base_link_buffer.data(), - current_batch_size * metadata.base_link_row_bytes, - base_links_offset + batch_start * metadata.base_link_row_bytes); - base_link_read_time += std::chrono::steady_clock::now() - batch_timer; - base_link_bytes_read += current_batch_size * metadata.base_link_row_bytes; + dataset_bytes_read += current_dataset_bytes; bool link_list_allocation_failed = false; batch_timer = std::chrono::steady_clock::now(); @@ -2210,10 +2182,6 @@ auto deserialize_layered_hnsw(raft::resources const& res, const auto i = batch_start + static_cast(batch_idx); auto level = static_cast(levels_u8[i]); appr_algo->element_levels_[i] = level; - auto ll0 = appr_algo->get_linklist0(i); - memcpy(ll0, - base_link_buffer.data() + batch_idx * metadata.base_link_row_bytes, - metadata.base_link_row_bytes); memcpy(appr_algo->getDataByInternalId(i), dataset_buffer.data_handle() + batch_idx * metadata.dim, appr_algo->data_size_); @@ -2225,7 +2193,6 @@ auto deserialize_layered_hnsw(raft::resources const& res, link_list_allocation_failed = true; continue; } - memset(appr_algo->linkLists_[i], 0, link_list_size); } } if (link_list_allocation_failed) { @@ -2239,36 +2206,83 @@ auto deserialize_layered_hnsw(raft::resources const& res, } const auto base_elapsed_ms = elapsed_ms_since(base_start_time); RAFT_LOG_INFO( - "Layered HNSW load: base layer initialized in %ld ms " - "(dataset read %.2f ms %.2f GiB %.2f GiB/s, links read %.2f ms %.2f GiB %.2f GiB/s, " - "copy %.2f ms)", + "Layered HNSW load: hnswlib data and levels initialized in %ld ms " + "(dataset read %.2f ms %.2f GiB %.2f GiB/s, copy %.2f ms)", base_elapsed_ms, elapsed_ms(dataset_read_time), to_gib(dataset_bytes_read), throughput_gib_per_s(dataset_bytes_read, dataset_read_time), - elapsed_ms(base_link_read_time), - to_gib(base_link_bytes_read), - throughput_gib_per_s(base_link_bytes_read, base_link_read_time), elapsed_ms(base_copy_time)); - RAFT_LOG_INFO("Layered HNSW: loading upper-layer topology"); + const auto base_topology_start_time = std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration base_topology_read_time{}; + std::chrono::steady_clock::duration base_topology_copy_time{}; + size_t base_topology_bytes_read = 0; + const size_t base_topology_row_bytes = sizeof(uint32_t) + metadata.base_link_row_bytes; + const size_t base_topology_batch_size = + std::max(1, target_batch_bytes / base_topology_row_bytes); + std::vector base_node_buffer(base_topology_batch_size); + std::vector base_link_buffer(base_topology_batch_size * metadata.base_link_row_bytes); + for (size_t batch_start = 0; batch_start < metadata.n_rows; + batch_start += base_topology_batch_size) { + const auto current_batch_size = + std::min(base_topology_batch_size, metadata.n_rows - batch_start); + const auto current_base_topology_bytes = + current_batch_size * (sizeof(uint32_t) + metadata.base_link_row_bytes); + auto batch_timer = std::chrono::steady_clock::now(); + cuvs::util::read_large_file(artifact_fd, + base_node_buffer.data(), + current_batch_size * sizeof(uint32_t), + base_nodes_offset + batch_start * sizeof(uint32_t)); + cuvs::util::read_large_file(artifact_fd, + base_link_buffer.data(), + current_batch_size * metadata.base_link_row_bytes, + base_links_offset + batch_start * metadata.base_link_row_bytes); + base_topology_read_time += std::chrono::steady_clock::now() - batch_timer; + base_topology_bytes_read += current_base_topology_bytes; + + batch_timer = std::chrono::steady_clock::now(); + bool invalid_node_id = false; +#pragma omp parallel for num_threads(num_threads) reduction(|| : invalid_node_id) + for (int64_t batch_idx = 0; batch_idx < static_cast(current_batch_size); ++batch_idx) { + const auto node_id = static_cast(base_node_buffer[batch_idx]); + if (node_id >= metadata.n_rows) { + invalid_node_id = true; + continue; + } + auto ll0 = appr_algo->get_linklist0(node_id); + memcpy(ll0, + base_link_buffer.data() + batch_idx * metadata.base_link_row_bytes, + metadata.base_link_row_bytes); + } + RAFT_EXPECTS(!invalid_node_id, "Invalid base-layer node id in layered HNSW artifact"); + base_topology_copy_time += std::chrono::steady_clock::now() - batch_timer; + } + const auto base_topology_elapsed_ms = elapsed_ms_since(base_topology_start_time); + RAFT_LOG_INFO( + "Layered HNSW load: base-layer topology loaded in %ld ms " + "(read %.2f ms %.2f GiB %.2f GiB/s, scatter-copy %.2f ms)", + base_topology_elapsed_ms, + elapsed_ms(base_topology_read_time), + to_gib(base_topology_bytes_read), + throughput_gib_per_s(base_topology_bytes_read, base_topology_read_time), + elapsed_ms(base_topology_copy_time)); + const auto upper_start_time = std::chrono::steady_clock::now(); std::chrono::steady_clock::duration upper_read_time{}; - std::chrono::steady_clock::duration upper_validate_time{}; std::chrono::steady_clock::duration upper_copy_time{}; size_t upper_bytes_read = 0; for (const auto& layer : metadata.layers) { - const auto layer_start_time = std::chrono::steady_clock::now(); - RAFT_LOG_INFO("Layered HNSW: loading layer %zu (%zu rows, degree %zu)", - layer.level, - layer.row_count, - layer.degree); - const auto layer_batch_size = std::min(batch_size, layer.row_count); + const auto layer_row_bytes = sizeof(uint32_t) + metadata.upper_link_row_bytes; + const auto layer_batch_size = + std::min(std::max(1, target_batch_bytes / layer_row_bytes), layer.row_count); std::vector node_buffer(layer_batch_size); std::vector link_buffer(layer_batch_size * metadata.upper_link_row_bytes); for (size_t batch_start = 0; batch_start < layer.row_count; batch_start += layer_batch_size) { const auto current_batch_size = std::min(layer_batch_size, layer.row_count - batch_start); - auto batch_timer = std::chrono::steady_clock::now(); + const auto current_upper_bytes = + current_batch_size * (sizeof(uint32_t) + metadata.upper_link_row_bytes); + auto batch_timer = std::chrono::steady_clock::now(); cuvs::util::read_large_file( artifact_fd, node_buffer.data(), @@ -2280,45 +2294,42 @@ auto deserialize_layered_hnsw(raft::resources const& res, current_batch_size * metadata.upper_link_row_bytes, upper_links_offset + (layer.link_offset + batch_start) * metadata.upper_link_row_bytes); upper_read_time += std::chrono::steady_clock::now() - batch_timer; - upper_bytes_read += current_batch_size * (sizeof(uint32_t) + metadata.upper_link_row_bytes); - batch_timer = std::chrono::steady_clock::now(); + upper_bytes_read += current_upper_bytes; + batch_timer = std::chrono::steady_clock::now(); + bool invalid_node_id = false; + bool invalid_node_level = false; +#pragma omp parallel for num_threads(num_threads) \ + reduction(|| : invalid_node_id, invalid_node_level) for (int64_t batch_idx = 0; batch_idx < static_cast(current_batch_size); ++batch_idx) { const auto node_id = static_cast(node_buffer[batch_idx]); - RAFT_EXPECTS(node_id < metadata.n_rows, - "Invalid upper-layer node id %zu in layered HNSW artifact", - node_id); - RAFT_EXPECTS(layer.level <= static_cast(levels_u8[node_id]), - "Layered HNSW artifact references node %zu at invalid level %zu", - node_id, - layer.level); - } - upper_validate_time += std::chrono::steady_clock::now() - batch_timer; - batch_timer = std::chrono::steady_clock::now(); -#pragma omp parallel for num_threads(num_threads) - for (int64_t batch_idx = 0; batch_idx < static_cast(current_batch_size); - ++batch_idx) { - const auto node_id = static_cast(node_buffer[batch_idx]); - auto ll = appr_algo->get_linklist(node_id, layer.level); + if (node_id >= metadata.n_rows) { + invalid_node_id = true; + continue; + } + if (layer.level > static_cast(levels_u8[node_id])) { + invalid_node_level = true; + continue; + } + auto ll = appr_algo->get_linklist(node_id, layer.level); memcpy(ll, link_buffer.data() + batch_idx * metadata.upper_link_row_bytes, metadata.upper_link_row_bytes); } + RAFT_EXPECTS(!invalid_node_id, "Invalid upper-layer node id in layered HNSW artifact"); + RAFT_EXPECTS(!invalid_node_level, + "Layered HNSW artifact references a node at an invalid upper level"); upper_copy_time += std::chrono::steady_clock::now() - batch_timer; } - RAFT_LOG_INFO("Layered HNSW load: layer %zu loaded in %ld ms", - layer.level, - elapsed_ms_since(layer_start_time)); } const auto upper_elapsed_ms = elapsed_ms_since(upper_start_time); RAFT_LOG_INFO( "Layered HNSW load: upper layers loaded in %ld ms " - "(read %.2f ms %.2f GiB %.2f GiB/s, validate %.2f ms, copy %.2f ms)", + "(read %.2f ms %.2f GiB %.2f GiB/s, validate+copy %.2f ms)", upper_elapsed_ms, elapsed_ms(upper_read_time), to_gib(upper_bytes_read), throughput_gib_per_s(upper_bytes_read, upper_read_time), - elapsed_ms(upper_validate_time), elapsed_ms(upper_copy_time)); hnsw_index->set_index(std::move(appr_algo)); diff --git a/examples/cpp/src/hnsw_ace_layered_example.cu b/examples/cpp/src/hnsw_ace_layered_example.cu index 02f735ef37..f11551df20 100644 --- a/examples/cpp/src/hnsw_ace_layered_example.cu +++ b/examples/cpp/src/hnsw_ace_layered_example.cu @@ -23,16 +23,16 @@ // index_dir/hnsw_index.cuvs // fixed header + metadata JSON // levels: uint8 [N], max HNSW level for each original row id -// base links: hnswlib-ready [count + uint32 neighbors padded to maxM0] +// base nodes + base links: uint32 node ids with hnswlib-ready link rows // upper nodes + upper links: hnswlib-ready upper-layer topology // // The transferred index artifact is topology-only. The dataset is loaded locally during // deserialization from hnsw::index_params::dataset_path. The loader supports .npy and ANN benchmark // *.bin datasets; this example writes a local dataset .npy only to make the demo self-contained. // -// Layer 0 rows and neighbor IDs are original dataset row IDs. Upper layers are generated with the -// same level/order/KNN logic as serialize_to_hnswlib_from_disk, then stored as hnswlib-ready link -// rows so deserialization does no graph remapping or link padding on the search node. +// Layer 0 node IDs and neighbor IDs are original dataset row IDs. Upper layers are generated with +// the same level/order/KNN logic as serialize_to_hnswlib_from_disk, then stored as hnswlib-ready +// link rows so deserialization does no graph remapping or link padding on the search node. #include #include From 4514bc85b1c1743ee05af9eb4b2465967925c390 Mon Sep 17 00:00:00 2001 From: Julian Miller Date: Mon, 1 Jun 2026 10:15:11 +0200 Subject: [PATCH 6/6] Unify the ACE logging format --- .../neighbors/detail/cagra/cagra_build.cuh | 234 +++++++++--------- cpp/src/neighbors/detail/hnsw.hpp | 160 ++++++++---- 2 files changed, 238 insertions(+), 156 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 96ff8344d3..3edbec46af 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -51,6 +51,13 @@ namespace cuvs::neighbors::cagra::detail { constexpr double to_mib(size_t bytes) { return static_cast(bytes) / (1 << 20); } constexpr double to_gib(size_t bytes) { return static_cast(bytes) / (1 << 30); } +inline auto progress_step_10(size_t completed, size_t total) -> size_t +{ + if (total == 0) { return 100; } + const auto percent = (std::min(completed, total) * 100) / total; + return completed >= total ? 100 : (percent / 10) * 10; +} + template void check_graph_degree(size_t& intermediate_degree, size_t& graph_degree, size_t dataset_size) { @@ -99,7 +106,7 @@ void ace_get_partition_labels( const size_t min_samples = 100 * n_partitions; n_samples = std::max(n_samples, min_samples); n_samples = std::min(n_samples, dataset_size); - RAFT_LOG_DEBUG("ACE: n_samples: %lu", n_samples); + RAFT_LOG_DEBUG("ACE build: partition labeling uses %lu sampled vectors", n_samples); auto sample_db = raft::make_host_matrix(n_samples, dataset_dim); #pragma omp parallel for @@ -123,18 +130,10 @@ void ace_get_partition_labels( auto _sub_distances = raft::make_host_matrix(chunk_size, n_partitions); auto _sub_dataset_dev = raft::make_device_matrix(res, chunk_size, dataset_dim); auto _sub_distances_dev = raft::make_device_matrix(res, chunk_size, n_partitions); - size_t report_interval = dataset_size / 10; - report_interval = (report_interval / chunk_size) * chunk_size; - report_interval = std::max(report_interval, chunk_size); + size_t next_progress_percent = 10; for (size_t i_base = 0; i_base < dataset_size; i_base += chunk_size) { const size_t sub_dataset_size = std::min(chunk_size, dataset_size - i_base); - if (i_base % report_interval == 0) { - RAFT_LOG_INFO("ACE: Processing chunk %lu / %lu (%.1f%%)", - i_base, - dataset_size, - static_cast(100 * i_base) / dataset_size); - } auto sub_dataset = raft::make_host_matrix_view( _sub_dataset.data_handle(), sub_dataset_size, dataset_dim); @@ -191,6 +190,16 @@ void ace_get_partition_labels( #pragma omp atomic update partition_histogram(augmented_label, 1) += 1; } + + const auto completed_rows = i_base + sub_dataset_size; + const auto report_percent = progress_step_10(completed_rows, dataset_size); + if (report_percent >= next_progress_percent) { + RAFT_LOG_INFO("ACE build: partition labeling progress %zu%% (%zu/%zu rows)", + report_percent, + completed_rows, + dataset_size); + next_progress_percent = report_percent + 10; + } } } @@ -261,15 +270,15 @@ void ace_check_partition_sizes( if (total_count > 0 && total_count < very_small_threshold) { RAFT_LOG_WARN( - "ACE: Partition %lu is very small (%lu vectors, expected ~%.1f). This may affect graph " - "quality.", + "ACE build: partition %lu is very small (%lu vectors, expected ~%.1f); graph quality may " + "be affected", c, total_count, expected_avg_vectors); } else if (total_count > very_large_threshold) { RAFT_LOG_WARN( - "ACE: Partition %lu is very large (%lu vectors, expected ~%.1f, threshold: %lu). This may " - "indicate imbalance and can lead to memory issues in restricted environments.", + "ACE build: partition %lu is very large (%lu vectors, expected ~%.1f, threshold=%lu); " + "partition imbalance may increase memory pressure", c, total_count, expected_avg_vectors, @@ -482,11 +491,10 @@ void ace_reorder_and_store_dataset( size_t dataset_dim = dataset.extent(1); size_t n_partitions = partition_histogram.extent(0); - RAFT_LOG_DEBUG( - "ACE: Reordering and storing dataset to disk (%lu vectors, %lu dimensions, %lu partitions)", - dataset_size, - dataset_dim, - n_partitions); + RAFT_LOG_DEBUG("ACE build: reordering dataset to disk (rows=%lu dim=%lu partitions=%lu)", + dataset_size, + dataset_dim, + n_partitions); // Calculate total sizes for pre-allocation size_t total_core_vectors = 0; @@ -509,10 +517,10 @@ void ace_reorder_and_store_dataset( size_t reordered_file_size = total_core_vectors * vector_size; size_t augmented_file_size = total_augmented_vectors * vector_size; - RAFT_LOG_DEBUG("ACE: Reordered dataset: %lu core vectors (%.2f GiB)", + RAFT_LOG_DEBUG("ACE build: reordered dataset section rows=%lu size=%.2f GiB", total_core_vectors, reordered_file_size / (1024.0 * 1024.0 * 1024.0)); - RAFT_LOG_DEBUG("ACE: Augmented dataset: %lu secondary vectors (%.2f GiB)", + RAFT_LOG_DEBUG("ACE build: augmented dataset section rows=%lu size=%.2f GiB", total_augmented_vectors, augmented_file_size / (1024.0 * 1024.0 * 1024.0)); @@ -541,7 +549,7 @@ void ace_reorder_and_store_dataset( disk_write_size = std::min(disk_write_size, 64 * 1024 * 1024); size_t vectors_per_buffer = std::max(64, disk_write_size / vector_size); - RAFT_LOG_DEBUG("ACE: Reorder buffers: %lu vectors per buffer (%.2f MiB)", + RAFT_LOG_DEBUG("ACE build: reorder buffer rows=%lu size=%.2f MiB", vectors_per_buffer, to_mib(vectors_per_buffer * vector_size)); @@ -593,8 +601,8 @@ void ace_reorder_and_store_dataset( } }; - size_t vectors_processed = 0; - const size_t log_interval = std::max(dataset_size / 10, size_t(1)); + size_t vectors_processed = 0; + size_t next_progress_percent = 10; for (size_t i = 0; i < dataset_size; i++) { size_t core_partition = partition_labels(i, 0); size_t secondary_partition = partition_labels(i, 1); @@ -623,16 +631,18 @@ void ace_reorder_and_store_dataset( } vectors_processed++; - if (vectors_processed % log_interval == 0) { - RAFT_LOG_INFO("ACE: Processed %lu/%lu vectors (%.1f%%)", + const auto report_percent = progress_step_10(vectors_processed, dataset_size); + if (report_percent >= next_progress_percent) { + RAFT_LOG_INFO("ACE build: dataset reorder progress %zu%% (%zu/%zu rows)", + report_percent, vectors_processed, - dataset_size, - 100.0 * vectors_processed / dataset_size); + dataset_size); + next_progress_percent = report_percent + 10; } } // Flush all remaining buffers - RAFT_LOG_DEBUG("ACE: Flushing remaining buffers..."); + RAFT_LOG_DEBUG("ACE build: flushing remaining reorder buffers"); #pragma omp parallel sections { #pragma omp section @@ -662,12 +672,12 @@ void ace_reorder_and_store_dataset( elapsed_ms > 0 ? to_mib(total_bytes_written) / (elapsed_ms / 1000.0) : 0.0; RAFT_LOG_INFO( - "ACE: Dataset (%.2f GiB reordered, %.2f GiB augmented, %.2f GiB mapping) reordering completed " - "in %ld ms (%.1f MiB/s)", + "ACE build: dataset reorder completed in %ld ms (reordered=%.2f GiB augmented=%.2f GiB " + "mapping=%.2f GiB, %.1f MiB/s)", + elapsed_ms, reordered_file_size / (1024.0 * 1024.0 * 1024.0), augmented_file_size / (1024.0 * 1024.0 * 1024.0), mapping_file_size / (1024.0 * 1024.0 * 1024.0), - elapsed_ms, throughput_mb_s); } @@ -685,17 +695,17 @@ void ace_load_partition_dataset_from_disk( { size_t n_partitions = partition_histogram.extent(0); - RAFT_LOG_DEBUG("ACE: Loading partition %lu dataset from disk", partition_id); + RAFT_LOG_DEBUG("ACE build: loading partition %lu dataset from disk", partition_id); size_t core_size = partition_histogram(partition_id, 0); size_t augmented_size = partition_histogram(partition_id, 1); size_t total_partition_size = core_size + augmented_size; - RAFT_LOG_DEBUG("ACE: Partition %lu: %lu core + %lu augmented = %lu total vectors", + RAFT_LOG_DEBUG("ACE build: partition %lu rows=%lu (core=%lu augmented=%lu)", partition_id, + total_partition_size, core_size, - augmented_size, - total_partition_size); + augmented_size); RAFT_EXPECTS(static_cast(sub_dataset.extent(0)) == total_partition_size, "sub_dataset rows (%lu) must match total partition size (%lu)", @@ -712,10 +722,10 @@ void ace_load_partition_dataset_from_disk( const std::string augmented_dataset_path = build_dir + "/augmented_dataset.npy"; if (!std::filesystem::exists(reordered_dataset_path)) { - RAFT_FAIL("ACE: Required file does not exist: %s", reordered_dataset_path.c_str()); + RAFT_FAIL("ACE build: required file does not exist: %s", reordered_dataset_path.c_str()); } if (!std::filesystem::exists(augmented_dataset_path)) { - RAFT_FAIL("ACE: Required file does not exist: %s", augmented_dataset_path.c_str()); + RAFT_FAIL("ACE build: required file does not exist: %s", augmented_dataset_path.c_str()); } size_t core_header_size = 0; @@ -748,7 +758,7 @@ void ace_load_partition_dataset_from_disk( core_file_offset += core_header_size; augmented_file_offset += augmented_header_size; - RAFT_LOG_DEBUG("ACE: Core file offset: %lu bytes, Augmented file offset: %lu bytes", + RAFT_LOG_DEBUG("ACE build: core file offset=%lu bytes augmented file offset=%lu bytes", core_file_offset, augmented_file_offset); @@ -763,7 +773,7 @@ void ace_load_partition_dataset_from_disk( try { if (core_size > 0) { RAFT_LOG_DEBUG( - "ACE: Reading %lu core vectors from offset %lu", core_size, core_file_offset); + "ACE build: reading %lu core vectors from offset %lu", core_size, core_file_offset); cuvs::util::file_descriptor reordered_fd(reordered_dataset_path, O_RDONLY); const size_t core_bytes = core_size * vector_size; cuvs::util::read_large_file( @@ -777,7 +787,7 @@ void ace_load_partition_dataset_from_disk( { try { if (augmented_size > 0) { - RAFT_LOG_DEBUG("ACE: Reading %lu augmented vectors from offset %lu", + RAFT_LOG_DEBUG("ACE build: reading %lu augmented vectors from offset %lu", augmented_size, augmented_file_offset); cuvs::util::file_descriptor augmented_fd(augmented_dataset_path, O_RDONLY); @@ -879,7 +889,8 @@ bool ace_check_use_disk_mode(bool use_disk, // Use overridden memory limits if provided (> 0), otherwise query actual system memory if (max_host_memory_gb.has_value() && max_host_memory_gb.value() > 0) { mem.available_host_memory = static_cast(max_host_memory_gb.value() * (1ULL << 30)); - RAFT_LOG_INFO("ACE: Using overridden host memory limit: %.2f GiB", max_host_memory_gb.value()); + RAFT_LOG_INFO("ACE build: using overridden host memory limit %.2f GiB", + max_host_memory_gb.value()); } else { mem.available_host_memory = cuvs::util::get_free_host_memory(); } @@ -903,7 +914,7 @@ bool ace_check_use_disk_mode(bool use_disk, mem.total_size = mem.partition_labels_size + mem.id_mapping_size + mem.sub_dataset_size + mem.sub_graph_size + mem.cagra_graph_size; - RAFT_LOG_INFO("ACE: Estimated host memory required: %.2f GiB, available: %.2f GiB", + RAFT_LOG_INFO("ACE build: host memory estimate required=%.2f GiB available=%.2f GiB", to_gib(mem.total_size), to_gib(mem.available_host_memory)); @@ -915,7 +926,8 @@ bool ace_check_use_disk_mode(bool use_disk, // TODO: Extend model or use managed memory if running out of GPU memory. if (max_gpu_memory_gb.has_value() && max_gpu_memory_gb.value() > 0) { mem.available_gpu_memory = static_cast(max_gpu_memory_gb.value() * (1ULL << 30)); - RAFT_LOG_INFO("ACE: Using overridden GPU memory limit: %.2f GiB", max_gpu_memory_gb.value()); + RAFT_LOG_INFO("ACE build: using overridden GPU memory limit %.2f GiB", + max_gpu_memory_gb.value()); } else { mem.available_gpu_memory = rmm::available_device_memory().second; } @@ -923,7 +935,7 @@ bool ace_check_use_disk_mode(bool use_disk, static_cast(usable_gpu_memory_fraction * mem.available_gpu_memory) < std::max(mem.sub_graph_size, mem.sub_dataset_size); - RAFT_LOG_INFO("ACE: Estimated GPU memory required: %.2f GiB, available: %.2f GiB", + RAFT_LOG_INFO("ACE build: GPU memory estimate required=%.2f GiB available=%.2f GiB", to_gib(mem.cagra_graph_size), to_gib(mem.available_gpu_memory)); @@ -934,7 +946,7 @@ bool ace_check_use_disk_mode(bool use_disk, valid_build_dir &= build_dir.find('\0') == std::string::npos; valid_build_dir &= build_dir.find("//") == std::string::npos; if (!valid_build_dir) { - RAFT_LOG_WARN("ACE: Invalid build_dir path, resetting to default: /tmp/ace_build"); + RAFT_LOG_WARN("ACE build: invalid build_dir path, resetting to /tmp/ace_build"); build_dir = "/tmp/ace_build"; } if (mkdir(build_dir.c_str(), 0755) != 0 && errno != EEXIST) { @@ -943,24 +955,19 @@ bool ace_check_use_disk_mode(bool use_disk, } if (host_memory_limited && gpu_memory_limited) { - RAFT_LOG_INFO( - "ACE: Graph does not fit in host and GPU memory. Using disk-mode with temporary storage %s", - build_dir.c_str()); + RAFT_LOG_INFO("ACE build: graph does not fit in host or GPU memory; using disk mode at %s", + build_dir.c_str()); } else if (host_memory_limited) { - RAFT_LOG_INFO( - "ACE: Graph does not fit in host memory. Using disk-mode with temporary storage %s", - build_dir.c_str()); + RAFT_LOG_INFO("ACE build: graph does not fit in host memory; using disk mode at %s", + build_dir.c_str()); } else if (gpu_memory_limited) { - RAFT_LOG_INFO( - "ACE: Graph does not fit in GPU memory. Using disk-mode with temporary storage %s", - build_dir.c_str()); + RAFT_LOG_INFO("ACE build: graph does not fit in GPU memory; using disk mode at %s", + build_dir.c_str()); } else if (use_disk) { - RAFT_LOG_INFO( - "ACE: Graph fits in host and GPU memory but disk mode is forced. Using disk-mode with " - "temporary storage %s", - build_dir.c_str()); + RAFT_LOG_INFO("ACE build: graph fits in host and GPU memory but disk mode is forced; using %s", + build_dir.c_str()); } else { - RAFT_LOG_INFO("ACE: Graph fits in host and GPU memory. Using in-memory mode."); + RAFT_LOG_INFO("ACE build: graph fits in host and GPU memory; using in-memory mode"); } return use_disk_mode; @@ -1005,8 +1012,8 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions, disk_mode_host_required) { host_memory_insufficient = true; RAFT_LOG_WARN( - "ACE: Host memory insufficient for disk mode. Required: %.2f GiB, available: %.2f GiB. " - "Per-partition breakdown: dataset %.2f GiB, graph %.2f GiB, workspace %.2f GiB", + "ACE build: host memory is insufficient for disk mode: required=%.2f GiB available=%.2f " + "GiB (partition dataset=%.2f GiB graph=%.2f GiB workspace=%.2f GiB)", to_gib(disk_mode_host_required), to_gib(mem.available_host_memory), to_gib(mem.sub_dataset_size), @@ -1017,8 +1024,8 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions, double available_for_scaling = usable_cpu_memory_fraction * mem.available_host_memory - mem.partition_labels_size - mem.id_mapping_size; RAFT_EXPECTS(available_for_scaling > 0, - "ACE: Host memory insufficient even for constant overhead (labels + id_mapping). " - "Required: %.2f GiB, available: %.2f GiB", + "ACE build: host memory is insufficient even for constant overhead " + "(labels + id_mapping): required=%.2f GiB available=%.2f GiB", to_gib(mem.partition_labels_size + mem.id_mapping_size), to_gib(usable_cpu_memory_fraction * mem.available_host_memory)); host_suggested_partitions = static_cast( @@ -1040,9 +1047,8 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions, disk_mode_gpu_required) { gpu_memory_insufficient = true; RAFT_LOG_WARN( - "ACE: GPU memory insufficient for per-partition processing. Required: %.2f GiB, " - "available: %.2f GiB. Per-partition breakdown: dataset %.2f GiB, graph %.2f GiB, " - "workspace %.2f GiB", + "ACE build: GPU memory is insufficient for partition processing: required=%.2f GiB " + "available=%.2f GiB (partition dataset=%.2f GiB graph=%.2f GiB workspace=%.2f GiB)", to_gib(disk_mode_gpu_required), to_gib(mem.available_gpu_memory), to_gib(mem.sub_dataset_size), @@ -1060,8 +1066,7 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions, size_t new_n_partitions = std::max(host_suggested_partitions, gpu_suggested_partitions); RAFT_LOG_WARN( - "ACE: Automatically increasing number of partitions from %zu to %zu to satisfy memory " - "constraints.%s%s", + "ACE build: increasing partitions from %zu to %zu to satisfy memory constraints.%s%s", original_n_partitions, new_n_partitions, host_memory_insufficient @@ -1088,8 +1093,8 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions, guarantee_connectivity); RAFT_LOG_INFO( - "ACE: Updated per-partition memory estimates: dataset %.2f GiB, graph %.2f GiB, " - "host workspace %.2f GiB, GPU workspace %.2f GiB", + "ACE build: updated partition memory estimates dataset=%.2f GiB graph=%.2f GiB host " + "workspace=%.2f GiB GPU workspace=%.2f GiB", to_gib(mem.sub_dataset_size), to_gib(mem.sub_graph_size), to_gib(new_opt_host_ws), @@ -1131,15 +1136,15 @@ index build_ace(raft::resources const& res, size_t dataset_size = dataset.extent(0); size_t dataset_dim = dataset.extent(1); - RAFT_EXPECTS(dataset_size > 0, "ACE: Dataset must not be empty"); + RAFT_EXPECTS(dataset_size > 0, "ACE build: dataset must not be empty"); if (dataset_size < 1000) { - RAFT_LOG_WARN("ACE: Very small dataset size (%zu), consider using regular CAGRA build instead.", + RAFT_LOG_WARN("ACE build: very small dataset size (%zu); regular CAGRA may be simpler", dataset_size); } - RAFT_EXPECTS(dataset_dim > 0, "ACE: Dataset dimension must be greater than 0"); + RAFT_EXPECTS(dataset_dim > 0, "ACE build: dataset dimension must be greater than 0"); RAFT_EXPECTS(params.intermediate_graph_degree > 0, - "ACE: Intermediate graph degree must be greater than 0"); - RAFT_EXPECTS(params.graph_degree > 0, "ACE: Graph degree must be greater than 0"); + "ACE build: intermediate graph degree must be greater than 0"); + RAFT_EXPECTS(params.graph_degree > 0, "ACE build: graph degree must be greater than 0"); size_t n_partitions = npartitions; if (n_partitions == 0) { @@ -1152,17 +1157,22 @@ index build_ace(raft::resources const& res, n_partitions = dataset_size / min_required_per_partition; if (n_partitions < 2) { RAFT_LOG_WARN( - "ACE: Reduced number of partitions to the minimum of 2 to avoid tiny partitions. Consider " - "using regular CAGRA build instead."); + "ACE build: reduced partitions to the minimum of 2 to avoid tiny partitions; regular " + "CAGRA may be simpler"); n_partitions = 2; } else { - RAFT_LOG_WARN("ACE: Reduced number of partitions to %zu to avoid tiny partitions", - n_partitions); + RAFT_LOG_WARN("ACE build: reduced partitions to %zu to avoid tiny partitions", n_partitions); } } auto total_start = std::chrono::high_resolution_clock::now(); - RAFT_LOG_INFO("ACE: Starting partitioned CAGRA build with %zu partitions", n_partitions); + RAFT_LOG_INFO( + "ACE build: start rows=%zu dim=%zu partitions=%zu graph_degree=%zu intermediate_degree=%zu", + dataset_size, + dataset_dim, + n_partitions, + static_cast(params.graph_degree), + static_cast(params.intermediate_graph_degree)); size_t intermediate_degree = params.intermediate_graph_degree; size_t graph_degree = params.graph_degree; @@ -1228,7 +1238,7 @@ index build_ace(raft::resources const& res, build_dir + "/cagra_graph.npy", {dataset_size, graph_degree}); RAFT_LOG_DEBUG( - "ACE: Wrote numpy headers (reordered: %zu, augmented: %zu, mapping: %zu, graph: %zu bytes)", + "ACE build: wrote numpy headers (reordered=%zu augmented=%zu mapping=%zu graph=%zu bytes)", reordered_header_size, augmented_header_size, mapping_header_size, @@ -1259,11 +1269,9 @@ index build_ace(raft::resources const& res, auto partition_elapsed = std::chrono::duration_cast(partition_end - partition_start) .count(); - RAFT_LOG_INFO( - "ACE: Partition labeling completed in %ld ms (min_partition_size: " - "%lu)", - partition_elapsed, - min_partition_size); + RAFT_LOG_INFO("ACE build: partition labeling completed in %ld ms (min_partition_size=%lu)", + partition_elapsed, + min_partition_size); // Create vector lists for each partition auto vectorlist_start = std::chrono::high_resolution_clock::now(); @@ -1288,7 +1296,7 @@ index build_ace(raft::resources const& res, auto vectorlist_elapsed = std::chrono::duration_cast(vectorlist_end - vectorlist_start) .count(); - RAFT_LOG_INFO("ACE: Vector list creation completed in %ld ms", vectorlist_elapsed); + RAFT_LOG_INFO("ACE build: partition mapping completed in %ld ms", vectorlist_elapsed); // Reorder the dataset based on partitions and store to disk. Uses write buffers to improve // performance. @@ -1319,7 +1327,7 @@ index build_ace(raft::resources const& res, // Process each partition auto partition_processing_start = std::chrono::high_resolution_clock::now(); for (size_t partition_id = 0; partition_id < n_partitions; partition_id++) { - RAFT_LOG_DEBUG("ACE: Processing partition %lu/%lu", partition_id + 1, n_partitions); + RAFT_LOG_DEBUG("ACE build: processing partition %lu/%lu", partition_id + 1, n_partitions); auto start = std::chrono::high_resolution_clock::now(); // Extract vectors for this partition @@ -1328,10 +1336,11 @@ index build_ace(raft::resources const& res, size_t sub_dataset_size = core_sub_dataset_size + augmented_sub_dataset_size; if (sub_dataset_size == 0) { - RAFT_LOG_WARN("ACE: Skipping empty partition %lu", partition_id); + RAFT_LOG_WARN("ACE build: Skipping empty partition %lu", partition_id); continue; } - RAFT_LOG_DEBUG("ACE: Sub-dataset size: %lu (%lu + %lu)", + RAFT_LOG_DEBUG("ACE build: partition %lu rows=%lu (core=%lu augmented=%lu)", + partition_id, sub_dataset_size, core_sub_dataset_size, augmented_sub_dataset_size); @@ -1444,12 +1453,12 @@ index build_ace(raft::resources const& res, ? to_mib(core_sub_dataset_size * dataset_dim * sizeof(T)) / (write_elapsed / 1000.0) : 0.0; RAFT_LOG_INFO( - "ACE: Partition %4lu (%8lu + %8lu) completed in %6ld ms: read %6ld ms (%7.1f MiB/s), " - "optimize %6ld ms, adjust %6ld ms, write %6ld ms (%7.1f MiB/s)", + "ACE build: partition %4lu completed in %6ld ms (core=%8lu augmented=%8lu read=%6ld ms " + "%.1f MiB/s optimize=%6ld ms adjust=%6ld ms write=%6ld ms %.1f MiB/s)", partition_id, + elapsed_ms, core_sub_dataset_size, augmented_sub_dataset_size, - elapsed_ms, read_elapsed, read_throughput, optimize_elapsed, @@ -1462,7 +1471,7 @@ index build_ace(raft::resources const& res, auto partition_processing_elapsed = std::chrono::duration_cast( partition_processing_end - partition_processing_start) .count(); - RAFT_LOG_INFO("ACE: All partition processing completed in %ld ms (%zu partitions)", + RAFT_LOG_INFO("ACE build: partition graph build completed in %ld ms (partitions=%zu)", partition_processing_elapsed, n_partitions); @@ -1472,7 +1481,7 @@ index build_ace(raft::resources const& res, const std::string augmented_dataset_path = build_dir + "/augmented_dataset.npy"; if (std::filesystem::exists(augmented_dataset_path)) { std::filesystem::remove(augmented_dataset_path); - RAFT_LOG_INFO("ACE: Removed augmented dataset file to save disk space"); + RAFT_LOG_INFO("ACE build: removed temporary augmented dataset"); } } @@ -1488,12 +1497,12 @@ index build_ace(raft::resources const& res, idx.update_dataset(res, dataset); } catch (std::bad_alloc& e) { RAFT_LOG_WARN( - "Insufficient GPU memory to attach dataset to ACE index. Only the graph will be " - "stored."); + "ACE build: insufficient GPU memory to attach dataset to index; only the graph will " + "be stored"); } catch (raft::logic_error& e) { RAFT_LOG_WARN( - "Insufficient GPU memory to attach dataset to ACE index. Only the graph will be " - "stored."); + "ACE build: insufficient GPU memory to attach dataset to index; only the graph will " + "be stored"); } } } else { @@ -1501,37 +1510,36 @@ index build_ace(raft::resources const& res, idx.update_graph(res, std::move(graph_fd)); idx.update_mapping(res, std::move(mapping_fd)); - RAFT_LOG_INFO( - "ACE: Set disk storage at %s (dataset shape [%zu, %zu], graph shape [%zu, %zu])", - build_dir.c_str(), - idx.size(), - idx.dim(), - idx.size(), - idx.graph_degree()); + RAFT_LOG_INFO("ACE build: disk artifacts ready at %s (dataset=[%zu,%zu] graph=[%zu,%zu])", + build_dir.c_str(), + idx.size(), + idx.dim(), + idx.size(), + idx.graph_degree()); } auto index_creation_end = std::chrono::high_resolution_clock::now(); auto index_creation_elapsed = std::chrono::duration_cast( index_creation_end - index_creation_start) .count(); - RAFT_LOG_INFO("ACE: Final index creation completed in %ld ms", index_creation_elapsed); + RAFT_LOG_INFO("ACE build: final index initialized in %ld ms", index_creation_elapsed); auto total_end = std::chrono::high_resolution_clock::now(); auto total_elapsed = std::chrono::duration_cast(total_end - total_start).count(); - RAFT_LOG_INFO("ACE: Partitioned CAGRA build completed in %ld ms total", total_elapsed); + RAFT_LOG_INFO("ACE build: completed in %ld ms", total_elapsed); return idx; } catch (const std::exception& e) { // Clean up build directory on failure if we created it - RAFT_LOG_ERROR("ACE: Build failed with exception: %s", e.what()); + RAFT_LOG_ERROR("ACE build: failed with exception: %s", e.what()); if (cleanup_on_failure && !build_dir.empty()) { - RAFT_LOG_INFO("ACE: Cleaning up build directory: %s", build_dir.c_str()); + RAFT_LOG_INFO("ACE build: cleaning up build directory %s", build_dir.c_str()); try { std::filesystem::remove_all(build_dir); - RAFT_LOG_INFO("ACE: Successfully removed build directory"); + RAFT_LOG_INFO("ACE build: removed build directory"); } catch (const std::exception& cleanup_error) { - RAFT_LOG_WARN("ACE: Failed to clean up build directory: %s", cleanup_error.what()); + RAFT_LOG_WARN("ACE build: failed to clean up build directory: %s", cleanup_error.what()); } } // Re-throw the original exception diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp index d090de07f8..5be875d804 100644 --- a/cpp/src/neighbors/detail/hnsw.hpp +++ b/cpp/src/neighbors/detail/hnsw.hpp @@ -411,6 +411,13 @@ inline auto throughput_gib_per_s(size_t bytes, std::chrono::steady_clock::durati return elapsed_s > 0.0 ? to_gib(bytes) / elapsed_s : 0.0; } +inline auto progress_step_10(size_t completed, size_t total) -> size_t +{ + if (total == 0) { return 100; } + const auto percent = (std::min(completed, total) * 100) / total; + return completed >= total ? 100 : (percent / 10) * 10; +} + inline auto make_hnsw_level_plan_from_levels(size_t n_rows, std::vector&& levels, bool build_reverse_order, @@ -486,7 +493,7 @@ void build_hnsw_upper_layer_graphs( const auto row_count = plan.n_rows - start_idx; const auto neighbor_size = hnsw_upper_layer_degree(row_count, M); - RAFT_LOG_INFO("Compute hierarchy neighbors level %zu", pt_level); + RAFT_LOG_INFO("Layered HNSW artifact: computing upper-layer neighbors level=%zu", pt_level); auto host_neighbors = raft::make_host_matrix( static_cast(row_count), static_cast(neighbor_size)); @@ -814,7 +821,7 @@ void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index= sizeof(hnswlib::linklistsizeint) + maxM0 * sizeof(IdxT), "Base link row size is too small"); - RAFT_LOG_INFO("HNSW remap: loading ACE reordered-to-original mapping (%zu rows)", n_rows); + RAFT_LOG_INFO("Layered HNSW artifact: loading ACE row mapping (%zu rows)", n_rows); const auto mapping_start_time = std::chrono::steady_clock::now(); const auto mapping_bytes = n_rows * sizeof(IdxT); std::vector reordered_to_original(n_rows); @@ -828,11 +835,18 @@ void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index= next_report_percent) { + const auto elapsed = elapsed_ms_since(total_start_time); + RAFT_LOG_INFO( + "Layered HNSW artifact: base topology progress %zu%% (%zu/%zu rows, read=%.2f GiB " + "write=%.2f GiB, %ld ms)", + report_percent, + completed_rows, + n_rows, + to_gib(graph_bytes_read), + to_gib(node_bytes_written + link_bytes_written), + elapsed); + next_report_percent = report_percent + 10; + } } const auto total_elapsed_ms = elapsed_ms_since(total_start_time); RAFT_LOG_INFO( - "HNSW remap: base topology written in %ld ms (graph_read=%.2f GiB %.2f GiB/s, " + "Layered HNSW artifact: base topology written in %ld ms (graph_read=%.2f GiB %.2f GiB/s, " "artifact_write=%.2f GiB %.2f GiB/s, nodes=%.2f GiB links=%.2f GiB)", total_elapsed_ms, to_gib(graph_bytes_read), @@ -945,14 +983,15 @@ auto serialize_to_layered_hnsw_from_disk( auto appr_algo = std::make_unique::type>>( hnsw_index->get_space(), 1, (graph_degree_int + 1) / 2, params.ef_construction); - RAFT_LOG_INFO("Layered HNSW: generating hierarchy levels"); + RAFT_LOG_INFO("Layered HNSW artifact: generating hierarchy levels"); const auto hierarchy_start_time = std::chrono::steady_clock::now(); auto hierarchy = make_random_hnsw_level_plan(n_rows, *appr_algo, nullptr); const auto hierarchy_elapsed_ms = elapsed_ms_since(hierarchy_start_time); - RAFT_LOG_INFO("Layered HNSW: hierarchy levels generated in %ld ms (max_level=%d, promoted=%zu)", - hierarchy_elapsed_ms, - hierarchy.max_level(), - hierarchy.promoted_count()); + RAFT_LOG_INFO( + "Layered HNSW artifact: hierarchy levels generated in %ld ms (max_level=%d promoted=%zu)", + hierarchy_elapsed_ms, + hierarchy.max_level(), + hierarchy.promoted_count()); layered_hnsw_file_metadata metadata; metadata.n_rows = n_rows; @@ -1015,12 +1054,12 @@ auto serialize_to_layered_hnsw_from_disk( cuvs::util::write_large_file( artifact_fd, hierarchy.levels.data(), metadata.levels_bytes, levels_offset); const auto levels_elapsed_ms = elapsed_ms_since(levels_start_time); - RAFT_LOG_INFO("Layered HNSW: wrote levels section in %ld ms (%.2f GiB, %.2f GiB/s)", + RAFT_LOG_INFO("Layered HNSW artifact: levels section written in %ld ms (%.2f GiB, %.2f GiB/s)", levels_elapsed_ms, to_gib(metadata.levels_bytes), throughput_gib_per_s(metadata.levels_bytes, levels_elapsed_ms)); - RAFT_LOG_INFO("Layered HNSW: writing hnswlib-ready base links"); + RAFT_LOG_INFO("Layered HNSW artifact: writing hnswlib-ready base topology section"); const auto layer0_start_time = std::chrono::steady_clock::now(); write_layered_base_links_from_disk(index_, artifact_fd, @@ -1029,14 +1068,16 @@ auto serialize_to_layered_hnsw_from_disk( metadata.base_link_row_bytes, metadata.maxM0); const auto layer0_elapsed_ms = elapsed_ms_since(layer0_start_time); - RAFT_LOG_INFO("Layered HNSW: base links written in %ld ms (%.2f GiB, %.2f GiB/s effective)", - layer0_elapsed_ms, - to_gib(metadata.base_links_bytes), - throughput_gib_per_s(metadata.base_links_bytes, layer0_elapsed_ms)); + static_cast(layer0_elapsed_ms); + RAFT_LOG_INFO( + "Layered HNSW artifact: base topology section written in %ld ms (%.2f GiB, %.2f GiB/s)", + layer0_elapsed_ms, + to_gib(metadata.base_nodes_bytes + metadata.base_links_bytes), + throughput_gib_per_s(metadata.base_nodes_bytes + metadata.base_links_bytes, layer0_elapsed_ms)); size_t upper_graph_bytes_written = 0; if (hierarchy.hist.size() > 1) { - RAFT_LOG_INFO("Layered HNSW: gathering promoted vectors"); + RAFT_LOG_INFO("Layered HNSW artifact: gathering promoted vectors"); const auto gather_start_time = std::chrono::steady_clock::now(); auto host_query_set = raft::make_host_matrix(static_cast(hierarchy.promoted_count()), dim); @@ -1050,10 +1091,11 @@ auto serialize_to_layered_hnsw_from_disk( } const auto gather_elapsed_ms = elapsed_ms_since(gather_start_time); const auto gathered_bytes = hierarchy.promoted_count() * dim * sizeof(T); - RAFT_LOG_INFO("Layered HNSW: gathered promoted vectors in %ld ms (%.2f GiB copied, %.2f GiB/s)", - gather_elapsed_ms, - to_gib(gathered_bytes), - throughput_gib_per_s(gathered_bytes, gather_elapsed_ms)); + RAFT_LOG_INFO( + "Layered HNSW artifact: promoted vectors gathered in %ld ms (%.2f GiB copied, %.2f GiB/s)", + gather_elapsed_ms, + to_gib(gathered_bytes), + throughput_gib_per_s(gathered_bytes, gather_elapsed_ms)); auto promoted_dataset = raft::make_host_matrix_view( host_query_set.data_handle(), host_query_set.extent(0), host_query_set.extent(1)); @@ -1066,10 +1108,10 @@ auto serialize_to_layered_hnsw_from_disk( index_.metric(), [&](size_t pt_level, size_t start_idx, auto& host_neighbors) { const auto& layer = metadata.layers[pt_level - 1]; - RAFT_LOG_INFO("Layered HNSW: writing upper layer %zu (%zu rows, degree %zu)", - layer.level, - layer.row_count, - layer.degree); + RAFT_LOG_DEBUG("Layered HNSW artifact: writing upper layer level=%zu rows=%zu degree=%zu", + layer.level, + layer.row_count, + layer.degree); const auto layer_write_start_time = std::chrono::steady_clock::now(); const size_t target_batch_bytes = 64 * 1024 * 1024; const size_t row_bytes = sizeof(IdxT) + metadata.upper_link_row_bytes; @@ -1112,28 +1154,29 @@ auto serialize_to_layered_hnsw_from_disk( const auto layer_bytes = layer.row_count * (sizeof(IdxT) + metadata.upper_link_row_bytes); upper_graph_bytes_written += layer_bytes; const auto layer_write_elapsed_ms = elapsed_ms_since(layer_write_start_time); - RAFT_LOG_INFO("Layered HNSW: upper layer %zu written in %ld ms (%.2f GiB, %.2f GiB/s)", - layer.level, - layer_write_elapsed_ms, - to_gib(layer_bytes), - throughput_gib_per_s(layer_bytes, layer_write_elapsed_ms)); + static_cast(layer_write_elapsed_ms); + RAFT_LOG_INFO( + "Layered HNSW artifact: upper layer level=%zu written in %ld ms (%.2f GiB, %.2f GiB/s)", + layer.level, + layer_write_elapsed_ms, + to_gib(layer_bytes), + throughput_gib_per_s(layer_bytes, layer_write_elapsed_ms)); }); const auto upper_layers_elapsed_ms = elapsed_ms_since(upper_layers_start_time); RAFT_LOG_INFO( - "Layered HNSW: upper layers generated and written in %ld ms (%.2f GiB written, %.2f " - "GiB/s effective)", + "Layered HNSW artifact: upper layers generated and written in %ld ms (%.2f GiB, %.2f " + "GiB/s)", upper_layers_elapsed_ms, to_gib(upper_graph_bytes_written), throughput_gib_per_s(upper_graph_bytes_written, upper_layers_elapsed_ms)); } const auto total_elapsed_ms = elapsed_ms_since(total_start_time); - RAFT_LOG_INFO( - "Layered HNSW index written to: %s in %ld ms (artifact %.2f GiB, %.2f GiB/s effective)", - artifact_file.string().c_str(), - total_elapsed_ms, - to_gib(final_file_size), - throughput_gib_per_s(final_file_size, total_elapsed_ms)); + RAFT_LOG_INFO("Layered HNSW artifact: wrote %s in %ld ms (artifact=%.2f GiB, %.2f GiB/s)", + artifact_file.string().c_str(), + total_elapsed_ms, + to_gib(final_file_size), + throughput_gib_per_s(final_file_size, total_elapsed_ms)); return artifact_file.string(); } @@ -2096,7 +2139,13 @@ auto deserialize_layered_hnsw(raft::resources const& res, RAFT_EXPECTS(dataset_file.shape.size() == 2 && dataset_file.shape[0] == metadata.n_rows && dataset_file.shape[1] == metadata.dim, - "Layered HNSW dataset shape mismatch"); + "Layered HNSW dataset shape mismatch: artifact rows=%zu dim=%zu, dataset rows=%zu " + "dim=%zu path=%s", + metadata.n_rows, + metadata.dim, + dataset_file.shape.size() > 0 ? dataset_file.shape[0] : 0, + dataset_file.shape.size() > 1 ? dataset_file.shape[1] : 0, + params.dataset_path.c_str()); RAFT_EXPECTS(metadata.levels_bytes == metadata.n_rows * sizeof(uint8_t), "Layered HNSW levels section size mismatch"); RAFT_EXPECTS(metadata.base_nodes_bytes == metadata.n_rows * sizeof(uint32_t), @@ -2117,9 +2166,32 @@ auto deserialize_layered_hnsw(raft::resources const& res, dataset_open_elapsed_ms, params.dataset_path.c_str()); + const auto dataset_total_bytes = metadata.n_rows * metadata.dim * sizeof(T); + const auto deserialize_progress_total_bytes = + metadata.levels_bytes + dataset_total_bytes + metadata.base_nodes_bytes + + metadata.base_links_bytes + metadata.upper_nodes_bytes + metadata.upper_links_bytes; + size_t deserialize_progress_bytes = 0; + size_t deserialize_next_progress_percent = 10; + auto log_deserialize_progress = [&](size_t bytes_loaded) { + if (deserialize_progress_total_bytes == 0) { return; } + deserialize_progress_bytes = + std::min(deserialize_progress_total_bytes, deserialize_progress_bytes + bytes_loaded); + const auto progress_percent = + (deserialize_progress_bytes * 100) / deserialize_progress_total_bytes; + const auto progress_step = std::min(100, (progress_percent / 10) * 10); + if (progress_step >= deserialize_next_progress_percent) { + RAFT_LOG_INFO("Layered HNSW load: deserialize progress %zu%% (%.2f/%.2f GiB)", + progress_step, + to_gib(deserialize_progress_bytes), + to_gib(deserialize_progress_total_bytes)); + deserialize_next_progress_percent = progress_step + 10; + } + }; + const auto levels_start_time = std::chrono::steady_clock::now(); std::vector levels_u8(metadata.n_rows); cuvs::util::read_large_file(artifact_fd, levels_u8.data(), metadata.levels_bytes, levels_offset); + log_deserialize_progress(metadata.levels_bytes); const auto max_level_in_levels = *std::max_element(levels_u8.begin(), levels_u8.end()); RAFT_EXPECTS(static_cast(max_level_in_levels) == metadata.maxlevel, "Layered HNSW levels max level (%d) does not match artifact maxlevel (%d)", @@ -2203,6 +2275,7 @@ auto deserialize_layered_hnsw(raft::resources const& res, throw std::runtime_error("Not enough memory to allocate HNSW upper linklists"); } base_copy_time += std::chrono::steady_clock::now() - batch_timer; + log_deserialize_progress(current_dataset_bytes); } const auto base_elapsed_ms = elapsed_ms_since(base_start_time); RAFT_LOG_INFO( @@ -2257,6 +2330,7 @@ auto deserialize_layered_hnsw(raft::resources const& res, } RAFT_EXPECTS(!invalid_node_id, "Invalid base-layer node id in layered HNSW artifact"); base_topology_copy_time += std::chrono::steady_clock::now() - batch_timer; + log_deserialize_progress(current_base_topology_bytes); } const auto base_topology_elapsed_ms = elapsed_ms_since(base_topology_start_time); RAFT_LOG_INFO( @@ -2320,6 +2394,7 @@ auto deserialize_layered_hnsw(raft::resources const& res, RAFT_EXPECTS(!invalid_node_level, "Layered HNSW artifact references a node at an invalid upper level"); upper_copy_time += std::chrono::steady_clock::now() - batch_timer; + log_deserialize_progress(current_upper_bytes); } } const auto upper_elapsed_ms = elapsed_ms_since(upper_start_time); @@ -2412,15 +2487,14 @@ std::unique_ptr> build(raft::resources const& res, cagra_ace_params.max_gpu_memory_gb = ace_params.max_gpu_memory_gb; cagra_params.graph_build_params = cagra_ace_params; - RAFT_LOG_INFO( - "hnsw::build - Building HNSW index using ACE with %zu partitions, ef_construction=%zu", - ace_params.npartitions, - ace_params.ef_construction); + RAFT_LOG_INFO("HNSW ACE build: building CAGRA graph (partitions=%zu ef_construction=%zu)", + cagra_ace_params.npartitions, + cagra_ace_params.ef_construction); // Build CAGRA index using ACE auto cagra_index = cuvs::neighbors::cagra::build(res, cagra_params, dataset); - RAFT_LOG_INFO("hnsw::build - Converting CAGRA index to HNSW format"); + RAFT_LOG_INFO("HNSW ACE build: converting CAGRA graph to HNSW hierarchy"); // Convert CAGRA index to HNSW index return from_cagra(res, params, cagra_index, dataset);