diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h index 57b47d97db..6ef2ae25c2 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h @@ -341,24 +341,52 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index params.guarantee_connectivity = conf.at("guarantee_connectivity"); } - // Override the graph_build_algo if requested explicitly + if (conf.contains("variable_graph_degree_fraction")) { + params.variable_graph_degree_fraction = conf.at("variable_graph_degree_fraction"); + } + + // Extract build-algo-specific parameters + nlohmann::json ivf_pq_build_conf = collect_conf_with_prefix(conf, "ivf_pq_build_"); + nlohmann::json ivf_pq_search_conf = collect_conf_with_prefix(conf, "ivf_pq_search_"); + nlohmann::json nn_descent_conf = collect_conf_with_prefix(conf, "nn_descent_"); + nlohmann::json ace_conf = collect_conf_with_prefix(conf, "ace_"); + + // Determine and initialize graph build algorithm. + // Priority 1: explicit "graph_build_algo" config key. + // Priority 2: infer from algorithm-specific prefixed config keys (only when monostate). + // Priority 3: leave as-is (from prior heuristics or monostate for AUTO at build time). + std::string graph_build_algo; if (conf.contains("graph_build_algo")) { - if (conf.at("graph_build_algo") == "IVF_PQ") { + graph_build_algo = conf.at("graph_build_algo"); + } else if (std::holds_alternative(params.graph_build_params)) { + if (!ivf_pq_build_conf.empty() || !ivf_pq_search_conf.empty()) { + graph_build_algo = "IVF_PQ"; + } else if (!nn_descent_conf.empty()) { + graph_build_algo = "NN_DESCENT"; + } else if (!ace_conf.empty()) { + graph_build_algo = "ACE"; + } + // else: leave as monostate → AUTO in cagra_build.cuh + } + + if (!graph_build_algo.empty()) { + if (graph_build_algo == "IVF_PQ") { if (!std::holds_alternative( params.graph_build_params)) { params.graph_build_params = cuvs::neighbors::graph_build_params::ivf_pq_params{}; } - } else if (conf.at("graph_build_algo") == "NN_DESCENT") { + } else if (graph_build_algo == "NN_DESCENT") { if (!std::holds_alternative( params.graph_build_params)) { - params.graph_build_params = cuvs::neighbors::graph_build_params::nn_descent_params{}; + params.graph_build_params = cuvs::neighbors::graph_build_params::nn_descent_params( + params.intermediate_graph_degree, params.metric); } - } else if (conf.at("graph_build_algo") == "ACE") { + } else if (graph_build_algo == "ACE") { if (!std::holds_alternative( params.graph_build_params)) { params.graph_build_params = cuvs::neighbors::graph_build_params::ace_params{}; } - } else if (conf.at("graph_build_algo") == "ITERATIVE_SEARCH") { + } else if (graph_build_algo == "ITERATIVE_SEARCH") { if (!std::holds_alternative( params.graph_build_params)) { params.graph_build_params = cuvs::neighbors::graph_build_params::iterative_search_params{}; @@ -366,26 +394,6 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index } } - // Parse build-algo-specific parameters and use them to decide on the algo type - nlohmann::json ivf_pq_build_conf = collect_conf_with_prefix(conf, "ivf_pq_build_"); - nlohmann::json ivf_pq_search_conf = collect_conf_with_prefix(conf, "ivf_pq_search_"); - nlohmann::json nn_descent_conf = collect_conf_with_prefix(conf, "nn_descent_"); - nlohmann::json ace_conf = collect_conf_with_prefix(conf, "ace_"); - - // When graph_build_algo is not specified, leave graph_build_params as monostate so the - // CAGRA build uses AUTO selection (NN_DESCENT or IVF_PQ based on dataset/heuristics). - // Only infer from algo-specific config keys when present. - if (std::holds_alternative(params.graph_build_params)) { - if (!ivf_pq_build_conf.empty() || !ivf_pq_search_conf.empty()) { - params.graph_build_params = cuvs::neighbors::graph_build_params::ivf_pq_params{}; - } else if (!nn_descent_conf.empty()) { - params.graph_build_params = cuvs::neighbors::graph_build_params::nn_descent_params{}; - } else if (!ace_conf.empty()) { - params.graph_build_params = cuvs::neighbors::graph_build_params::ace_params{}; - } - // else: leave as monostate → AUTO in cagra_build.cuh - } - // Apply build-algo-specific parameters std::visit( [&](auto& arg) { diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index 637e40c340..7c533d0737 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -146,11 +146,37 @@ enum class hnsw_heuristic_type : uint32_t { SAME_GRAPH_FOOTPRINT = 1 }; +/** + * Sentinel marking an invalid / absent neighbor in a CAGRA graph. Variable-degree + * graphs (see index_params::variable_graph_degree_fraction) pad unused neighbor + * slots with this value, and consumers should treat it as "end of neighbor list". + */ +template +constexpr static IdxT kInvalidNeighbor = static_cast(-1); + struct index_params : cuvs::neighbors::index_params { /** Degree of input graph for pruning. */ size_t intermediate_graph_degree = 128; /** Degree of output graph. */ size_t graph_degree = 64; + /** + * Fraction of output graph_degree to define the minimum output graph degree, + * allowing variable-degree neighbor graphs. + * + * This fraction is used as the target for low-detour edges + * during the pruning step. Must be in (0, 1]. The default value of 1.0 + * disables variable-degree logic (normal CAGRA behavior). Values < 1.0 + * enable variable-degree graphs: the optimize step finds the minimum detour + * threshold that covers at least ceil(graph_degree * fraction) edges per node, + * then lets reverse edges expand the degree further. Unused slots are filled + * with a sentinel value (`kInvalidNeighbor`). + * + * This is intended for the CAGRA-to-HNSW conversion pipeline: the resulting + * graph, when imported into hnswlib, produces variable-degree neighbor lists + * similar to natively-built HNSW graphs. Do not use this with CAGRA's native + * GPU search. + */ + double variable_graph_degree_fraction = 1.0; /** * Specify compression parameters if compression is desired. If set, overrides the * attach_dataset_on_build (and the compressed dataset is always added to the index). diff --git a/cpp/src/neighbors/cagra.cpp b/cpp/src/neighbors/cagra.cpp index 6aa5737e36..cd63de9a24 100644 --- a/cpp/src/neighbors/cagra.cpp +++ b/cpp/src/neighbors/cagra.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -40,13 +40,15 @@ cagra::index_params index_params::from_hnsw_params(raft::matrix_extent cagra::index_params params; switch (heuristic) { case hnsw_heuristic_type::SAME_GRAPH_FOOTPRINT: - params.graph_degree = M * 2; - params.intermediate_graph_degree = M * 3; + params.graph_degree = M * 2; + params.intermediate_graph_degree = M * 3; + params.variable_graph_degree_fraction = 0.35; break; case hnsw_heuristic_type::SIMILAR_SEARCH_PERFORMANCE: default: - params.graph_degree = 2 + M * 2 / 3; - params.intermediate_graph_degree = M + M * ef_construction / 256; + params.graph_degree = M; + params.intermediate_graph_degree = M + M * ef_construction / 256; + params.variable_graph_degree_fraction = 0.7; break; } params.graph_build_params = diff --git a/cpp/src/neighbors/cagra.cuh b/cpp/src/neighbors/cagra.cuh index ee87c2c0ab..be789e04e8 100644 --- a/cpp/src/neighbors/cagra.cuh +++ b/cpp/src/neighbors/cagra.cuh @@ -259,9 +259,11 @@ void optimize( raft::resources const& res, raft::mdspan, raft::row_major, g_accessor> knn_graph, raft::host_matrix_view new_graph, - const bool guarantee_connectivity = false) + const bool guarantee_connectivity = false, + const double variable_graph_degree_fraction = 1.0) { - detail::optimize(res, knn_graph, new_graph, guarantee_connectivity); + detail::optimize( + res, knn_graph, new_graph, guarantee_connectivity, variable_graph_degree_fraction); } template ) { + // Variable-degree padding / invalid-neighbor sentinel: propagate as-is. + search_graph(i_original, k) = kInvalidNeighbor; + continue; + } + if (j < core_sub_dataset_size) { // core partition neighbor: local → core reordered → original size_t j_reordered = j + core_partition_offsets(partition_id); @@ -443,6 +449,11 @@ void ace_adjust_sub_graph_ids_disk( for (size_t i = 0; i < core_sub_dataset_size; i++) { for (size_t k = 0; k < graph_degree; k++) { size_t j = sub_search_graph(i, k); + if (j == kInvalidNeighbor) { + // Variable-degree padding / invalid-neighbor sentinel: propagate as-is. + sub_search_graph(i, k) = kInvalidNeighbor; + continue; + } if (j < core_sub_dataset_size) { // core partition neighbor: local → core reordered sub_search_graph(i, k) = j + core_partition_offsets(partition_id); @@ -1930,7 +1941,8 @@ void optimize( raft::resources const& res, raft::mdspan, raft::row_major, g_accessor> knn_graph, raft::host_matrix_view new_graph, - const bool guarantee_connectivity = false) + const bool guarantee_connectivity = false, + const double variable_graph_degree_fraction = 1.0) { using internal_IdxT = typename std::make_unsigned::type; @@ -1947,8 +1959,12 @@ void optimize( knn_graph.extent(0), knn_graph.extent(1)); - cagra::detail::graph::optimize( - res, knn_graph_internal, new_graph_internal, guarantee_connectivity); + cagra::detail::graph::optimize(res, + knn_graph_internal, + new_graph_internal, + guarantee_connectivity, + true, + variable_graph_degree_fraction); } // RAII wrapper for allocating memory with Transparent HugePage @@ -2168,8 +2184,11 @@ auto iterative_build_graph( auto next_graph_size = curr_query_size; cagra_graph = raft::make_host_matrix(0, 0); // delete existing grahp cagra_graph = raft::make_host_matrix(next_graph_size, next_graph_degree); - optimize( - res, neighbors_view, cagra_graph.view(), flag_last ? params.guarantee_connectivity : 0); + optimize(res, + neighbors_view, + cagra_graph.view(), + flag_last ? params.guarantee_connectivity : false, + flag_last ? params.variable_graph_degree_fraction : 1.0); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_ms = std::chrono::duration_cast(end - start).count(); @@ -2289,7 +2308,11 @@ index build( cagra_graph = raft::make_host_matrix(dataset.extent(0), graph_degree); RAFT_LOG_TRACE("optimizing graph"); - optimize(res, knn_graph->view(), cagra_graph.view(), params.guarantee_connectivity); + optimize(res, + knn_graph->view(), + cagra_graph.view(), + params.guarantee_connectivity, + params.variable_graph_degree_fraction); // free intermediate graph before trying to create the index knn_graph.reset(); diff --git a/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp b/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp index 3f79df47dd..f13f831eeb 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp +++ b/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp @@ -65,6 +65,8 @@ std::tuple optimize_workspace_size(size_t n_rows size_t prune_dev = n_rows * intermediate_degree * index_size; // d_input_graph prune_dev += prune_dev_fixed; + // d_natural_degree (only allocated when variable_graph_degree_fraction < 1.0) + prune_dev += n_rows * sizeof(uint32_t); // Reverse graph stage memory size_t rev_dev = n_rows * graph_degree * index_size; // d_rev_graph diff --git a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh index f106b82500..cc19867ed0 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh @@ -198,8 +198,14 @@ void serialize_to_hnswlib( size_t bytes_written = 0; float GiB = 1 << 30; for (std::size_t i = 0; i < index_.size(); i++) { - auto graph_degree = static_cast(index_.graph_degree()); - os.write(reinterpret_cast(&graph_degree), sizeof(int)); + int actual_degree = static_cast(index_.graph_degree()); + for (int j = 0; j < actual_degree; j++) { + if (host_graph(i, j) == static_cast(-1)) { + actual_degree = j; + break; + } + } + os.write(reinterpret_cast(&actual_degree), sizeof(int)); IdxT* graph_row = &host_graph(i, 0); os.write(reinterpret_cast(graph_row), sizeof(IdxT) * index_.graph_degree()); diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 52b4542798..b5e0fd6118 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -7,6 +7,8 @@ #include "cagra_helpers.hpp" #include "utils.hpp" +#include + #include #include #include @@ -14,6 +16,7 @@ #include #include #include +#include #include // TODO: This shouldn't be invoking anything from spatial/knn @@ -174,22 +177,34 @@ __global__ void kern_sort(const DATA_T* const dataset, // [dataset_chunk_size, } } -template +// `output_graph` may be either the full 2D graph ([graph_size, degree], indexed with column `k`) +// or a single pre-extracted column passed as a 1D vector ([graph_size], indexed by row only). The +// rank is inspected at compile time; `k` is always the logical column index and is used for the +// natural-degree mask regardless of the view rank. +// +// When VariableDegree is true, edges past natural_degree(src_id) are masked out (do not contribute +// to the reverse graph). natural_degree is otherwise unused and may be a default-constructed view. +template __global__ void kern_make_rev_graph_k( - OutputMatrixView output_graph, // [graph_size, degree] - raft::device_matrix_view rev_graph, // [graph_size, degree] + OutputView output_graph, // [graph_size, degree] or [graph_size] + raft::device_matrix_view rev_graph, // [graph_size, degree] raft::device_vector_view rev_graph_count, // [graph_size] - uint64_t k) + uint64_t k, + raft::device_vector_view natural_degree) { const uint64_t tid = threadIdx.x + (blockDim.x * blockIdx.x); const uint64_t tnum = blockDim.x * gridDim.x; - const uint64_t graph_size = rev_graph.extent(0); - const uint32_t rev_graph_degree = rev_graph.extent(1); - const uint32_t output_graph_degree = output_graph.extent(1); + const uint64_t graph_size = rev_graph.extent(0); + const uint32_t rev_graph_degree = rev_graph.extent(1); for (uint64_t src_id = tid; src_id < graph_size; src_id += tnum) { - IdxT dest_id = output_graph(src_id, k); + IdxT dest_id; + if constexpr (OutputView::rank() == 2) { + dest_id = output_graph(src_id, k); + } else { + dest_id = output_graph(src_id); + } if (dest_id >= graph_size) continue; const uint32_t pos = atomicAdd(&rev_graph_count(dest_id), 1); @@ -202,13 +217,26 @@ __global__ void kern_make_rev_graph_k( // layout_stride (the result of cuda::std::submdspan), and any accessor that is // device-accessible (default_accessor, raft::host_device_accessor with a // device-accessible memory_type, etc.). -template +// +// When VariableDegree is true, the kernel additionally writes a per-node +// "natural degree" (count of low-detour edges) into d_natural_degree. +// target_pruned_degree is the user-provided floor controlling where the natural +// detour-band boundary is detected (see comments in optimize()). When +// VariableDegree is false, d_natural_degree and target_pruned_degree are unused +// and the kernel behaves identically to its pre-feature form. +template __global__ void kern_fused_prune(KnnGraphView knn_graph, // [graph_chunk_size, graph_degree] OutputGraphView output_graph, // [batch_size, output_graph_degree] const uint32_t batch_size, const uint32_t batch_id, uint32_t* const d_invalid_neighbor_list, - uint64_t* const stats) + uint64_t* const stats, + raft::device_vector_view d_natural_degree, + const uint32_t target_pruned_degree) { // Check assumption we have at least one warp per row of the batch assert(blockDim.x == raft::WarpSize * num_warps); @@ -297,6 +325,15 @@ __global__ void kern_fused_prune(KnnGraphView knn_graph, // [graph_chunk_ } #endif + // Detour level recorded at iteration (target_pruned_degree - 1). The natural-degree boundary is + // the first iteration `i >= target_pruned_degree` for which warp_min_count strictly exceeds + // this level. Initialized to maxval16 so the comparison below is well-defined before we reach + // the target index. + uint32_t target_detour_level = maxval16; + // Final natural degree to write back to global memory; the default `output_graph_degree` means + // "all edges fit under the target detour level" (i.e. no padding will be needed). + uint32_t natural_degree = output_graph_degree; + for (uint32_t i = 0; i < output_graph_degree; i++) { uint32_t local_min = maxval16; uint32_t local_idx = maxval16; @@ -317,6 +354,17 @@ __global__ void kern_fused_prune(KnnGraphView knn_graph, // [graph_chunk_ break; } + if constexpr (VariableDegree) { + if (i + 1 == target_pruned_degree) { + // Freeze the detour level after we've placed exactly target_pruned_degree edges. + target_detour_level = warp_min_count; + } else if (i >= target_pruned_degree && warp_min_count > target_detour_level && + natural_degree == output_graph_degree) { + // The detour level just rose above the target band. Record the natural degree once. + natural_degree = i; + } + } + IdxT selected_node = smem_indices[warp_local_idx]; for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) { @@ -326,6 +374,10 @@ __global__ void kern_fused_prune(KnnGraphView knn_graph, // [graph_chunk_ if (lane_id == 0) { output_graph(nid_batch, i) = selected_node; } } + + if constexpr (VariableDegree) { + if (lane_id == 0) { d_natural_degree(nid) = natural_degree; } + } } // Helper functions for merging the graph @@ -367,8 +419,16 @@ __device__ void warp_shift_array_one_right(uint32_t lane_id, T* array, uint64_t // OutputGraphView, MstGraphView and MstNumEdgesView are 2D mdspans that may // have layout_right or layout_stride and any device-accessible accessor; see // the comment on kern_fused_prune for details. +// +// When VariableDegree is true, `d_natural_degree` (size graph_size) drives: +// * the per-node `effective_degree` used to cap `num_protected_edges` +// * the `kInvalidNeighbor` sentinel padding for slots past the final degree +// * the write-back of the final effective degree +// * popularity-based thinning of reverse-graph candidates (see the merge loop) +// When VariableDegree is false, the parameter is unused (and may be a default view). template @@ -381,7 +441,8 @@ __global__ void kern_merge_graph( const uint32_t batch_size, const uint32_t batch_id, bool guarantee_connectivity, - uint32_t* check_num_protected_edges) + raft::device_scalar_view check_num_protected_edges, + raft::device_vector_view d_natural_degree) { // Check assumption we have at least one warp per row of the batch assert(blockDim.x == raft::WarpSize * num_warps); @@ -405,6 +466,10 @@ __global__ void kern_merge_graph( if (nid >= graph_size) { return; } + // Per-node "in" degree before reverse-graph back-fill. In the constant-degree path this is just + // `output_graph_degree`. In the variable-degree path it comes from the prune step. + uint32_t effective_degree = VariableDegree ? d_natural_degree(nid) : output_graph_degree; + const auto current_mst_graph_num_edges = guarantee_connectivity ? mst_graph_num_edges(nid_batch, 0) : 0; // If guarantee_connectivity == true, use a temporal list to merge the @@ -442,13 +507,22 @@ __global__ void kern_merge_graph( __syncwarp(); } - const auto num_protected_edges = max(current_mst_graph_num_edges, output_graph_degree / 2); + // In the variable-degree path, `num_protected_edges` is additionally capped by the per-node + // `effective_degree`: protecting more than the natural degree would prevent rev-graph edges + // from being inserted (since they always go into slot `num_protected_edges`) and would mask + // them with the prune-step contents that have not yet been validated as kept. + const auto num_protected_edges = + max(current_mst_graph_num_edges, min(effective_degree, output_graph_degree / 2)); - if (num_protected_edges > output_graph_degree) { - check_num_protected_edges[0] = 0u; + if (num_protected_edges > effective_degree) { + check_num_protected_edges(0) = 0u; return; } - if (num_protected_edges == output_graph_degree) { return; } + // Variable-degree path always needs to write the -1 padding (and write-back natural_degree), + // so we cannot early-return like the constant-degree path does when there is no room to insert. + if constexpr (!VariableDegree) { + if (num_protected_edges == output_graph_degree) { return; } + } auto kr = min(rev_graph_count(nid), output_graph_degree); @@ -456,6 +530,10 @@ __global__ void kern_merge_graph( kr -= 1; const auto rev_graph_value = rev_graph(nid, kr); if (rev_graph_value < graph_size) { + if constexpr (VariableDegree) { + const uint32_t in_degree = rev_graph_count(rev_graph_value); + if (in_degree < output_graph_degree) { continue; } + } uint64_t pos = warp_pos_in_array(rev_graph_value, smem_sorted_output_graph, output_graph_degree); if (pos < num_protected_edges) { continue; } @@ -465,11 +543,26 @@ __global__ void kern_merge_graph( lane_id, smem_sorted_output_graph + num_protected_edges, num_shift); if (lane_id == 0) { smem_sorted_output_graph[num_protected_edges] = rev_graph_value; } __syncwarp(); + // A new rev-graph edge has been inserted at position `num_protected_edges`, growing the + // node's effective degree up to (but not beyond) the full output_graph_degree. In the + // constant-degree path effective_degree already equals output_graph_degree, so this is a + // no-op and needs no VariableDegree guard. + if (effective_degree < output_graph_degree) { effective_degree++; } } } + // The write-back stays guarded: d_natural_degree is an empty view in the constant-degree path. + if constexpr (VariableDegree) { + if (lane_id == 0) { d_natural_degree(nid) = effective_degree; } + } + + // Slots past the final effective degree are written with the kInvalidNeighbor sentinel that + // downstream consumers (HNSW serialization, out-of-range check) recognize as "end of neighbor + // list". In the constant-degree path effective_degree == output_graph_degree, so no slot is + // padded. for (uint32_t i = lane_id; i < output_graph_degree; i += raft::WarpSize) { - output_graph(nid_batch, i) = smem_sorted_output_graph[i]; + output_graph(nid_batch, i) = + (i < effective_degree) ? smem_sorted_output_graph[i] : kInvalidNeighbor; } } @@ -804,6 +897,10 @@ void check_duplicates_and_out_of_range( for (uint32_t j = 0; j < output_graph_degree; j++) { const auto neighbor_a = my_out_graph[j]; + // The variable-degree path pads unused slots with kInvalidNeighbor. Without this guard the + // sentinel would (correctly) trigger the `>= graph_size` out-of-range check below. + if (neighbor_a == kInvalidNeighbor) { continue; } + if (neighbor_a >= graph_size) { num_oor++; continue; @@ -824,7 +921,7 @@ void check_duplicates_and_out_of_range( num_oor == 0, "%lu out-of-range index node(s) are found in the generated CAGRA graph", num_oor); } -template +template void merge_graph_gpu( raft::resources const& res, raft::mdspan, raft::row_major, AccessorOutputGraph> @@ -833,7 +930,8 @@ void merge_graph_gpu( raft::device_vector_view d_rev_graph_count, raft::host_matrix_view mst_graph, raft::host_vector_view mst_graph_num_edges, - bool guarantee_connectivity) + bool guarantee_connectivity, + raft::device_vector_view d_natural_degree) { const uint64_t graph_size = output_graph.extent(0); const uint64_t output_graph_degree = output_graph.extent(1); @@ -888,7 +986,7 @@ void merge_graph_gpu( auto mst_graph_view = (*d_mst_graph).view(); auto mst_graph_num_edges_view = (*d_mst_graph_num_edges).view(); auto output_view = (*d_output_graph).view(); - kern_merge_graph + kern_merge_graph <<>>( output_view, d_rev_graph, @@ -898,7 +996,8 @@ void merge_graph_gpu( batch_size, i_batch, guarantee_connectivity, - d_check_num_protected_edges.data_handle()); + d_check_num_protected_edges.view(), + d_natural_degree); d_output_graph.prefetch_next_batch(); d_mst_graph.prefetch_next_batch(); @@ -923,13 +1022,17 @@ void merge_graph_gpu( (merge_graph_end - merge_graph_start) * 1000.0); } -template +// When VariableDegree is true, `d_natural_degree` (size graph_size) gates which slots of the +// output graph contribute to the reverse graph: only positions [0, natural_degree(i)) of node i +// are considered, the rest are skipped. +template void make_reverse_graph_gpu( raft::resources const& res, raft::mdspan, raft::row_major, AccessorOutputGraph> output_graph, raft::device_matrix_view d_rev_graph, - raft::device_vector_view d_rev_graph_count) + raft::device_vector_view d_rev_graph_count, + raft::device_vector_view d_natural_degree) { const uint64_t graph_size = output_graph.extent(0); const uint64_t output_graph_degree = output_graph.extent(1); @@ -940,7 +1043,7 @@ void make_reverse_graph_gpu( // // Make reverse graph // - raft::matrix::fill(res, d_rev_graph, IdxT(-1)); + raft::matrix::fill(res, d_rev_graph, kInvalidNeighbor); raft::matrix::fill(res, d_rev_graph_count, uint32_t(0)); if constexpr (AccessorOutputGraph::is_device_accessible) { @@ -948,12 +1051,21 @@ void make_reverse_graph_gpu( dim3 threads(256, 1, 1); dim3 blocks(1024, 1, 1); for (uint64_t k = 0; k < output_graph_degree; k++) { - kern_make_rev_graph_k<<>>( - output_graph, d_rev_graph, d_rev_graph_count, k); + kern_make_rev_graph_k + <<>>( + output_graph, + d_rev_graph, + d_rev_graph_count, + k, + raft::make_const_mdspan(d_natural_degree)); } } else { - auto d_dest_nodes = raft::make_device_matrix(res, graph_size, 1); - auto dest_nodes = raft::make_host_vector(graph_size); + // Host variant: the output graph is host-only, so we extract one column at a time into a + // device vector and hand that to the kernel. The natural-degree mask is applied inside the + // kernel using the real column index `k` (the kernel indexes the 1D `d_dest_nodes` by row). + auto d_dest_nodes = raft::make_device_vector(res, graph_size); + auto dest_nodes = raft::make_host_vector(res, graph_size); + for (uint64_t k = 0; k < output_graph_degree; k++) { #pragma omp parallel for for (uint64_t i = 0; i < graph_size; i++) { @@ -963,8 +1075,13 @@ void make_reverse_graph_gpu( dim3 threads(256, 1, 1); dim3 blocks(1024, 1, 1); - kern_make_rev_graph_k<<>>( - d_dest_nodes.view(), d_rev_graph, d_rev_graph_count, 0); + kern_make_rev_graph_k + <<>>( + d_dest_nodes.view(), + d_rev_graph, + d_rev_graph_count, + k, + raft::make_const_mdspan(d_natural_degree)); raft::resource::sync_stream(res); RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u \r", k, output_graph_degree); } @@ -1592,12 +1709,17 @@ void mst_optimization( // specified number of edges are picked up for each node, starting with the edge with // the lowest number of 2-hop detours. // -template +template void prune_graph_gpu( raft::resources const& res, raft::mdspan, raft::row_major, AccessorKnnGraph> knn_graph, raft::mdspan, raft::row_major, AccessorOutputGraph> - output_graph) + output_graph, + raft::device_vector_view d_natural_degree, + uint32_t target_pruned_degree) { const uint64_t graph_size = output_graph.extent(0); const uint64_t knn_graph_degree = knn_graph.extent(1); @@ -1652,14 +1774,16 @@ void prune_graph_gpu( for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { auto output_view = (*d_output_graph).view(); - kern_fused_prune + kern_fused_prune <<>>( input_view, output_view, batch_size, i_batch, d_invalid_neighbor_list.data_handle(), - dev_stats.data_handle()); + dev_stats.data_handle(), + d_natural_degree, + target_pruned_degree); d_output_graph.prefetch_next_batch(); ++d_output_graph; @@ -1707,8 +1831,9 @@ void optimize( raft::resources const& res_const, raft::mdspan, raft::row_major, AccessorKnnGraph> knn_graph, raft::mdspan, raft::row_major, AccessorOutputGraph> new_graph, - const bool guarantee_connectivity = true, - const bool use_gpu_for_mst_optimization = true) + const bool guarantee_connectivity = true, + const bool use_gpu_for_mst_optimization = true, + const double variable_graph_degree_fraction = 1.0) { RAFT_LOG_DEBUG( "# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1)); @@ -1735,13 +1860,32 @@ void optimize( RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1), "output graph cannot have more columns than input graph"); // const uint64_t input_graph_degree = knn_graph.extent(1); - const uint64_t knn_graph_degree = knn_graph.extent(1); - const uint64_t output_graph_degree = new_graph.extent(1); - const uint64_t graph_size = new_graph.extent(0); + const uint64_t knn_graph_degree = knn_graph.extent(1); + const uint64_t output_graph_degree = new_graph.extent(1); + const uint64_t graph_size = new_graph.extent(0); + const uint64_t target_pruned_degree = std::max( + 1, static_cast(std::ceil(output_graph_degree * variable_graph_degree_fraction))); + const bool variable_graph_degree = (target_pruned_degree < output_graph_degree); + + if (variable_graph_degree) { + RAFT_LOG_INFO("# Pruning kNN graph (size=%lu, degree=%lu, target_pruned_degree=%lu)", + graph_size, + knn_graph_degree, + target_pruned_degree); + } raft::common::nvtx::range fun_scope( "cagra::graph::optimize(%zu, %zu, %u)", graph_size, knn_graph_degree, output_graph_degree); + // Per-node "natural" degree produced by the prune step. Only allocated when the variable-degree + // path is enabled. The prune kernel records here the count of edges with detour level <= + // detour level at position (target_pruned_degree - 1). The merge kernel then both reads and + // updates this array (after rev-graph back-fill the value becomes the final effective degree). + auto d_natural_degree = raft::make_device_mdarray( + res, + default_ws_mr, + raft::make_extents(variable_graph_degree ? static_cast(graph_size) : 0)); + // MST optimization // currently, only using GPU path for MST optimization int64_t mst_graph_size = guarantee_connectivity ? graph_size : 0; @@ -1765,7 +1909,13 @@ void optimize( // prune graph -- will always use GPU path { - prune_graph_gpu(res, knn_graph, new_graph); + if (variable_graph_degree) { + prune_graph_gpu( + res, knn_graph, new_graph, d_natural_degree.view(), target_pruned_degree); + } else { + prune_graph_gpu( + res, knn_graph, new_graph, d_natural_degree.view(), target_pruned_degree); + } } // reverse graph creation will always use the GPU / large workspace resource @@ -1778,7 +1928,13 @@ void optimize( const double time_make_start = cur_time(); - make_reverse_graph_gpu(res, new_graph, d_rev_graph.view(), d_rev_graph_count.view()); + if (variable_graph_degree) { + make_reverse_graph_gpu( + res, new_graph, d_rev_graph.view(), d_rev_graph_count.view(), d_natural_degree.view()); + } else { + make_reverse_graph_gpu( + res, new_graph, d_rev_graph.view(), d_rev_graph_count.view(), d_natural_degree.view()); + } raft::resource::sync_stream(res); @@ -1788,17 +1944,52 @@ void optimize( // merge graph -- will always use GPU path { - merge_graph_gpu(res, - new_graph, - d_rev_graph.view(), - d_rev_graph_count.view(), - mst_graph.view(), - mst_graph_num_edges.view(), - guarantee_connectivity); + if (variable_graph_degree) { + merge_graph_gpu(res, + new_graph, + d_rev_graph.view(), + d_rev_graph_count.view(), + mst_graph.view(), + mst_graph_num_edges.view(), + guarantee_connectivity, + d_natural_degree.view()); + } else { + merge_graph_gpu(res, + new_graph, + d_rev_graph.view(), + d_rev_graph_count.view(), + mst_graph.view(), + mst_graph_num_edges.view(), + guarantee_connectivity, + d_natural_degree.view()); + } + } + + auto d_avg_natural = raft::make_device_scalar(res, 0); + double avg_natural = 0; + if (variable_graph_degree) { + // main_op: cast each degree to double (innermost op, which also drops the reduction-index + // argument), then divide by graph_size, so the reduction accumulates the mean directly. + auto normalize_mean = raft::compose_op{ + raft::div_const_op{static_cast(graph_size)}, raft::cast_op{}}; + raft::linalg::reduce( + res, + raft::make_device_matrix_view( + d_natural_degree.data_handle(), int64_t{1}, static_cast(graph_size)), + raft::make_device_vector_view(d_avg_natural.data_handle(), int64_t{1}), + 0.0, + false, + normalize_mean); + raft::copy(res, raft::make_host_scalar_view(&avg_natural), d_avg_natural.view()); } raft::resource::sync_stream(res); + if (variable_graph_degree) { + RAFT_LOG_INFO( + "# Variable graph degree: avg natural degree = %.2f / %lu", avg_natural, output_graph_degree); + } + // These host-side checks are expensive (O(N*D^2)) and only used as debug // diagnostics, so only run them when debug logging is active at runtime. if (raft::default_logger().should_log(rapids_logger::level_enum::debug)) { diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp index 88580de929..3031b739ee 100644 --- a/cpp/src/neighbors/detail/hnsw.hpp +++ b/cpp/src/neighbors/detail/hnsw.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ @@ -310,12 +310,16 @@ std::enable_if_t>> fro for (size_t i = 0; i < static_cast(host_graph_view.extent(0)); ++i) { auto hnsw_internal_id = appr_algo->label_lookup_.find(i)->second; auto ll_i = appr_algo->get_linklist0(hnsw_internal_id); - appr_algo->setListCount(ll_i, host_graph_view.extent(1)); - auto* data = (uint32_t*)(ll_i + 1); + size_t actual_count = 0; + auto* data = (uint32_t*)(ll_i + 1); for (size_t j = 0; j < static_cast(host_graph_view.extent(1)); ++j) { - auto neighbor_internal_id = appr_algo->label_lookup_.find(host_graph(i, j))->second; - data[j] = neighbor_internal_id; + auto neighbor_id = host_graph(i, j); + if (neighbor_id == static_cast(-1)) { break; } + auto neighbor_internal_id = appr_algo->label_lookup_.find(neighbor_id)->second; + data[actual_count] = neighbor_internal_id; + actual_count++; } + appr_algo->setListCount(ll_i, actual_count); } hnsw_index->set_index(std::move(appr_algo)); @@ -543,9 +547,17 @@ void serialize_to_hnswlib_batched(raft::resources const& res, for (int64_t batch_idx = 0; batch_idx < current_batch_size; batch_idx++) { const int64_t i = batch_start + batch_idx; - os.write(reinterpret_cast(&graph_degree_int), sizeof(int)); - + // Variable-degree graphs pad unused neighbor slots with kInvalidNeighbor; the per-node + // hnswlib link-list size is the number of valid neighbors before the first sentinel. const IdxT* graph_row = &graph_buffer(batch_idx, 0); + int actual_degree_int = graph_degree_int; + for (int gj = 0; gj < graph_degree_int; gj++) { + if (graph_row[gj] == cuvs::neighbors::cagra::kInvalidNeighbor) { + actual_degree_int = gj; + break; + } + } + os.write(reinterpret_cast(&actual_degree_int), sizeof(int)); os.write(reinterpret_cast(graph_row), sizeof(IdxT) * graph_degree_int); if (odd_graph_degree) { @@ -1123,15 +1135,22 @@ std::enable_if_t>> fro // iterate over the points in the descending order of their levels for (size_t pt_level = hist.size() - 1; pt_level >= 1; pt_level--) { common::nvtx::range level_scope("level %zu", pt_level); - auto start_idx = offsets[pt_level - 1]; - auto end_idx = offsets[hist.size() - 1]; - auto num_pts = end_idx - start_idx; - auto neighbor_size = num_pts > appr_algo->M_ ? appr_algo->M_ : num_pts - 1; + auto start_idx = offsets[pt_level - 1]; + auto end_idx = offsets[hist.size() - 1]; + auto num_pts = end_idx - start_idx; if (num_pts <= 1) { // this means only 1 point in the level continue; } + // Final per-layer degree (capped at the upper-layer limit maxM_ == M_) and the + // denser intermediate kNN degree we prune from (capped at maxM0_ == 2*M_). This + // mirrors CAGRA's own build_knn_graph -> optimize pipeline: build a richer kNN + // graph, then prune detourable edges and add reverse edges down to the target + // degree. Both are clamped so a tiny top layer is at most fully connected. + const int64_t knn_degree = std::min(appr_algo->maxM0_, num_pts - 1); + const int64_t out_degree = std::min(appr_algo->maxM_, knn_degree); + // gather points from dataset to form query set on host auto host_query_set = raft::make_host_matrix(num_pts, dim); // TODO: Use `raft::matrix::gather` when available as a public API @@ -1145,11 +1164,21 @@ std::enable_if_t>> fro } // find neighbors of the query set - auto host_neighbors = raft::make_host_matrix(num_pts, neighbor_size); - all_neighbors_graph(res, - raft::make_const_mdspan(host_query_set.view()), - host_neighbors.view(), - cagra_index.metric()); + auto host_neighbors = raft::make_host_matrix(num_pts, out_degree); + if (knn_degree > out_degree) { + // Build a denser intermediate kNN graph, then prune it down to out_degree with + // variable degree disabled (mirrors CAGRA's build_knn_graph -> optimize). The + // constant-degree optimize output is full fixed degree with no invalid padding. + auto host_knn = raft::make_host_matrix(num_pts, knn_degree); + all_neighbors_graph( + res, raft::make_const_mdspan(host_query_set.view()), host_knn.view(), cagra_index.metric()); + cuvs::neighbors::cagra::helpers::optimize(res, host_knn.view(), host_neighbors.view()); + } else { + all_neighbors_graph(res, + raft::make_const_mdspan(host_query_set.view()), + host_neighbors.view(), + cagra_index.metric()); + } { common::nvtx::range copy_scope( @@ -1186,12 +1215,16 @@ std::enable_if_t>> fro common::nvtx::range copy_scope("get_linklist0"); #pragma omp parallel for num_threads(num_threads) for (int64_t i = 0; i < n_rows; i++) { - auto ll_i = appr_algo->get_linklist0(i); - appr_algo->setListCount(ll_i, degree); - auto* data = (uint32_t*)(ll_i + 1); + auto ll_i = appr_algo->get_linklist0(i); + auto* data = (uint32_t*)(ll_i + 1); + int64_t actual_count = 0; for (int64_t j = 0; j < degree; j++) { - data[j] = graph_ptr[i * degree + j]; + auto neighbor_id = graph_ptr[i * degree + j]; + if (neighbor_id == static_cast(-1)) { break; } + data[actual_count] = neighbor_id; + actual_count++; } + appr_algo->setListCount(ll_i, actual_count); } } else { common::nvtx::range copy_scope("get_linklist0"); @@ -1203,11 +1236,20 @@ std::enable_if_t>> fro n_rows, cudaMemcpyDefault, raft::resource::get_cuda_stream(res))); + raft::resource::sync_stream(res); #pragma omp parallel for num_threads(num_threads) for (int64_t i = 0; i < n_rows; i++) { - appr_algo->setListCount(appr_algo->get_linklist0(i), degree); + auto ll_i = appr_algo->get_linklist0(i); + auto* data = (uint32_t*)(ll_i + 1); + int64_t actual_count = degree; + for (int64_t j = 0; j < degree; j++) { + if (data[j] == static_cast(-1)) { + actual_count = j; + break; + } + } + appr_algo->setListCount(ll_i, actual_count); } - raft::resource::sync_stream(res); } hnsw_index->set_index(std::move(appr_algo)); return hnsw_index;