diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
index 57b47d97db..6ef2ae25c2 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
@@ -341,24 +341,52 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index
     params.guarantee_connectivity = conf.at("guarantee_connectivity");
   }
 
-  // Override the graph_build_algo if requested explicitly
+  if (conf.contains("variable_graph_degree_fraction")) {
+    params.variable_graph_degree_fraction = conf.at("variable_graph_degree_fraction");
+  }
+
+  // Extract build-algo-specific parameters
+  nlohmann::json ivf_pq_build_conf  = collect_conf_with_prefix(conf, "ivf_pq_build_");
+  nlohmann::json ivf_pq_search_conf = collect_conf_with_prefix(conf, "ivf_pq_search_");
+  nlohmann::json nn_descent_conf    = collect_conf_with_prefix(conf, "nn_descent_");
+  nlohmann::json ace_conf           = collect_conf_with_prefix(conf, "ace_");
+
+  // Determine and initialize graph build algorithm.
+  // Priority 1: explicit "graph_build_algo" config key.
+  // Priority 2: infer from algorithm-specific prefixed config keys (only when monostate).
+  // Priority 3: leave as-is (from prior heuristics or monostate for AUTO at build time).
+  std::string graph_build_algo;
   if (conf.contains("graph_build_algo")) {
-    if (conf.at("graph_build_algo") == "IVF_PQ") {
+    graph_build_algo = conf.at("graph_build_algo");
+  } else if (std::holds_alternative<std::monostate>(params.graph_build_params)) {
+    if (!ivf_pq_build_conf.empty() || !ivf_pq_search_conf.empty()) {
+      graph_build_algo = "IVF_PQ";
+    } else if (!nn_descent_conf.empty()) {
+      graph_build_algo = "NN_DESCENT";
+    } else if (!ace_conf.empty()) {
+      graph_build_algo = "ACE";
+    }
+    // else: leave as monostate → AUTO in cagra_build.cuh
+  }
+
+  if (!graph_build_algo.empty()) {
+    if (graph_build_algo == "IVF_PQ") {
       if (!std::holds_alternative<cuvs::neighbors::graph_build_params::ivf_pq_params>(
             params.graph_build_params)) {
         params.graph_build_params = cuvs::neighbors::graph_build_params::ivf_pq_params{};
       }
-    } else if (conf.at("graph_build_algo") == "NN_DESCENT") {
+    } else if (graph_build_algo == "NN_DESCENT") {
       if (!std::holds_alternative<cuvs::neighbors::graph_build_params::nn_descent_params>(
             params.graph_build_params)) {
-        params.graph_build_params = cuvs::neighbors::graph_build_params::nn_descent_params{};
+        params.graph_build_params = cuvs::neighbors::graph_build_params::nn_descent_params(
+          params.intermediate_graph_degree, params.metric);
       }
-    } else if (conf.at("graph_build_algo") == "ACE") {
+    } else if (graph_build_algo == "ACE") {
       if (!std::holds_alternative<cuvs::neighbors::graph_build_params::ace_params>(
             params.graph_build_params)) {
         params.graph_build_params = cuvs::neighbors::graph_build_params::ace_params{};
       }
-    } else if (conf.at("graph_build_algo") == "ITERATIVE_SEARCH") {
+    } else if (graph_build_algo == "ITERATIVE_SEARCH") {
       if (!std::holds_alternative<cuvs::neighbors::graph_build_params::iterative_search_params>(
             params.graph_build_params)) {
         params.graph_build_params = cuvs::neighbors::graph_build_params::iterative_search_params{};
@@ -366,26 +394,6 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index
     }
   }
 
-  // Parse build-algo-specific parameters and use them to decide on the algo type
-  nlohmann::json ivf_pq_build_conf  = collect_conf_with_prefix(conf, "ivf_pq_build_");
-  nlohmann::json ivf_pq_search_conf = collect_conf_with_prefix(conf, "ivf_pq_search_");
-  nlohmann::json nn_descent_conf    = collect_conf_with_prefix(conf, "nn_descent_");
-  nlohmann::json ace_conf           = collect_conf_with_prefix(conf, "ace_");
-
-  // When graph_build_algo is not specified, leave graph_build_params as monostate so the
-  // CAGRA build uses AUTO selection (NN_DESCENT or IVF_PQ based on dataset/heuristics).
-  // Only infer from algo-specific config keys when present.
-  if (std::holds_alternative<std::monostate>(params.graph_build_params)) {
-    if (!ivf_pq_build_conf.empty() || !ivf_pq_search_conf.empty()) {
-      params.graph_build_params = cuvs::neighbors::graph_build_params::ivf_pq_params{};
-    } else if (!nn_descent_conf.empty()) {
-      params.graph_build_params = cuvs::neighbors::graph_build_params::nn_descent_params{};
-    } else if (!ace_conf.empty()) {
-      params.graph_build_params = cuvs::neighbors::graph_build_params::ace_params{};
-    }
-    // else: leave as monostate → AUTO in cagra_build.cuh
-  }
-
   // Apply build-algo-specific parameters
   std::visit(
     [&](auto& arg) {
diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index 637e40c340..7c533d0737 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -146,11 +146,37 @@ enum class hnsw_heuristic_type : uint32_t {
   SAME_GRAPH_FOOTPRINT = 1
 };
 
+/**
+ * Sentinel marking an invalid / absent neighbor in a CAGRA graph. Variable-degree
+ * graphs (see index_params::variable_graph_degree_fraction) pad unused neighbor
+ * slots with this value, and consumers should treat it as "end of neighbor list".
+ */
+template <typename IdxT>
+constexpr static IdxT kInvalidNeighbor = static_cast<IdxT>(-1);
+
 struct index_params : cuvs::neighbors::index_params {
   /** Degree of input graph for pruning. */
   size_t intermediate_graph_degree = 128;
   /** Degree of output graph. */
   size_t graph_degree = 64;
+  /**
+   * Fraction of output graph_degree to define the minimum output graph degree,
+   * allowing variable-degree neighbor graphs.
+   *
+   * This fraction is used as the target for low-detour edges
+   * during the pruning step. Must be in (0, 1]. The default value of 1.0
+   * disables variable-degree logic (normal CAGRA behavior). Values < 1.0
+   * enable variable-degree graphs: the optimize step finds the minimum detour
+   * threshold that covers at least ceil(graph_degree * fraction) edges per node,
+   * then lets reverse edges expand the degree further. Unused slots are filled
+   * with a sentinel value (`kInvalidNeighbor`).
+   *
+   * This is intended for the CAGRA-to-HNSW conversion pipeline: the resulting
+   * graph, when imported into hnswlib, produces variable-degree neighbor lists
+   * similar to natively-built HNSW graphs. Do not use this with CAGRA's native
+   * GPU search.
+   */
+  double variable_graph_degree_fraction = 1.0;
   /**
    * Specify compression parameters if compression is desired. If set, overrides the
    * attach_dataset_on_build (and the compressed dataset is always added to the index).
diff --git a/cpp/src/neighbors/cagra.cpp b/cpp/src/neighbors/cagra.cpp
index 6aa5737e36..cd63de9a24 100644
--- a/cpp/src/neighbors/cagra.cpp
+++ b/cpp/src/neighbors/cagra.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -40,13 +40,15 @@ cagra::index_params index_params::from_hnsw_params(raft::matrix_extent<int64_t>
   cagra::index_params params;
   switch (heuristic) {
     case hnsw_heuristic_type::SAME_GRAPH_FOOTPRINT:
-      params.graph_degree              = M * 2;
-      params.intermediate_graph_degree = M * 3;
+      params.graph_degree                   = M * 2;
+      params.intermediate_graph_degree      = M * 3;
+      params.variable_graph_degree_fraction = 0.35;
       break;
     case hnsw_heuristic_type::SIMILAR_SEARCH_PERFORMANCE:
     default:
-      params.graph_degree              = 2 + M * 2 / 3;
-      params.intermediate_graph_degree = M + M * ef_construction / 256;
+      params.graph_degree                   = M;
+      params.intermediate_graph_degree      = M + M * ef_construction / 256;
+      params.variable_graph_degree_fraction = 0.7;
       break;
   }
   params.graph_build_params =
diff --git a/cpp/src/neighbors/cagra.cuh b/cpp/src/neighbors/cagra.cuh
index ee87c2c0ab..be789e04e8 100644
--- a/cpp/src/neighbors/cagra.cuh
+++ b/cpp/src/neighbors/cagra.cuh
@@ -259,9 +259,11 @@ void optimize(
   raft::resources const& res,
   raft::mdspan<IdxT, raft::matrix_extent<int64_t>, raft::row_major, g_accessor> knn_graph,
   raft::host_matrix_view<IdxT, int64_t, raft::row_major> new_graph,
-  const bool guarantee_connectivity = false)
+  const bool guarantee_connectivity           = false,
+  const double variable_graph_degree_fraction = 1.0)
 {
-  detail::optimize(res, knn_graph, new_graph, guarantee_connectivity);
+  detail::optimize(
+    res, knn_graph, new_graph, guarantee_connectivity, variable_graph_degree_fraction);
 }
 
 template <typename T,
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index a7c15b4161..904e575125 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -411,6 +411,12 @@ void ace_adjust_sub_graph_ids(
       size_t j = sub_search_graph(i, k);
       size_t j_original;
 
+      if (j == kInvalidNeighbor<IdxT>) {
+        // Variable-degree padding / invalid-neighbor sentinel: propagate as-is.
+        search_graph(i_original, k) = kInvalidNeighbor<IdxT>;
+        continue;
+      }
+
       if (j < core_sub_dataset_size) {
         // core partition neighbor: local → core reordered → original
         size_t j_reordered = j + core_partition_offsets(partition_id);
@@ -443,6 +449,11 @@ void ace_adjust_sub_graph_ids_disk(
   for (size_t i = 0; i < core_sub_dataset_size; i++) {
     for (size_t k = 0; k < graph_degree; k++) {
       size_t j = sub_search_graph(i, k);
+      if (j == kInvalidNeighbor<IdxT>) {
+        // Variable-degree padding / invalid-neighbor sentinel: propagate as-is.
+        sub_search_graph(i, k) = kInvalidNeighbor<IdxT>;
+        continue;
+      }
       if (j < core_sub_dataset_size) {
         // core partition neighbor: local → core reordered
         sub_search_graph(i, k) = j + core_partition_offsets(partition_id);
@@ -1930,7 +1941,8 @@ void optimize(
   raft::resources const& res,
   raft::mdspan<IdxT, raft::matrix_extent<int64_t>, raft::row_major, g_accessor> knn_graph,
   raft::host_matrix_view<IdxT, int64_t, raft::row_major> new_graph,
-  const bool guarantee_connectivity = false)
+  const bool guarantee_connectivity           = false,
+  const double variable_graph_degree_fraction = 1.0)
 {
   using internal_IdxT = typename std::make_unsigned<IdxT>::type;
 
@@ -1947,8 +1959,12 @@ void optimize(
       knn_graph.extent(0),
       knn_graph.extent(1));
 
-  cagra::detail::graph::optimize(
-    res, knn_graph_internal, new_graph_internal, guarantee_connectivity);
+  cagra::detail::graph::optimize(res,
+                                 knn_graph_internal,
+                                 new_graph_internal,
+                                 guarantee_connectivity,
+                                 true,
+                                 variable_graph_degree_fraction);
 }
 
 // RAII wrapper for allocating memory with Transparent HugePage
@@ -2168,8 +2184,11 @@ auto iterative_build_graph(
     auto next_graph_size = curr_query_size;
     cagra_graph          = raft::make_host_matrix<IdxT, int64_t>(0, 0);  // delete existing grahp
     cagra_graph = raft::make_host_matrix<IdxT, int64_t>(next_graph_size, next_graph_degree);
-    optimize<IdxT>(
-      res, neighbors_view, cagra_graph.view(), flag_last ? params.guarantee_connectivity : 0);
+    optimize<IdxT>(res,
+                   neighbors_view,
+                   cagra_graph.view(),
+                   flag_last ? params.guarantee_connectivity : false,
+                   flag_last ? params.variable_graph_degree_fraction : 1.0);
 
     auto end        = std::chrono::high_resolution_clock::now();
     auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
@@ -2289,7 +2308,11 @@ index<T, IdxT> build(
     cagra_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), graph_degree);
 
     RAFT_LOG_TRACE("optimizing graph");
-    optimize<IdxT>(res, knn_graph->view(), cagra_graph.view(), params.guarantee_connectivity);
+    optimize<IdxT>(res,
+                   knn_graph->view(),
+                   cagra_graph.view(),
+                   params.guarantee_connectivity,
+                   params.variable_graph_degree_fraction);
 
     // free intermediate graph before trying to create the index
     knn_graph.reset();
diff --git a/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp b/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp
index 3f79df47dd..f13f831eeb 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp
+++ b/cpp/src/neighbors/detail/cagra/cagra_helpers.cpp
@@ -65,6 +65,8 @@ std::tuple<size_t, size_t, size_t, size_t> optimize_workspace_size(size_t n_rows
 
   size_t prune_dev = n_rows * intermediate_degree * index_size;  // d_input_graph
   prune_dev += prune_dev_fixed;
+  // d_natural_degree (only allocated when variable_graph_degree_fraction < 1.0)
+  prune_dev += n_rows * sizeof(uint32_t);
 
   // Reverse graph stage memory
   size_t rev_dev = n_rows * graph_degree * index_size;  // d_rev_graph
diff --git a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh
index f106b82500..cc19867ed0 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh
@@ -198,8 +198,14 @@ void serialize_to_hnswlib(
   size_t bytes_written = 0;
   float GiB            = 1 << 30;
   for (std::size_t i = 0; i < index_.size(); i++) {
-    auto graph_degree = static_cast<int>(index_.graph_degree());
-    os.write(reinterpret_cast<char*>(&graph_degree), sizeof(int));
+    int actual_degree = static_cast<int>(index_.graph_degree());
+    for (int j = 0; j < actual_degree; j++) {
+      if (host_graph(i, j) == static_cast<IdxT>(-1)) {
+        actual_degree = j;
+        break;
+      }
+    }
+    os.write(reinterpret_cast<char*>(&actual_degree), sizeof(int));
 
     IdxT* graph_row = &host_graph(i, 0);
     os.write(reinterpret_cast<char*>(graph_row), sizeof(IdxT) * index_.graph_degree());
diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 52b4542798..b5e0fd6118 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
@@ -7,6 +7,8 @@
 #include "cagra_helpers.hpp"
 #include "utils.hpp"
 
+#include <cuvs/neighbors/cagra.hpp>
+
 #include <raft/core/copy.cuh>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_device_accessor.hpp>
@@ -14,6 +16,7 @@
 #include <raft/core/mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/linalg/reduce.cuh>
 #include <raft/matrix/init.cuh>
 
 // TODO: This shouldn't be invoking anything from spatial/knn
@@ -174,22 +177,34 @@ __global__ void kern_sort(const DATA_T* const dataset,  // [dataset_chunk_size,
   }
 }
 
-template <typename IdxT, typename OutputMatrixView>
+// `output_graph` may be either the full 2D graph ([graph_size, degree], indexed with column `k`)
+// or a single pre-extracted column passed as a 1D vector ([graph_size], indexed by row only). The
+// rank is inspected at compile time; `k` is always the logical column index and is used for the
+// natural-degree mask regardless of the view rank.
+//
+// When VariableDegree is true, edges past natural_degree(src_id) are masked out (do not contribute
+// to the reverse graph). natural_degree is otherwise unused and may be a default-constructed view.
+template <typename IdxT, bool VariableDegree, typename OutputView>
 __global__ void kern_make_rev_graph_k(
-  OutputMatrixView output_graph,                                // [graph_size, degree]
-  raft::device_matrix_view<IdxT, int64_t> rev_graph,            // [graph_size, degree]
+  OutputView output_graph,                            // [graph_size, degree] or [graph_size]
+  raft::device_matrix_view<IdxT, int64_t> rev_graph,  // [graph_size, degree]
   raft::device_vector_view<uint32_t, int64_t> rev_graph_count,  // [graph_size]
-  uint64_t k)
+  uint64_t k,
+  raft::device_vector_view<const uint32_t, int64_t> natural_degree)
 {
   const uint64_t tid  = threadIdx.x + (blockDim.x * blockIdx.x);
   const uint64_t tnum = blockDim.x * gridDim.x;
 
-  const uint64_t graph_size          = rev_graph.extent(0);
-  const uint32_t rev_graph_degree    = rev_graph.extent(1);
-  const uint32_t output_graph_degree = output_graph.extent(1);
+  const uint64_t graph_size       = rev_graph.extent(0);
+  const uint32_t rev_graph_degree = rev_graph.extent(1);
 
   for (uint64_t src_id = tid; src_id < graph_size; src_id += tnum) {
-    IdxT dest_id = output_graph(src_id, k);
+    IdxT dest_id;
+    if constexpr (OutputView::rank() == 2) {
+      dest_id = output_graph(src_id, k);
+    } else {
+      dest_id = output_graph(src_id);
+    }
     if (dest_id >= graph_size) continue;
 
     const uint32_t pos = atomicAdd(&rev_graph_count(dest_id), 1);
@@ -202,13 +217,26 @@ __global__ void kern_make_rev_graph_k(
 // layout_stride (the result of cuda::std::submdspan), and any accessor that is
 // device-accessible (default_accessor, raft::host_device_accessor with a
 // device-accessible memory_type, etc.).
-template <class IdxT, uint32_t num_warps, class KnnGraphView, class OutputGraphView>
+//
+// When VariableDegree is true, the kernel additionally writes a per-node
+// "natural degree" (count of low-detour edges) into d_natural_degree.
+// target_pruned_degree is the user-provided floor controlling where the natural
+// detour-band boundary is detected (see comments in optimize()). When
+// VariableDegree is false, d_natural_degree and target_pruned_degree are unused
+// and the kernel behaves identically to its pre-feature form.
+template <class IdxT,
+          uint32_t num_warps,
+          bool VariableDegree,
+          class KnnGraphView,
+          class OutputGraphView>
 __global__ void kern_fused_prune(KnnGraphView knn_graph,        // [graph_chunk_size, graph_degree]
                                  OutputGraphView output_graph,  // [batch_size, output_graph_degree]
                                  const uint32_t batch_size,
                                  const uint32_t batch_id,
                                  uint32_t* const d_invalid_neighbor_list,
-                                 uint64_t* const stats)
+                                 uint64_t* const stats,
+                                 raft::device_vector_view<uint32_t, int64_t> d_natural_degree,
+                                 const uint32_t target_pruned_degree)
 {
   // Check assumption we have at least one warp per row of the batch
   assert(blockDim.x == raft::WarpSize * num_warps);
@@ -297,6 +325,15 @@ __global__ void kern_fused_prune(KnnGraphView knn_graph,        // [graph_chunk_
   }
 #endif
 
+  // Detour level recorded at iteration (target_pruned_degree - 1). The natural-degree boundary is
+  // the first iteration `i >= target_pruned_degree` for which warp_min_count strictly exceeds
+  // this level. Initialized to maxval16 so the comparison below is well-defined before we reach
+  // the target index.
+  uint32_t target_detour_level = maxval16;
+  // Final natural degree to write back to global memory; the default `output_graph_degree` means
+  // "all edges fit under the target detour level" (i.e. no padding will be needed).
+  uint32_t natural_degree = output_graph_degree;
+
   for (uint32_t i = 0; i < output_graph_degree; i++) {
     uint32_t local_min = maxval16;
     uint32_t local_idx = maxval16;
@@ -317,6 +354,17 @@ __global__ void kern_fused_prune(KnnGraphView knn_graph,        // [graph_chunk_
       break;
     }
 
+    if constexpr (VariableDegree) {
+      if (i + 1 == target_pruned_degree) {
+        // Freeze the detour level after we've placed exactly target_pruned_degree edges.
+        target_detour_level = warp_min_count;
+      } else if (i >= target_pruned_degree && warp_min_count > target_detour_level &&
+                 natural_degree == output_graph_degree) {
+        // The detour level just rose above the target band. Record the natural degree once.
+        natural_degree = i;
+      }
+    }
+
     IdxT selected_node = smem_indices[warp_local_idx];
 
     for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) {
@@ -326,6 +374,10 @@ __global__ void kern_fused_prune(KnnGraphView knn_graph,        // [graph_chunk_
 
     if (lane_id == 0) { output_graph(nid_batch, i) = selected_node; }
   }
+
+  if constexpr (VariableDegree) {
+    if (lane_id == 0) { d_natural_degree(nid) = natural_degree; }
+  }
 }
 
 // Helper functions for merging the graph
@@ -367,8 +419,16 @@ __device__ void warp_shift_array_one_right(uint32_t lane_id, T* array, uint64_t
 // OutputGraphView, MstGraphView and MstNumEdgesView are 2D mdspans that may
 // have layout_right or layout_stride and any device-accessible accessor; see
 // the comment on kern_fused_prune for details.
+//
+// When VariableDegree is true, `d_natural_degree` (size graph_size) drives:
+//   * the per-node `effective_degree` used to cap `num_protected_edges`
+//   * the `kInvalidNeighbor` sentinel padding for slots past the final degree
+//   * the write-back of the final effective degree
+//   * popularity-based thinning of reverse-graph candidates (see the merge loop)
+// When VariableDegree is false, the parameter is unused (and may be a default view).
 template <typename IdxT,
           uint32_t num_warps,
+          bool VariableDegree,
           class OutputGraphView,
           class MstGraphView,
           class MstNumEdgesView>
@@ -381,7 +441,8 @@ __global__ void kern_merge_graph(
   const uint32_t batch_size,
   const uint32_t batch_id,
   bool guarantee_connectivity,
-  uint32_t* check_num_protected_edges)
+  raft::device_scalar_view<uint32_t> check_num_protected_edges,
+  raft::device_vector_view<uint32_t, int64_t> d_natural_degree)
 {
   // Check assumption we have at least one warp per row of the batch
   assert(blockDim.x == raft::WarpSize * num_warps);
@@ -405,6 +466,10 @@ __global__ void kern_merge_graph(
 
   if (nid >= graph_size) { return; }
 
+  // Per-node "in" degree before reverse-graph back-fill. In the constant-degree path this is just
+  // `output_graph_degree`. In the variable-degree path it comes from the prune step.
+  uint32_t effective_degree = VariableDegree ? d_natural_degree(nid) : output_graph_degree;
+
   const auto current_mst_graph_num_edges =
     guarantee_connectivity ? mst_graph_num_edges(nid_batch, 0) : 0;
   // If guarantee_connectivity == true, use a temporal list to merge the
@@ -442,13 +507,22 @@ __global__ void kern_merge_graph(
     __syncwarp();
   }
 
-  const auto num_protected_edges = max(current_mst_graph_num_edges, output_graph_degree / 2);
+  // In the variable-degree path, `num_protected_edges` is additionally capped by the per-node
+  // `effective_degree`: protecting more than the natural degree would prevent rev-graph edges
+  // from being inserted (since they always go into slot `num_protected_edges`) and would mask
+  // them with the prune-step contents that have not yet been validated as kept.
+  const auto num_protected_edges =
+    max(current_mst_graph_num_edges, min(effective_degree, output_graph_degree / 2));
 
-  if (num_protected_edges > output_graph_degree) {
-    check_num_protected_edges[0] = 0u;
+  if (num_protected_edges > effective_degree) {
+    check_num_protected_edges(0) = 0u;
     return;
   }
-  if (num_protected_edges == output_graph_degree) { return; }
+  // Variable-degree path always needs to write the -1 padding (and write-back natural_degree),
+  // so we cannot early-return like the constant-degree path does when there is no room to insert.
+  if constexpr (!VariableDegree) {
+    if (num_protected_edges == output_graph_degree) { return; }
+  }
 
   auto kr = min(rev_graph_count(nid), output_graph_degree);
 
@@ -456,6 +530,10 @@ __global__ void kern_merge_graph(
     kr -= 1;
     const auto rev_graph_value = rev_graph(nid, kr);
     if (rev_graph_value < graph_size) {
+      if constexpr (VariableDegree) {
+        const uint32_t in_degree = rev_graph_count(rev_graph_value);
+        if (in_degree < output_graph_degree) { continue; }
+      }
       uint64_t pos =
         warp_pos_in_array<IdxT>(rev_graph_value, smem_sorted_output_graph, output_graph_degree);
       if (pos < num_protected_edges) { continue; }
@@ -465,11 +543,26 @@ __global__ void kern_merge_graph(
         lane_id, smem_sorted_output_graph + num_protected_edges, num_shift);
       if (lane_id == 0) { smem_sorted_output_graph[num_protected_edges] = rev_graph_value; }
       __syncwarp();
+      // A new rev-graph edge has been inserted at position `num_protected_edges`, growing the
+      // node's effective degree up to (but not beyond) the full output_graph_degree. In the
+      // constant-degree path effective_degree already equals output_graph_degree, so this is a
+      // no-op and needs no VariableDegree guard.
+      if (effective_degree < output_graph_degree) { effective_degree++; }
     }
   }
 
+  // The write-back stays guarded: d_natural_degree is an empty view in the constant-degree path.
+  if constexpr (VariableDegree) {
+    if (lane_id == 0) { d_natural_degree(nid) = effective_degree; }
+  }
+
+  // Slots past the final effective degree are written with the kInvalidNeighbor sentinel that
+  // downstream consumers (HNSW serialization, out-of-range check) recognize as "end of neighbor
+  // list". In the constant-degree path effective_degree == output_graph_degree, so no slot is
+  // padded.
   for (uint32_t i = lane_id; i < output_graph_degree; i += raft::WarpSize) {
-    output_graph(nid_batch, i) = smem_sorted_output_graph[i];
+    output_graph(nid_batch, i) =
+      (i < effective_degree) ? smem_sorted_output_graph[i] : kInvalidNeighbor<IdxT>;
   }
 }
 
@@ -804,6 +897,10 @@ void check_duplicates_and_out_of_range(
     for (uint32_t j = 0; j < output_graph_degree; j++) {
       const auto neighbor_a = my_out_graph[j];
 
+      // The variable-degree path pads unused slots with kInvalidNeighbor. Without this guard the
+      // sentinel would (correctly) trigger the `>= graph_size` out-of-range check below.
+      if (neighbor_a == kInvalidNeighbor<IdxT>) { continue; }
+
       if (neighbor_a >= graph_size) {
         num_oor++;
         continue;
@@ -824,7 +921,7 @@ void check_duplicates_and_out_of_range(
     num_oor == 0, "%lu out-of-range index node(s) are found in the generated CAGRA graph", num_oor);
 }
 
-template <typename IdxT, typename AccessorOutputGraph>
+template <typename IdxT, typename AccessorOutputGraph, bool VariableDegree>
 void merge_graph_gpu(
   raft::resources const& res,
   raft::mdspan<IdxT, raft::matrix_extent<int64_t>, raft::row_major, AccessorOutputGraph>
@@ -833,7 +930,8 @@ void merge_graph_gpu(
   raft::device_vector_view<uint32_t, int64_t> d_rev_graph_count,
   raft::host_matrix_view<IdxT, int64_t> mst_graph,
   raft::host_vector_view<uint32_t, int64_t> mst_graph_num_edges,
-  bool guarantee_connectivity)
+  bool guarantee_connectivity,
+  raft::device_vector_view<uint32_t, int64_t> d_natural_degree)
 {
   const uint64_t graph_size          = output_graph.extent(0);
   const uint64_t output_graph_degree = output_graph.extent(1);
@@ -888,7 +986,7 @@ void merge_graph_gpu(
     auto mst_graph_view           = (*d_mst_graph).view();
     auto mst_graph_num_edges_view = (*d_mst_graph_num_edges).view();
     auto output_view              = (*d_output_graph).view();
-    kern_merge_graph<IdxT, num_warps>
+    kern_merge_graph<IdxT, num_warps, VariableDegree>
       <<<blocks_merge, threads_merge, merge_smem_size, raft::resource::get_cuda_stream(res)>>>(
         output_view,
         d_rev_graph,
@@ -898,7 +996,8 @@ void merge_graph_gpu(
         batch_size,
         i_batch,
         guarantee_connectivity,
-        d_check_num_protected_edges.data_handle());
+        d_check_num_protected_edges.view(),
+        d_natural_degree);
 
     d_output_graph.prefetch_next_batch();
     d_mst_graph.prefetch_next_batch();
@@ -923,13 +1022,17 @@ void merge_graph_gpu(
                  (merge_graph_end - merge_graph_start) * 1000.0);
 }
 
-template <typename IdxT, typename AccessorOutputGraph>
+// When VariableDegree is true, `d_natural_degree` (size graph_size) gates which slots of the
+// output graph contribute to the reverse graph: only positions [0, natural_degree(i)) of node i
+// are considered, the rest are skipped.
+template <typename IdxT, typename AccessorOutputGraph, bool VariableDegree>
 void make_reverse_graph_gpu(
   raft::resources const& res,
   raft::mdspan<IdxT, raft::matrix_extent<int64_t>, raft::row_major, AccessorOutputGraph>
     output_graph,
   raft::device_matrix_view<IdxT, int64_t> d_rev_graph,
-  raft::device_vector_view<uint32_t, int64_t> d_rev_graph_count)
+  raft::device_vector_view<uint32_t, int64_t> d_rev_graph_count,
+  raft::device_vector_view<uint32_t, int64_t> d_natural_degree)
 {
   const uint64_t graph_size          = output_graph.extent(0);
   const uint64_t output_graph_degree = output_graph.extent(1);
@@ -940,7 +1043,7 @@ void make_reverse_graph_gpu(
   //
   // Make reverse graph
   //
-  raft::matrix::fill(res, d_rev_graph, IdxT(-1));
+  raft::matrix::fill(res, d_rev_graph, kInvalidNeighbor<IdxT>);
   raft::matrix::fill(res, d_rev_graph_count, uint32_t(0));
 
   if constexpr (AccessorOutputGraph::is_device_accessible) {
@@ -948,12 +1051,21 @@ void make_reverse_graph_gpu(
     dim3 threads(256, 1, 1);
     dim3 blocks(1024, 1, 1);
     for (uint64_t k = 0; k < output_graph_degree; k++) {
-      kern_make_rev_graph_k<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
-        output_graph, d_rev_graph, d_rev_graph_count, k);
+      kern_make_rev_graph_k<IdxT, VariableDegree>
+        <<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
+          output_graph,
+          d_rev_graph,
+          d_rev_graph_count,
+          k,
+          raft::make_const_mdspan(d_natural_degree));
     }
   } else {
-    auto d_dest_nodes = raft::make_device_matrix<IdxT, int64_t>(res, graph_size, 1);
-    auto dest_nodes   = raft::make_host_vector<IdxT, int64_t>(graph_size);
+    // Host variant: the output graph is host-only, so we extract one column at a time into a
+    // device vector and hand that to the kernel. The natural-degree mask is applied inside the
+    // kernel using the real column index `k` (the kernel indexes the 1D `d_dest_nodes` by row).
+    auto d_dest_nodes = raft::make_device_vector<IdxT, int64_t>(res, graph_size);
+    auto dest_nodes   = raft::make_host_vector<IdxT, int64_t>(res, graph_size);
+
     for (uint64_t k = 0; k < output_graph_degree; k++) {
 #pragma omp parallel for
       for (uint64_t i = 0; i < graph_size; i++) {
@@ -963,8 +1075,13 @@ void make_reverse_graph_gpu(
 
       dim3 threads(256, 1, 1);
       dim3 blocks(1024, 1, 1);
-      kern_make_rev_graph_k<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
-        d_dest_nodes.view(), d_rev_graph, d_rev_graph_count, 0);
+      kern_make_rev_graph_k<IdxT, VariableDegree>
+        <<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
+          d_dest_nodes.view(),
+          d_rev_graph,
+          d_rev_graph_count,
+          k,
+          raft::make_const_mdspan(d_natural_degree));
       raft::resource::sync_stream(res);
       RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u    \r", k, output_graph_degree);
     }
@@ -1592,12 +1709,17 @@ void mst_optimization(
 // specified number of edges are picked up for each node, starting with the edge with
 // the lowest number of 2-hop detours.
 //
-template <typename IdxT, typename AccessorKnnGraph, typename AccessorOutputGraph>
+template <typename IdxT,
+          typename AccessorKnnGraph,
+          typename AccessorOutputGraph,
+          bool VariableDegree>
 void prune_graph_gpu(
   raft::resources const& res,
   raft::mdspan<IdxT, raft::matrix_extent<int64_t>, raft::row_major, AccessorKnnGraph> knn_graph,
   raft::mdspan<IdxT, raft::matrix_extent<int64_t>, raft::row_major, AccessorOutputGraph>
-    output_graph)
+    output_graph,
+  raft::device_vector_view<uint32_t, int64_t> d_natural_degree,
+  uint32_t target_pruned_degree)
 {
   const uint64_t graph_size          = output_graph.extent(0);
   const uint64_t knn_graph_degree    = knn_graph.extent(1);
@@ -1652,14 +1774,16 @@ void prune_graph_gpu(
 
   for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
     auto output_view = (*d_output_graph).view();
-    kern_fused_prune<IdxT, num_warps>
+    kern_fused_prune<IdxT, num_warps, VariableDegree>
       <<<blocks_prune, threads_prune, prune_smem_size, raft::resource::get_cuda_stream(res)>>>(
         input_view,
         output_view,
         batch_size,
         i_batch,
         d_invalid_neighbor_list.data_handle(),
-        dev_stats.data_handle());
+        dev_stats.data_handle(),
+        d_natural_degree,
+        target_pruned_degree);
 
     d_output_graph.prefetch_next_batch();
     ++d_output_graph;
@@ -1707,8 +1831,9 @@ void optimize(
   raft::resources const& res_const,
   raft::mdspan<IdxT, raft::matrix_extent<int64_t>, raft::row_major, AccessorKnnGraph> knn_graph,
   raft::mdspan<IdxT, raft::matrix_extent<int64_t>, raft::row_major, AccessorOutputGraph> new_graph,
-  const bool guarantee_connectivity       = true,
-  const bool use_gpu_for_mst_optimization = true)
+  const bool guarantee_connectivity           = true,
+  const bool use_gpu_for_mst_optimization     = true,
+  const double variable_graph_degree_fraction = 1.0)
 {
   RAFT_LOG_DEBUG(
     "# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));
@@ -1735,13 +1860,32 @@ void optimize(
   RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1),
                "output graph cannot have more columns than input graph");
   // const uint64_t input_graph_degree  = knn_graph.extent(1);
-  const uint64_t knn_graph_degree    = knn_graph.extent(1);
-  const uint64_t output_graph_degree = new_graph.extent(1);
-  const uint64_t graph_size          = new_graph.extent(0);
+  const uint64_t knn_graph_degree     = knn_graph.extent(1);
+  const uint64_t output_graph_degree  = new_graph.extent(1);
+  const uint64_t graph_size           = new_graph.extent(0);
+  const uint64_t target_pruned_degree = std::max<uint64_t>(
+    1, static_cast<uint64_t>(std::ceil(output_graph_degree * variable_graph_degree_fraction)));
+  const bool variable_graph_degree = (target_pruned_degree < output_graph_degree);
+
+  if (variable_graph_degree) {
+    RAFT_LOG_INFO("# Pruning kNN graph (size=%lu, degree=%lu, target_pruned_degree=%lu)",
+                  graph_size,
+                  knn_graph_degree,
+                  target_pruned_degree);
+  }
 
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "cagra::graph::optimize(%zu, %zu, %u)", graph_size, knn_graph_degree, output_graph_degree);
 
+  // Per-node "natural" degree produced by the prune step. Only allocated when the variable-degree
+  // path is enabled. The prune kernel records here the count of edges with detour level <=
+  // detour level at position (target_pruned_degree - 1). The merge kernel then both reads and
+  // updates this array (after rev-graph back-fill the value becomes the final effective degree).
+  auto d_natural_degree = raft::make_device_mdarray<uint32_t>(
+    res,
+    default_ws_mr,
+    raft::make_extents<int64_t>(variable_graph_degree ? static_cast<int64_t>(graph_size) : 0));
+
   // MST optimization
   // currently, only using GPU path for MST optimization
   int64_t mst_graph_size = guarantee_connectivity ? graph_size : 0;
@@ -1765,7 +1909,13 @@ void optimize(
 
   // prune graph -- will always use GPU path
   {
-    prune_graph_gpu<IdxT>(res, knn_graph, new_graph);
+    if (variable_graph_degree) {
+      prune_graph_gpu<IdxT, AccessorKnnGraph, AccessorOutputGraph, true>(
+        res, knn_graph, new_graph, d_natural_degree.view(), target_pruned_degree);
+    } else {
+      prune_graph_gpu<IdxT, AccessorKnnGraph, AccessorOutputGraph, false>(
+        res, knn_graph, new_graph, d_natural_degree.view(), target_pruned_degree);
+    }
   }
 
   // reverse graph creation will always use the GPU / large workspace resource
@@ -1778,7 +1928,13 @@ void optimize(
 
   const double time_make_start = cur_time();
 
-  make_reverse_graph_gpu<IdxT>(res, new_graph, d_rev_graph.view(), d_rev_graph_count.view());
+  if (variable_graph_degree) {
+    make_reverse_graph_gpu<IdxT, AccessorOutputGraph, true>(
+      res, new_graph, d_rev_graph.view(), d_rev_graph_count.view(), d_natural_degree.view());
+  } else {
+    make_reverse_graph_gpu<IdxT, AccessorOutputGraph, false>(
+      res, new_graph, d_rev_graph.view(), d_rev_graph_count.view(), d_natural_degree.view());
+  }
 
   raft::resource::sync_stream(res);
 
@@ -1788,17 +1944,52 @@ void optimize(
 
   // merge graph -- will always use GPU path
   {
-    merge_graph_gpu<IdxT>(res,
-                          new_graph,
-                          d_rev_graph.view(),
-                          d_rev_graph_count.view(),
-                          mst_graph.view(),
-                          mst_graph_num_edges.view(),
-                          guarantee_connectivity);
+    if (variable_graph_degree) {
+      merge_graph_gpu<IdxT, AccessorOutputGraph, true>(res,
+                                                       new_graph,
+                                                       d_rev_graph.view(),
+                                                       d_rev_graph_count.view(),
+                                                       mst_graph.view(),
+                                                       mst_graph_num_edges.view(),
+                                                       guarantee_connectivity,
+                                                       d_natural_degree.view());
+    } else {
+      merge_graph_gpu<IdxT, AccessorOutputGraph, false>(res,
+                                                        new_graph,
+                                                        d_rev_graph.view(),
+                                                        d_rev_graph_count.view(),
+                                                        mst_graph.view(),
+                                                        mst_graph_num_edges.view(),
+                                                        guarantee_connectivity,
+                                                        d_natural_degree.view());
+    }
+  }
+
+  auto d_avg_natural = raft::make_device_scalar<double>(res, 0);
+  double avg_natural = 0;
+  if (variable_graph_degree) {
+    // main_op: cast each degree to double (innermost op, which also drops the reduction-index
+    // argument), then divide by graph_size, so the reduction accumulates the mean directly.
+    auto normalize_mean = raft::compose_op{
+      raft::div_const_op<double>{static_cast<double>(graph_size)}, raft::cast_op<double>{}};
+    raft::linalg::reduce<raft::Apply::ALONG_ROWS>(
+      res,
+      raft::make_device_matrix_view<const uint32_t, int64_t, raft::row_major>(
+        d_natural_degree.data_handle(), int64_t{1}, static_cast<int64_t>(graph_size)),
+      raft::make_device_vector_view<double, int64_t>(d_avg_natural.data_handle(), int64_t{1}),
+      0.0,
+      false,
+      normalize_mean);
+    raft::copy(res, raft::make_host_scalar_view(&avg_natural), d_avg_natural.view());
   }
 
   raft::resource::sync_stream(res);
 
+  if (variable_graph_degree) {
+    RAFT_LOG_INFO(
+      "# Variable graph degree: avg natural degree = %.2f / %lu", avg_natural, output_graph_degree);
+  }
+
   // These host-side checks are expensive (O(N*D^2)) and only used as debug
   // diagnostics, so only run them when debug logging is active at runtime.
   if (raft::default_logger().should_log(rapids_logger::level_enum::debug)) {
diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp
index 88580de929..3031b739ee 100644
--- a/cpp/src/neighbors/detail/hnsw.hpp
+++ b/cpp/src/neighbors/detail/hnsw.hpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -310,12 +310,16 @@ std::enable_if_t<hierarchy == HnswHierarchy::CPU, std::unique_ptr<index<T>>> fro
   for (size_t i = 0; i < static_cast<size_t>(host_graph_view.extent(0)); ++i) {
     auto hnsw_internal_id = appr_algo->label_lookup_.find(i)->second;
     auto ll_i             = appr_algo->get_linklist0(hnsw_internal_id);
-    appr_algo->setListCount(ll_i, host_graph_view.extent(1));
-    auto* data = (uint32_t*)(ll_i + 1);
+    size_t actual_count   = 0;
+    auto* data            = (uint32_t*)(ll_i + 1);
     for (size_t j = 0; j < static_cast<size_t>(host_graph_view.extent(1)); ++j) {
-      auto neighbor_internal_id = appr_algo->label_lookup_.find(host_graph(i, j))->second;
-      data[j]                   = neighbor_internal_id;
+      auto neighbor_id = host_graph(i, j);
+      if (neighbor_id == static_cast<uint32_t>(-1)) { break; }
+      auto neighbor_internal_id = appr_algo->label_lookup_.find(neighbor_id)->second;
+      data[actual_count]        = neighbor_internal_id;
+      actual_count++;
     }
+    appr_algo->setListCount(ll_i, actual_count);
   }
 
   hnsw_index->set_index(std::move(appr_algo));
@@ -543,9 +547,17 @@ void serialize_to_hnswlib_batched(raft::resources const& res,
     for (int64_t batch_idx = 0; batch_idx < current_batch_size; batch_idx++) {
       const int64_t i = batch_start + batch_idx;
 
-      os.write(reinterpret_cast<char*>(&graph_degree_int), sizeof(int));
-
+      // Variable-degree graphs pad unused neighbor slots with kInvalidNeighbor; the per-node
+      // hnswlib link-list size is the number of valid neighbors before the first sentinel.
       const IdxT* graph_row = &graph_buffer(batch_idx, 0);
+      int actual_degree_int = graph_degree_int;
+      for (int gj = 0; gj < graph_degree_int; gj++) {
+        if (graph_row[gj] == cuvs::neighbors::cagra::kInvalidNeighbor<IdxT>) {
+          actual_degree_int = gj;
+          break;
+        }
+      }
+      os.write(reinterpret_cast<char*>(&actual_degree_int), sizeof(int));
       os.write(reinterpret_cast<const char*>(graph_row), sizeof(IdxT) * graph_degree_int);
 
       if (odd_graph_degree) {
@@ -1123,15 +1135,22 @@ std::enable_if_t<hierarchy == HnswHierarchy::GPU, std::unique_ptr<index<T>>> fro
   // iterate over the points in the descending order of their levels
   for (size_t pt_level = hist.size() - 1; pt_level >= 1; pt_level--) {
     common::nvtx::range<common::nvtx::domain::cuvs> level_scope("level %zu", pt_level);
-    auto start_idx     = offsets[pt_level - 1];
-    auto end_idx       = offsets[hist.size() - 1];
-    auto num_pts       = end_idx - start_idx;
-    auto neighbor_size = num_pts > appr_algo->M_ ? appr_algo->M_ : num_pts - 1;
+    auto start_idx = offsets[pt_level - 1];
+    auto end_idx   = offsets[hist.size() - 1];
+    auto num_pts   = end_idx - start_idx;
     if (num_pts <= 1) {
       // this means only 1 point in the level
       continue;
     }
 
+    // Final per-layer degree (capped at the upper-layer limit maxM_ == M_) and the
+    // denser intermediate kNN degree we prune from (capped at maxM0_ == 2*M_). This
+    // mirrors CAGRA's own build_knn_graph -> optimize pipeline: build a richer kNN
+    // graph, then prune detourable edges and add reverse edges down to the target
+    // degree. Both are clamped so a tiny top layer is at most fully connected.
+    const int64_t knn_degree = std::min<int64_t>(appr_algo->maxM0_, num_pts - 1);
+    const int64_t out_degree = std::min<int64_t>(appr_algo->maxM_, knn_degree);
+
     // gather points from dataset to form query set on host
     auto host_query_set = raft::make_host_matrix<T, int64_t>(num_pts, dim);
     // TODO: Use `raft::matrix::gather` when available as a public API
@@ -1145,11 +1164,21 @@ std::enable_if_t<hierarchy == HnswHierarchy::GPU, std::unique_ptr<index<T>>> fro
     }
 
     // find neighbors of the query set
-    auto host_neighbors = raft::make_host_matrix<uint32_t, int64_t>(num_pts, neighbor_size);
-    all_neighbors_graph(res,
-                        raft::make_const_mdspan(host_query_set.view()),
-                        host_neighbors.view(),
-                        cagra_index.metric());
+    auto host_neighbors = raft::make_host_matrix<uint32_t, int64_t>(num_pts, out_degree);
+    if (knn_degree > out_degree) {
+      // Build a denser intermediate kNN graph, then prune it down to out_degree with
+      // variable degree disabled (mirrors CAGRA's build_knn_graph -> optimize). The
+      // constant-degree optimize output is full fixed degree with no invalid padding.
+      auto host_knn = raft::make_host_matrix<uint32_t, int64_t>(num_pts, knn_degree);
+      all_neighbors_graph(
+        res, raft::make_const_mdspan(host_query_set.view()), host_knn.view(), cagra_index.metric());
+      cuvs::neighbors::cagra::helpers::optimize(res, host_knn.view(), host_neighbors.view());
+    } else {
+      all_neighbors_graph(res,
+                          raft::make_const_mdspan(host_query_set.view()),
+                          host_neighbors.view(),
+                          cagra_index.metric());
+    }
 
     {
       common::nvtx::range<common::nvtx::domain::cuvs> copy_scope(
@@ -1186,12 +1215,16 @@ std::enable_if_t<hierarchy == HnswHierarchy::GPU, std::unique_ptr<index<T>>> fro
     common::nvtx::range<common::nvtx::domain::cuvs> copy_scope("get_linklist0<host>");
 #pragma omp parallel for num_threads(num_threads)
     for (int64_t i = 0; i < n_rows; i++) {
-      auto ll_i = appr_algo->get_linklist0(i);
-      appr_algo->setListCount(ll_i, degree);
-      auto* data = (uint32_t*)(ll_i + 1);
+      auto ll_i            = appr_algo->get_linklist0(i);
+      auto* data           = (uint32_t*)(ll_i + 1);
+      int64_t actual_count = 0;
       for (int64_t j = 0; j < degree; j++) {
-        data[j] = graph_ptr[i * degree + j];
+        auto neighbor_id = graph_ptr[i * degree + j];
+        if (neighbor_id == static_cast<uint32_t>(-1)) { break; }
+        data[actual_count] = neighbor_id;
+        actual_count++;
       }
+      appr_algo->setListCount(ll_i, actual_count);
     }
   } else {
     common::nvtx::range<common::nvtx::domain::cuvs> copy_scope("get_linklist0<device>");
@@ -1203,11 +1236,20 @@ std::enable_if_t<hierarchy == HnswHierarchy::GPU, std::unique_ptr<index<T>>> fro
                                     n_rows,
                                     cudaMemcpyDefault,
                                     raft::resource::get_cuda_stream(res)));
+    raft::resource::sync_stream(res);
 #pragma omp parallel for num_threads(num_threads)
     for (int64_t i = 0; i < n_rows; i++) {
-      appr_algo->setListCount(appr_algo->get_linklist0(i), degree);
+      auto ll_i            = appr_algo->get_linklist0(i);
+      auto* data           = (uint32_t*)(ll_i + 1);
+      int64_t actual_count = degree;
+      for (int64_t j = 0; j < degree; j++) {
+        if (data[j] == static_cast<uint32_t>(-1)) {
+          actual_count = j;
+          break;
+        }
+      }
+      appr_algo->setListCount(ll_i, actual_count);
     }
-    raft::resource::sync_stream(res);
   }
   hnsw_index->set_index(std::move(appr_algo));
   return hnsw_index;