From 52997da2051966e63c2ae480be3f33e7a4e7921d Mon Sep 17 00:00:00 2001
From: Julian Miller <julianmiller@nvidia.com>
Date: Tue, 26 May 2026 08:32:53 +0200
Subject: [PATCH 1/6] Add HNSW layered hierarchy

---
 cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu  |   11 +-
 .../ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h |   30 +-
 cpp/include/cuvs/neighbors/hnsw.hpp           |   18 +-
 cpp/src/neighbors/detail/hnsw.hpp             | 1125 ++++++++++++++++-
 cpp/tests/neighbors/ann_hnsw_ace.cuh          |  257 ++++
 .../ann_hnsw_ace/test_float_uint32_t.cu       |   10 +
 .../ann_hnsw_ace/test_half_uint32_t.cu        |   20 +
 .../ann_hnsw_ace/test_int8_t_uint32_t.cu      |   20 +
 .../ann_hnsw_ace/test_uint8_t_uint32_t.cu     |   20 +
 examples/cpp/CMakeLists.txt                   |    2 +
 examples/cpp/src/hnsw_ace_layered_example.cu  |  278 ++++
 11 files changed, 1723 insertions(+), 68 deletions(-)
 create mode 100644 examples/cpp/src/hnsw_ace_layered_example.cu
diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
index 26028b6d98..b535ced763 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
+++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
@@ -1,9 +1,10 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
 #include "../common/ann_types.hpp"
+#include "../common/conf.hpp"
 #include "cuvs_ann_bench_param_parser.h"
 #include "cuvs_cagra_hnswlib_wrapper.h"
 
@@ -27,6 +28,9 @@ auto parse_build_param(const nlohmann::json& conf) ->
       hnsw_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::CPU;
     } else if (conf.at("hierarchy") == "gpu") {
       hnsw_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::GPU;
+    } else if (conf.at("hierarchy") == "gpu_layered_on_disk" ||
+               conf.at("hierarchy") == "gpu_layered" || conf.at("hierarchy") == "layered") {
+      hnsw_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK;
     } else {
       THROW("Invalid value for hierarchy: %s", conf.at("hierarchy").get<std::string>().c_str());
     }
@@ -36,6 +40,11 @@ auto parse_build_param(const nlohmann::json& conf) ->
   if (conf.contains("ef_construction")) {
     hnsw_params.ef_construction = conf.at("ef_construction");
   }
+  if (conf.contains("dataset_path")) {
+    hnsw_params.dataset_path = conf.at("dataset_path");
+  } else if (hnsw_params.hierarchy == cuvs::neighbors::hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK) {
+    hnsw_params.dataset_path = configuration::singleton().get_dataset_conf().base_file;
+  }
   if (conf.contains("num_threads")) { hnsw_params.num_threads = conf.at("num_threads"); }
 
   // Reuse the CAGRA wrapper params parser
diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
index 2f0c54e1bd..0104ccdb3e 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
@@ -9,10 +9,26 @@
 #include <raft/core/logger.hpp>
 
 #include <chrono>
+#include <filesystem>
 #include <memory>
 
 namespace cuvs::bench {
 
+inline void copy_file_overwrite(const std::filesystem::path& src, const std::filesystem::path& dst)
+{
+  std::error_code ec;
+  if (src == dst ||
+      (std::filesystem::exists(dst, ec) && std::filesystem::equivalent(src, dst, ec))) {
+    return;
+  }
+  if (!dst.parent_path().empty()) { std::filesystem::create_directories(dst.parent_path()); }
+
+  std::filesystem::copy_file(src, dst, std::filesystem::copy_options::overwrite_existing, ec);
+  const auto src_str = src.string();
+  const auto dst_str = dst.string();
+  RAFT_EXPECTS(!ec, "Failed to copy '%s' to '%s'.", src_str.c_str(), dst_str.c_str());
+}
+
 template <typename T, typename IdxT>
 class cuvs_cagra_hnswlib : public algo<T>, public algo_gpu {
  public:
@@ -130,6 +146,18 @@ void cuvs_cagra_hnswlib<T, IdxT>::set_search_param(const search_param_base& para
 template <typename T, typename IdxT>
 void cuvs_cagra_hnswlib<T, IdxT>::save(const std::string& file) const
 {
+  if (build_param_.hnsw_index_params.hierarchy ==
+      cuvs::neighbors::hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK) {
+    const auto src_artifact = std::filesystem::path(hnsw_index_->file_path());
+    RAFT_EXPECTS(!src_artifact.empty(), "Layered HNSW artifact path is not available.");
+    RAFT_EXPECTS(std::filesystem::exists(src_artifact),
+                 "Layered HNSW artifact '%s' does not exist.",
+                 src_artifact.c_str());
+
+    copy_file_overwrite(src_artifact, std::filesystem::path(file));
+    return;
+  }
+
   if (cagra_ace_build_) {
     std::string index_filename = hnsw_index_->file_path();
     RAFT_EXPECTS(!index_filename.empty(), "HNSW index file path is not available.");
diff --git a/cpp/include/cuvs/neighbors/hnsw.hpp b/cpp/include/cuvs/neighbors/hnsw.hpp
index fb726fed71..5800e05a96 100644
--- a/cpp/include/cuvs/neighbors/hnsw.hpp
+++ b/cpp/include/cuvs/neighbors/hnsw.hpp
@@ -20,6 +20,7 @@
 #include <cstdint>
 #include <cuvs/core/export.hpp>
 #include <memory>
+#include <string>
 #include <type_traits>
 #include <variant>
 
@@ -41,9 +42,10 @@ namespace graph_build_params = cuvs::neighbors::graph_build_params;
  * NOTE: When the value is `NONE`, the HNSW index is built as a base-layer-only index.
  */
 enum class HnswHierarchy {
-  NONE,  // base-layer-only index
-  CPU,   // full index with CPU-built hierarchy
-  GPU    // full index with GPU-built hierarchy
+  NONE,                // base-layer-only index
+  CPU,                 // full index with CPU-built hierarchy
+  GPU,                 // full index with GPU-built hierarchy
+  GPU_LAYERED_ON_DISK  // GPU-built hierarchy stored as layered on-disk topology
 };
 
 struct index_params : cuvs::neighbors::index_params {
@@ -64,6 +66,16 @@ struct index_params : cuvs::neighbors::index_params {
    */
   size_t M = 32;
 
+  /** Local dataset path used by layered HNSW deserialization.
+   *
+   * When `hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK`, the index artifact stores graph
+   * topology only. `deserialize` loads vectors from this local dataset path to reconstruct an
+   * in-memory HNSW index.
+   * Currently supported local dataset formats are `.npy` and ANN benchmark `*.bin` files with a
+   * `[uint32 rows, uint32 cols]` header.
+   */
+  std::string dataset_path;
+
   /** Parameters to fine tune GPU graph building. By default we select the parameters based on
    * dataset shape and HNSW build parameters. You can override these parameters to fine tune the
    * graph building process as described in the CAGRA build docs.
diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp
index 4914a0fa1b..44dc4e688e 100644
--- a/cpp/src/neighbors/detail/hnsw.hpp
+++ b/cpp/src/neighbors/detail/hnsw.hpp
@@ -23,15 +23,28 @@
 #include <hnswlib/hnswalg.h>
 #include <hnswlib/hnswlib.h>
 
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
 #include <fcntl.h>
 #include <filesystem>
 #include <fstream>
+#include <limits>
 #include <memory>
 #include <new>
+#include <numeric>
+#include <optional>
 #include <random>
+#include <sstream>
+#include <stdexcept>
 #include <sys/mman.h>
 #include <thread>
 #include <unistd.h>
+#include <utility>
+#include <vector>
 
 namespace cuvs::neighbors::hnsw::detail {
 
@@ -166,6 +179,9 @@ struct index_impl : index<T> {
     RAFT_LOG_INFO("Loading HNSW index from disk: %s", filepath.c_str());
 
     try {
+      RAFT_EXPECTS(this->hierarchy() != HnswHierarchy::GPU_LAYERED_ON_DISK,
+                   "Layered HNSW indexes must be loaded with hnsw::deserialize so a local dataset "
+                   "can be provided through index_params.dataset_path.");
       appr_alg_ = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
         space_.get(), filepath);
       if (this->hierarchy() == HnswHierarchy::NONE) { appr_alg_->base_layer_only = true; }
@@ -341,6 +357,796 @@ void all_neighbors_graph(raft::resources const& res,
   }
 }
 
+struct hnsw_level_plan {
+  size_t n_rows = 0;
+  std::vector<size_t> hist;
+  // Bucket-end offsets after order construction. Level L starts at offsets[L - 1].
+  std::vector<size_t> offsets;
+  std::vector<size_t> order;
+  std::vector<size_t> order_bw;
+  std::vector<uint8_t> levels;
+
+  [[nodiscard]] auto max_level() const -> int
+  {
+    return hist.empty() ? 0 : static_cast<int>(hist.size() - 1);
+  }
+
+  [[nodiscard]] auto promoted_count() const -> size_t
+  {
+    return hist.empty() ? 0 : n_rows - hist[0];
+  }
+};
+
+inline auto hnsw_upper_layer_degree(size_t row_count, size_t M) -> size_t
+{
+  return row_count > M ? M : row_count > 0 ? row_count - 1 : 0;
+}
+
+inline auto elapsed_ms_since(std::chrono::steady_clock::time_point start) -> int64_t
+{
+  return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() -
+                                                               start)
+    .count();
+}
+
+inline auto to_gib(size_t bytes) -> double
+{
+  return static_cast<double>(bytes) / (1024.0 * 1024.0 * 1024.0);
+}
+
+inline auto throughput_gib_per_s(size_t bytes, int64_t elapsed_ms) -> double
+{
+  return elapsed_ms > 0 ? to_gib(bytes) / (elapsed_ms / 1000.0) : 0.0;
+}
+
+inline auto make_hnsw_level_plan_from_levels(size_t n_rows,
+                                             std::vector<uint8_t>&& levels,
+                                             bool build_reverse_order,
+                                             const char* log_prefix = nullptr) -> hnsw_level_plan
+{
+  hnsw_level_plan plan;
+  plan.n_rows = n_rows;
+  plan.levels = std::move(levels);
+  RAFT_EXPECTS(n_rows > 0, "HNSW hierarchy requires at least one row");
+  RAFT_EXPECTS(plan.levels.size() == n_rows,
+               "HNSW level count (%zu) must match row count (%zu)",
+               plan.levels.size(),
+               n_rows);
+
+  for (auto level : plan.levels) {
+    while (static_cast<size_t>(level) >= plan.hist.size()) {
+      plan.hist.push_back(0);
+    }
+    plan.hist[level]++;
+  }
+
+  plan.offsets.resize(plan.hist.size() + 1, 0);
+  for (size_t i = 0; i < plan.hist.size() - 1; ++i) {
+    plan.offsets[i + 1] = plan.offsets[i] + plan.hist[i];
+    if (log_prefix != nullptr) {
+      RAFT_LOG_INFO("%s%zu : %zu", log_prefix, i + 1, n_rows - plan.offsets[i + 1]);
+    }
+  }
+
+  plan.order.resize(n_rows);
+  if (build_reverse_order) { plan.order_bw.resize(n_rows); }
+  for (size_t i = 0; i < n_rows; ++i) {
+    const auto level = static_cast<size_t>(plan.levels[i]);
+    if (build_reverse_order) { plan.order_bw[i] = plan.offsets[level]; }
+    plan.order[plan.offsets[level]++] = i;
+  }
+
+  return plan;
+}
+
+template <typename HnswAlgo>
+auto make_random_hnsw_level_plan(size_t n_rows, HnswAlgo& appr_algo, const char* log_prefix)
+  -> hnsw_level_plan
+{
+  std::vector<uint8_t> levels(n_rows);
+  for (int64_t i = 0; i < static_cast<int64_t>(n_rows); ++i) {
+    const auto pt_level = appr_algo.getRandomLevel(appr_algo.mult_);
+    RAFT_EXPECTS(pt_level <= std::numeric_limits<uint8_t>::max(),
+                 "HNSW serialization only supports levels up to %u",
+                 static_cast<unsigned int>(std::numeric_limits<uint8_t>::max()));
+    levels[i] = static_cast<uint8_t>(pt_level);
+  }
+
+  return make_hnsw_level_plan_from_levels(n_rows, std::move(levels), true, log_prefix);
+}
+
+template <typename T, typename IdxT, typename Callback>
+void build_hnsw_upper_layer_graphs(
+  raft::resources const& res,
+  raft::host_matrix_view<const T, int64_t, raft::row_major> promoted_dataset,
+  const hnsw_level_plan& plan,
+  size_t M,
+  cuvs::distance::DistanceType metric,
+  Callback&& callback)
+{
+  static_assert(std::is_same_v<IdxT, uint32_t>,
+                "HNSW upper-layer graph construction expects uint32_t neighbor ids");
+
+  const auto dim = static_cast<size_t>(promoted_dataset.extent(1));
+  for (size_t pt_level = 1; pt_level < plan.hist.size(); ++pt_level) {
+    const auto start_idx     = plan.offsets[pt_level - 1];
+    const auto removed_rows  = start_idx - plan.offsets[0];
+    const auto row_count     = plan.n_rows - start_idx;
+    const auto neighbor_size = hnsw_upper_layer_degree(row_count, M);
+
+    RAFT_LOG_INFO("Compute hierarchy neighbors level %zu", pt_level);
+    auto host_neighbors = raft::make_host_matrix<IdxT, int64_t>(
+      static_cast<int64_t>(row_count), static_cast<int64_t>(neighbor_size));
+
+    if (neighbor_size > 0) {
+      auto layer_dataset_view = raft::make_host_matrix_view<const T, int64_t, raft::row_major>(
+        promoted_dataset.data_handle() + removed_rows * dim,
+        static_cast<int64_t>(row_count),
+        promoted_dataset.extent(1));
+      all_neighbors_graph(res, layer_dataset_view, host_neighbors.view(), metric);
+    }
+
+    callback(pt_level, start_idx, host_neighbors);
+  }
+}
+
+struct layered_hnsw_layer_info {
+  size_t level       = 0;
+  size_t row_count   = 0;
+  size_t degree      = 0;
+  size_t node_offset = 0;
+  size_t link_offset = 0;
+};
+
+struct layered_hnsw_file_metadata {
+  size_t n_rows               = 0;
+  size_t dim                  = 0;
+  size_t M                    = 0;
+  size_t maxM                 = 0;
+  size_t maxM0                = 0;
+  size_t ef_construction      = 0;
+  size_t base_degree          = 0;
+  size_t levels_bytes         = 0;
+  size_t base_link_row_bytes  = 0;
+  size_t base_links_bytes     = 0;
+  size_t upper_nodes_count    = 0;
+  size_t upper_nodes_bytes    = 0;
+  size_t upper_link_row_bytes = 0;
+  size_t upper_links_bytes    = 0;
+  double mult                 = 0.0;
+  int maxlevel                = 0;
+  int enterpoint_node         = 0;
+  std::vector<layered_hnsw_layer_info> layers;
+};
+
+struct layered_hnsw_file_header {
+  char magic[32];
+  uint32_t version;
+  uint32_t reserved;
+  uint64_t metadata_size;
+  uint64_t metadata_offset;
+};
+
+constexpr const char* layered_hnsw_magic = "CUVS_HNSW_LAYERED";
+constexpr uint32_t layered_hnsw_version  = 1;
+constexpr size_t layered_hnsw_alignment  = 64;
+
+inline auto align_up(size_t value, size_t alignment) -> size_t
+{
+  return ((value + alignment - 1) / alignment) * alignment;
+}
+
+template <typename T>
+auto layered_dtype_name() -> const char*
+{
+  if constexpr (std::is_same_v<T, float>) {
+    return "float32";
+  } else if constexpr (std::is_same_v<T, half>) {
+    return "float16";
+  } else if constexpr (std::is_same_v<T, uint8_t>) {
+    return "uint8";
+  } else if constexpr (std::is_same_v<T, int8_t>) {
+    return "int8";
+  } else {
+    return "unknown";
+  }
+}
+
+inline auto metric_name(cuvs::distance::DistanceType metric) -> const char*
+{
+  switch (metric) {
+    case cuvs::distance::DistanceType::L2Expanded: return "L2Expanded";
+    case cuvs::distance::DistanceType::InnerProduct: return "InnerProduct";
+    default: return "Unknown";
+  }
+}
+
+template <typename T>
+auto make_layered_hnsw_metadata_json(const layered_hnsw_file_metadata& metadata,
+                                     cuvs::distance::DistanceType metric) -> std::string
+{
+  std::stringstream os;
+  os << "{\n";
+  os << "  \"format\": \"cuvs_hnsw_layered\",\n";
+  os << "  \"version\": 1,\n";
+  os << "  \"n_rows\": " << metadata.n_rows << ",\n";
+  os << "  \"dim\": " << metadata.dim << ",\n";
+  os << "  \"dtype\": \"" << layered_dtype_name<T>() << "\",\n";
+  os << "  \"metric\": \"" << metric_name(metric) << "\",\n";
+  os << "  \"row_order\": \"original_id\",\n";
+  os << "  \"M\": " << metadata.M << ",\n";
+  os << "  \"maxM\": " << metadata.maxM << ",\n";
+  os << "  \"maxM0\": " << metadata.maxM0 << ",\n";
+  os << "  \"ef_construction\": " << metadata.ef_construction << ",\n";
+  os << "  \"mult\": " << metadata.mult << ",\n";
+  os << "  \"maxlevel\": " << metadata.maxlevel << ",\n";
+  os << "  \"enterpoint_node\": " << metadata.enterpoint_node << ",\n";
+  os << "  \"base_degree\": " << metadata.base_degree << ",\n";
+  os << "  \"levels_bytes\": " << metadata.levels_bytes << ",\n";
+  os << "  \"base_link_row_bytes\": " << metadata.base_link_row_bytes << ",\n";
+  os << "  \"base_links_bytes\": " << metadata.base_links_bytes << ",\n";
+  os << "  \"upper_nodes_count\": " << metadata.upper_nodes_count << ",\n";
+  os << "  \"upper_nodes_bytes\": " << metadata.upper_nodes_bytes << ",\n";
+  os << "  \"upper_link_row_bytes\": " << metadata.upper_link_row_bytes << ",\n";
+  os << "  \"upper_links_bytes\": " << metadata.upper_links_bytes << ",\n";
+  os << "  \"layers\": [\n";
+  for (size_t i = 0; i < metadata.layers.size(); ++i) {
+    const auto& layer = metadata.layers[i];
+    os << "    {\"level\": " << layer.level << ", \"row_count\": " << layer.row_count
+       << ", \"degree\": " << layer.degree << ", \"node_offset\": " << layer.node_offset
+       << ", \"link_offset\": " << layer.link_offset << "}";
+    os << (i + 1 == metadata.layers.size() ? "\n" : ",\n");
+  }
+  os << "  ]\n";
+  os << "}\n";
+  return os.str();
+}
+
+inline auto json_find_key(const std::string& json, const std::string& key, size_t start = 0)
+  -> size_t
+{
+  const auto pos = json.find("\"" + key + "\"", start);
+  RAFT_EXPECTS(
+    pos != std::string::npos, "Cannot find key '%s' in layered HNSW metadata", key.c_str());
+  return pos;
+}
+
+inline auto json_parse_size(const std::string& json, const std::string& key) -> size_t
+{
+  auto pos   = json_find_key(json, key);
+  auto colon = json.find(':', pos);
+  RAFT_EXPECTS(colon != std::string::npos, "Malformed JSON near key '%s'", key.c_str());
+  auto begin = json.find_first_of("0123456789", colon + 1);
+  auto end   = json.find_first_not_of("0123456789", begin);
+  RAFT_EXPECTS(
+    begin != std::string::npos, "Malformed integer JSON value for key '%s'", key.c_str());
+  return static_cast<size_t>(std::stoull(json.substr(begin, end - begin)));
+}
+
+inline auto json_parse_double(const std::string& json, const std::string& key) -> double
+{
+  auto pos   = json_find_key(json, key);
+  auto colon = json.find(':', pos);
+  RAFT_EXPECTS(colon != std::string::npos, "Malformed JSON near key '%s'", key.c_str());
+  auto begin = json.find_first_of("0123456789-.", colon + 1);
+  auto end   = json.find_first_not_of("0123456789-.eE+", begin);
+  RAFT_EXPECTS(begin != std::string::npos, "Malformed double JSON value for key '%s'", key.c_str());
+  return std::stod(json.substr(begin, end - begin));
+}
+
+inline auto json_parse_layer_field(const std::string& object, const std::string& key) -> size_t
+{
+  return json_parse_size(object, key);
+}
+
+inline auto parse_layered_hnsw_metadata(const std::string& json) -> layered_hnsw_file_metadata
+{
+  layered_hnsw_file_metadata metadata;
+  metadata.n_rows               = json_parse_size(json, "n_rows");
+  metadata.dim                  = json_parse_size(json, "dim");
+  metadata.M                    = json_parse_size(json, "M");
+  metadata.maxM                 = json_parse_size(json, "maxM");
+  metadata.maxM0                = json_parse_size(json, "maxM0");
+  metadata.ef_construction      = json_parse_size(json, "ef_construction");
+  metadata.mult                 = json_parse_double(json, "mult");
+  metadata.maxlevel             = static_cast<int>(json_parse_size(json, "maxlevel"));
+  metadata.enterpoint_node      = static_cast<int>(json_parse_size(json, "enterpoint_node"));
+  metadata.base_degree          = json_parse_size(json, "base_degree");
+  metadata.levels_bytes         = json_parse_size(json, "levels_bytes");
+  metadata.base_link_row_bytes  = json_parse_size(json, "base_link_row_bytes");
+  metadata.base_links_bytes     = json_parse_size(json, "base_links_bytes");
+  metadata.upper_nodes_count    = json_parse_size(json, "upper_nodes_count");
+  metadata.upper_nodes_bytes    = json_parse_size(json, "upper_nodes_bytes");
+  metadata.upper_link_row_bytes = json_parse_size(json, "upper_link_row_bytes");
+  metadata.upper_links_bytes    = json_parse_size(json, "upper_links_bytes");
+
+  auto layers_pos = json_find_key(json, "layers");
+  auto array_open = json.find('[', layers_pos);
+  auto array_end  = json.find(']', array_open);
+  RAFT_EXPECTS(array_open != std::string::npos && array_end != std::string::npos,
+               "Malformed layers array in layered HNSW metadata");
+
+  size_t pos = array_open;
+  while (true) {
+    auto object_open = json.find('{', pos);
+    if (object_open == std::string::npos || object_open > array_end) { break; }
+    auto object_end = json.find('}', object_open);
+    RAFT_EXPECTS(object_end != std::string::npos && object_end < array_end,
+                 "Malformed layer object in layered HNSW metadata");
+    const auto object = json.substr(object_open, object_end - object_open + 1);
+    metadata.layers.push_back({json_parse_layer_field(object, "level"),
+                               json_parse_layer_field(object, "row_count"),
+                               json_parse_layer_field(object, "degree"),
+                               json_parse_layer_field(object, "node_offset"),
+                               json_parse_layer_field(object, "link_offset")});
+    pos = object_end + 1;
+  }
+  return metadata;
+}
+
+struct npy_file {
+  cuvs::util::file_descriptor fd;
+  size_t header_size = 0;
+  std::vector<size_t> shape;
+};
+
+inline auto open_npy_file(const std::string& path) -> npy_file
+{
+  std::ifstream stream(path, std::ios::binary);
+  RAFT_EXPECTS(stream.good(), "Failed to open numpy file: %s", path.c_str());
+  auto header      = raft::detail::numpy_serializer::read_header(stream);
+  auto header_size = static_cast<size_t>(stream.tellg());
+  auto fd          = cuvs::util::file_descriptor(path, O_RDONLY);
+  return {std::move(fd), header_size, header.shape};
+}
+
+inline auto ends_with(const std::string& value, const std::string& suffix) -> bool
+{
+  return value.size() >= suffix.size() &&
+         value.compare(value.size() - suffix.size(), suffix.size(), suffix) == 0;
+}
+
+inline void copy_file_overwrite(const std::filesystem::path& src, const std::filesystem::path& dst)
+{
+  std::error_code ec;
+  if (src == dst ||
+      (std::filesystem::exists(dst, ec) && std::filesystem::equivalent(src, dst, ec))) {
+    return;
+  }
+  if (!dst.parent_path().empty()) { std::filesystem::create_directories(dst.parent_path()); }
+  std::filesystem::copy_file(src, dst, std::filesystem::copy_options::overwrite_existing);
+}
+
+template <typename T>
+inline auto open_layered_dataset_file(const std::string& path) -> npy_file
+{
+  if (std::filesystem::path(path).extension() == ".npy") { return open_npy_file(path); }
+
+  std::ifstream stream(path, std::ios::binary);
+  RAFT_EXPECTS(stream.good(), "Failed to open dataset file: %s", path.c_str());
+  std::array<uint32_t, 2> header{};
+  stream.read(reinterpret_cast<char*>(header.data()), sizeof(uint32_t) * header.size());
+  RAFT_EXPECTS(stream.gcount() == static_cast<std::streamsize>(sizeof(uint32_t) * header.size()),
+               "Failed to read ANN benchmark binary dataset header: %s",
+               path.c_str());
+
+  const auto ext = std::filesystem::path(path).extension().string();
+  if constexpr (std::is_same_v<T, float>) {
+    RAFT_EXPECTS(ext == ".fbin" && !ends_with(path, ".fp16.fbin"),
+                 "Expected a .fbin dataset for float layered HNSW load: %s",
+                 path.c_str());
+  } else if constexpr (std::is_same_v<T, half>) {
+    RAFT_EXPECTS(ext == ".f16bin" || ends_with(path, ".fp16.fbin"),
+                 "Expected a .f16bin or .fp16.fbin dataset for half layered HNSW load: %s",
+                 path.c_str());
+  } else if constexpr (std::is_same_v<T, uint8_t>) {
+    RAFT_EXPECTS(
+      ext == ".u8bin", "Expected a .u8bin dataset for uint8 layered HNSW load: %s", path.c_str());
+  } else if constexpr (std::is_same_v<T, int8_t>) {
+    RAFT_EXPECTS(
+      ext == ".i8bin", "Expected a .i8bin dataset for int8 layered HNSW load: %s", path.c_str());
+  }
+
+  auto fd = cuvs::util::file_descriptor(path, O_RDONLY);
+  return {std::move(fd), sizeof(uint32_t) * header.size(), {header[0], header[1]}};
+}
+
+template <typename T, typename IdxT>
+void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index<T, IdxT>& index_,
+                                        const cuvs::util::file_descriptor& output_fd,
+                                        size_t output_offset,
+                                        size_t base_link_row_bytes,
+                                        size_t maxM0)
+{
+  const auto total_start_time = std::chrono::steady_clock::now();
+  const auto& graph_fd_opt    = index_.graph_fd();
+  const auto& mapping_fd_opt  = index_.mapping_fd();
+  RAFT_EXPECTS(graph_fd_opt.has_value() && graph_fd_opt->is_valid(),
+               "Graph file descriptor is not available");
+  RAFT_EXPECTS(mapping_fd_opt.has_value() && mapping_fd_opt->is_valid(),
+               "Mapping file descriptor is not available");
+
+  const auto graph_path   = graph_fd_opt->get_path();
+  const auto mapping_path = mapping_fd_opt->get_path();
+  RAFT_EXPECTS(!graph_path.empty(), "Unable to get path from graph file descriptor");
+  RAFT_EXPECTS(!mapping_path.empty(), "Unable to get path from mapping file descriptor");
+
+  auto graph_npy   = open_npy_file(graph_path);
+  auto mapping_npy = open_npy_file(mapping_path);
+
+  RAFT_EXPECTS(graph_npy.shape.size() == 2, "Graph file should be 2D");
+  RAFT_EXPECTS(mapping_npy.shape.size() == 1, "Mapping file should be 1D");
+  const auto n_rows = graph_npy.shape[0];
+  const auto degree = graph_npy.shape[1];
+  RAFT_EXPECTS(mapping_npy.shape[0] == n_rows,
+               "Mapping elements (%zu) != graph rows (%zu)",
+               mapping_npy.shape[0],
+               n_rows);
+  RAFT_EXPECTS(n_rows == static_cast<size_t>(index_.size()),
+               "Graph rows (%zu) != index size (%zu)",
+               n_rows,
+               static_cast<size_t>(index_.size()));
+  RAFT_EXPECTS(degree == static_cast<size_t>(index_.graph_degree()),
+               "Graph degree (%zu) != index graph degree (%zu)",
+               degree,
+               static_cast<size_t>(index_.graph_degree()));
+  RAFT_EXPECTS(degree > 0, "Graph degree must be nonzero");
+  RAFT_EXPECTS(
+    degree <= maxM0, "Base graph degree (%zu) must not exceed HNSW maxM0 (%zu)", degree, maxM0);
+  RAFT_EXPECTS(base_link_row_bytes >= sizeof(hnswlib::linklistsizeint) + maxM0 * sizeof(IdxT),
+               "Base link row size is too small");
+
+  RAFT_LOG_INFO("HNSW remap: loading ACE reordered-to-original mapping (%zu rows)", n_rows);
+  const auto mapping_start_time = std::chrono::steady_clock::now();
+  const auto mapping_bytes      = n_rows * sizeof(IdxT);
+  std::vector<IdxT> reordered_to_original(n_rows);
+  std::vector<IdxT> original_to_reordered(n_rows);
+  cuvs::util::read_large_file(
+    mapping_npy.fd, reordered_to_original.data(), mapping_bytes, mapping_npy.header_size);
+  for (size_t reordered_id = 0; reordered_id < n_rows; ++reordered_id) {
+    const auto original_id = static_cast<size_t>(reordered_to_original[reordered_id]);
+    RAFT_EXPECTS(original_id < n_rows,
+                 "Invalid original id %zu in ACE dataset mapping at row %zu",
+                 original_id,
+                 reordered_id);
+    original_to_reordered[original_id] = static_cast<IdxT>(reordered_id);
+  }
+  const auto mapping_elapsed_ms = elapsed_ms_since(mapping_start_time);
+  RAFT_LOG_INFO("HNSW remap: mapping loaded in %ld ms (%.2f GiB, %.2f GiB/s)",
+                mapping_elapsed_ms,
+                to_gib(mapping_bytes),
+                throughput_gib_per_s(mapping_bytes, mapping_elapsed_ms));
+
+  const size_t graph_row_bytes    = degree * sizeof(IdxT);
+  const size_t target_batch_bytes = 64 * 1024 * 1024;
+  const size_t batch_size         = std::max<size_t>(1, target_batch_bytes / base_link_row_bytes);
+  const size_t max_gap_rows       = std::max<size_t>(1, (128 * 1024) / graph_row_bytes);
+  const size_t max_span_rows      = std::max<size_t>(1, (8 * 1024 * 1024) / graph_row_bytes);
+
+  RAFT_LOG_INFO(
+    "HNSW remap: n_rows=%zu degree=%zu maxM0=%zu batch_size=%zu max_gap=%zu rows max_span=%zu rows",
+    n_rows,
+    degree,
+    maxM0,
+    batch_size,
+    max_gap_rows,
+    max_span_rows);
+
+  auto span_buffer = raft::make_host_matrix<IdxT, int64_t>(static_cast<int64_t>(max_span_rows),
+                                                           static_cast<int64_t>(degree));
+  std::vector<char> output_buffer(batch_size * base_link_row_bytes);
+  std::vector<IdxT> source_rows(batch_size);
+  std::vector<size_t> order(batch_size);
+
+  const auto graph_start_time = std::chrono::steady_clock::now();
+  size_t spans_read           = 0;
+  size_t bytes_read           = 0;
+  size_t bytes_written        = 0;
+
+  for (size_t batch_start = 0; batch_start < n_rows; batch_start += batch_size) {
+    const auto current_batch_size = std::min(batch_size, n_rows - batch_start);
+    std::fill(
+      output_buffer.begin(), output_buffer.begin() + current_batch_size * base_link_row_bytes, 0);
+    for (size_t i = 0; i < current_batch_size; ++i) {
+      source_rows[i] = original_to_reordered[batch_start + i];
+      order[i]       = i;
+    }
+    std::sort(order.begin(), order.begin() + current_batch_size, [&](size_t lhs, size_t rhs) {
+      return source_rows[lhs] < source_rows[rhs];
+    });
+
+    size_t sorted_pos = 0;
+    while (sorted_pos < current_batch_size) {
+      const auto span_start = static_cast<size_t>(source_rows[order[sorted_pos]]);
+      auto span_end         = span_start;
+      auto span_next        = sorted_pos + 1;
+      while (span_next < current_batch_size) {
+        const auto next_row = static_cast<size_t>(source_rows[order[span_next]]);
+        if (next_row - span_start + 1 > max_span_rows) { break; }
+        if (next_row > span_end + max_gap_rows + 1) { break; }
+        span_end = next_row;
+        ++span_next;
+      }
+
+      const auto span_rows  = span_end - span_start + 1;
+      const auto span_bytes = span_rows * graph_row_bytes;
+      cuvs::util::read_large_file(graph_npy.fd,
+                                  span_buffer.data_handle(),
+                                  span_bytes,
+                                  graph_npy.header_size + span_start * graph_row_bytes);
+      ++spans_read;
+      bytes_read += span_bytes;
+
+      for (size_t i = sorted_pos; i < span_next; ++i) {
+        const auto dst_row    = order[i];
+        const auto source_row = static_cast<size_t>(source_rows[dst_row]);
+        auto* src             = span_buffer.data_handle() + (source_row - span_start) * degree;
+        auto* dst_row_ptr     = output_buffer.data() + dst_row * base_link_row_bytes;
+        hnswlib::linklistsizeint list_count = static_cast<hnswlib::linklistsizeint>(degree);
+        std::memcpy(dst_row_ptr, &list_count, sizeof(list_count));
+        auto* dst = reinterpret_cast<IdxT*>(dst_row_ptr + sizeof(hnswlib::linklistsizeint));
+        for (size_t j = 0; j < degree; ++j) {
+          const auto neighbor = static_cast<size_t>(src[j]);
+          RAFT_EXPECTS(neighbor < n_rows,
+                       "Invalid reordered neighbor id %zu in ACE graph row %zu",
+                       neighbor,
+                       source_row);
+          dst[j] = reordered_to_original[neighbor];
+        }
+      }
+      sorted_pos = span_next;
+    }
+
+    const auto batch_bytes = current_batch_size * base_link_row_bytes;
+    cuvs::util::write_large_file(output_fd,
+                                 output_buffer.data(),
+                                 batch_bytes,
+                                 output_offset + batch_start * base_link_row_bytes);
+    bytes_written += batch_bytes;
+  }
+
+  const auto graph_elapsed_ms = elapsed_ms_since(graph_start_time);
+  const auto total_elapsed_ms = elapsed_ms_since(total_start_time);
+  const auto read_gib         = to_gib(bytes_read);
+  const auto written_gib      = to_gib(bytes_written);
+  const auto ideal_read_bytes = n_rows * graph_row_bytes;
+  const auto amplification =
+    ideal_read_bytes == 0 ? 0.0 : static_cast<double>(bytes_read) / ideal_read_bytes;
+  RAFT_LOG_INFO(
+    "HNSW remap: base links written in %ld ms: read %.2f GiB at %.2f GiB/s across %zu spans "
+    "(%.2fx amplification), wrote %.2f GiB at %.2f GiB/s",
+    graph_elapsed_ms,
+    read_gib,
+    throughput_gib_per_s(bytes_read, graph_elapsed_ms),
+    spans_read,
+    amplification,
+    written_gib,
+    throughput_gib_per_s(bytes_written, graph_elapsed_ms));
+  RAFT_LOG_INFO("HNSW remap: completed in %ld ms total", total_elapsed_ms);
+}
+
+template <typename T, typename IdxT>
+auto serialize_to_layered_hnsw_from_disk(
+  raft::resources const& res,
+  const cuvs::neighbors::hnsw::index_params& params,
+  const cuvs::neighbors::cagra::index<T, IdxT>& index_,
+  raft::host_matrix_view<const T, int64_t, raft::row_major> dataset) -> std::string
+{
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("hnsw::serialize_layered");
+  const auto total_start_time = std::chrono::steady_clock::now();
+
+  RAFT_EXPECTS(index_.graph_fd().has_value() && index_.mapping_fd().has_value(),
+               "Layered HNSW serialization requires a disk-backed ACE graph and mapping.");
+  RAFT_EXPECTS(static_cast<size_t>(dataset.extent(0)) == static_cast<size_t>(index_.size()),
+               "Dataset rows (%zu) must match index size (%zu)",
+               static_cast<size_t>(dataset.extent(0)),
+               static_cast<size_t>(index_.size()));
+  RAFT_EXPECTS(static_cast<size_t>(dataset.extent(1)) == static_cast<size_t>(index_.dim()),
+               "Dataset cols (%zu) must match index dimensions (%zu)",
+               static_cast<size_t>(dataset.extent(1)),
+               static_cast<size_t>(index_.dim()));
+
+  const auto graph_path = index_.graph_fd()->get_path();
+  RAFT_EXPECTS(!graph_path.empty(), "Unable to get path from graph file descriptor");
+  const auto ace_dir       = std::filesystem::path(graph_path).parent_path();
+  const auto artifact_file = ace_dir / "hnsw_index.cuvs";
+  std::filesystem::create_directories(ace_dir);
+
+  auto n_rows           = static_cast<size_t>(index_.size());
+  auto dim              = static_cast<size_t>(index_.dim());
+  auto graph_degree_int = static_cast<int>(index_.graph_degree());
+
+  auto hnsw_index =
+    std::make_unique<index_impl<T>>(index_.dim(), index_.metric(), params.hierarchy);
+  auto appr_algo = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
+    hnsw_index->get_space(), 1, (graph_degree_int + 1) / 2, params.ef_construction);
+
+  RAFT_LOG_INFO("Layered HNSW: generating hierarchy levels");
+  const auto hierarchy_start_time = std::chrono::steady_clock::now();
+  auto hierarchy = make_random_hnsw_level_plan(n_rows, *appr_algo, "Layered HNSW: Level ");
+  const auto hierarchy_elapsed_ms = elapsed_ms_since(hierarchy_start_time);
+  RAFT_LOG_INFO("Layered HNSW: hierarchy levels generated in %ld ms (max_level=%d, promoted=%zu)",
+                hierarchy_elapsed_ms,
+                hierarchy.max_level(),
+                hierarchy.promoted_count());
+
+  layered_hnsw_file_metadata metadata;
+  metadata.n_rows               = n_rows;
+  metadata.dim                  = dim;
+  metadata.M                    = appr_algo->M_;
+  metadata.maxM                 = appr_algo->maxM_;
+  metadata.maxM0                = appr_algo->maxM0_;
+  metadata.ef_construction      = appr_algo->ef_construction_;
+  metadata.mult                 = appr_algo->mult_;
+  metadata.maxlevel             = hierarchy.max_level();
+  metadata.enterpoint_node      = static_cast<int>(hierarchy.order.back());
+  metadata.base_degree          = static_cast<size_t>(graph_degree_int);
+  metadata.levels_bytes         = n_rows * sizeof(uint8_t);
+  metadata.base_link_row_bytes  = appr_algo->size_links_level0_;
+  metadata.base_links_bytes     = n_rows * metadata.base_link_row_bytes;
+  metadata.upper_link_row_bytes = appr_algo->size_links_per_element_;
+  size_t next_upper_node_offset = 0;
+  size_t next_upper_link_offset = 0;
+  for (size_t pt_level = 1; pt_level < hierarchy.hist.size(); pt_level++) {
+    auto start_idx    = hierarchy.offsets[pt_level - 1];
+    auto row_count    = n_rows - start_idx;
+    auto layer_degree = hnsw_upper_layer_degree(row_count, appr_algo->M_);
+    metadata.layers.push_back(
+      {pt_level, row_count, layer_degree, next_upper_node_offset, next_upper_link_offset});
+    next_upper_node_offset += row_count;
+    next_upper_link_offset += row_count;
+  }
+  metadata.upper_nodes_count = next_upper_node_offset;
+  metadata.upper_nodes_bytes = metadata.upper_nodes_count * sizeof(IdxT);
+  metadata.upper_links_bytes = next_upper_link_offset * metadata.upper_link_row_bytes;
+
+  const auto metadata_json = make_layered_hnsw_metadata_json<T>(metadata, index_.metric());
+  layered_hnsw_file_header header{};
+  std::strncpy(header.magic, layered_hnsw_magic, sizeof(header.magic) - 1);
+  header.version         = layered_hnsw_version;
+  header.metadata_size   = metadata_json.size();
+  header.metadata_offset = sizeof(layered_hnsw_file_header);
+
+  const auto payload_offset =
+    align_up(header.metadata_offset + header.metadata_size, layered_hnsw_alignment);
+  const auto levels_offset      = payload_offset;
+  const auto base_links_offset  = levels_offset + metadata.levels_bytes;
+  const auto upper_nodes_offset = base_links_offset + metadata.base_links_bytes;
+  const auto upper_links_offset = upper_nodes_offset + metadata.upper_nodes_bytes;
+  const auto final_file_size    = upper_links_offset + metadata.upper_links_bytes;
+
+  cuvs::util::file_descriptor artifact_fd(artifact_file.string(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+  const auto fallocate_result = posix_fallocate(artifact_fd.get(), 0, final_file_size);
+  RAFT_EXPECTS(fallocate_result == 0,
+               "Failed to pre-allocate layered HNSW artifact %s: %s",
+               artifact_file.string().c_str(),
+               std::strerror(fallocate_result));
+  cuvs::util::write_large_file(artifact_fd, &header, sizeof(header), 0);
+  cuvs::util::write_large_file(
+    artifact_fd, metadata_json.data(), metadata_json.size(), header.metadata_offset);
+
+  const auto levels_start_time = std::chrono::steady_clock::now();
+  cuvs::util::write_large_file(
+    artifact_fd, hierarchy.levels.data(), metadata.levels_bytes, levels_offset);
+  const auto levels_elapsed_ms = elapsed_ms_since(levels_start_time);
+  RAFT_LOG_INFO("Layered HNSW: wrote levels section in %ld ms (%.2f GiB, %.2f GiB/s)",
+                levels_elapsed_ms,
+                to_gib(metadata.levels_bytes),
+                throughput_gib_per_s(metadata.levels_bytes, levels_elapsed_ms));
+
+  RAFT_LOG_INFO("Layered HNSW: writing hnswlib-ready base links");
+  const auto layer0_start_time = std::chrono::steady_clock::now();
+  write_layered_base_links_from_disk<T, IdxT>(
+    index_, artifact_fd, base_links_offset, metadata.base_link_row_bytes, metadata.maxM0);
+  const auto layer0_elapsed_ms = elapsed_ms_since(layer0_start_time);
+  RAFT_LOG_INFO("Layered HNSW: base links written in %ld ms (%.2f GiB, %.2f GiB/s effective)",
+                layer0_elapsed_ms,
+                to_gib(metadata.base_links_bytes),
+                throughput_gib_per_s(metadata.base_links_bytes, layer0_elapsed_ms));
+
+  size_t upper_graph_bytes_written = 0;
+  if (hierarchy.hist.size() > 1) {
+    RAFT_LOG_INFO("Layered HNSW: gathering promoted vectors");
+    const auto gather_start_time = std::chrono::steady_clock::now();
+    auto host_query_set =
+      raft::make_host_matrix<T, int64_t>(static_cast<int64_t>(hierarchy.promoted_count()), dim);
+#pragma omp parallel for
+    for (int64_t i = 0; i < static_cast<int64_t>(n_rows); i++) {
+      if (hierarchy.levels[i] > 0) {
+        const auto query_row = hierarchy.order_bw[i] - hierarchy.hist[0];
+        auto* dst            = host_query_set.data_handle() + query_row * dim;
+        std::copy(&dataset(i, 0), &dataset(i, 0) + dim, dst);
+      }
+    }
+    const auto gather_elapsed_ms = elapsed_ms_since(gather_start_time);
+    const auto gathered_bytes    = hierarchy.promoted_count() * dim * sizeof(T);
+    RAFT_LOG_INFO("Layered HNSW: gathered promoted vectors in %ld ms (%.2f GiB copied, %.2f GiB/s)",
+                  gather_elapsed_ms,
+                  to_gib(gathered_bytes),
+                  throughput_gib_per_s(gathered_bytes, gather_elapsed_ms));
+
+    auto promoted_dataset = raft::make_host_matrix_view<const T, int64_t, raft::row_major>(
+      host_query_set.data_handle(), host_query_set.extent(0), host_query_set.extent(1));
+    const auto upper_layers_start_time = std::chrono::steady_clock::now();
+    build_hnsw_upper_layer_graphs<T, IdxT>(
+      res,
+      promoted_dataset,
+      hierarchy,
+      appr_algo->M_,
+      index_.metric(),
+      [&](size_t pt_level, size_t start_idx, auto& host_neighbors) {
+        const auto& layer = metadata.layers[pt_level - 1];
+        RAFT_LOG_INFO("Layered HNSW: writing upper layer %zu (%zu rows, degree %zu)",
+                      layer.level,
+                      layer.row_count,
+                      layer.degree);
+        const auto layer_write_start_time = std::chrono::steady_clock::now();
+        const size_t target_batch_bytes   = 64 * 1024 * 1024;
+        const size_t row_bytes            = sizeof(IdxT) + metadata.upper_link_row_bytes;
+        const size_t batch_size           = std::max<size_t>(1, target_batch_bytes / row_bytes);
+        std::vector<IdxT> node_buffer(batch_size);
+        std::vector<char> link_buffer(batch_size * metadata.upper_link_row_bytes);
+        for (size_t batch_start = 0; batch_start < layer.row_count; batch_start += batch_size) {
+          const auto current_batch_size = std::min(batch_size, layer.row_count - batch_start);
+          std::fill(link_buffer.begin(),
+                    link_buffer.begin() + current_batch_size * metadata.upper_link_row_bytes,
+                    0);
+#pragma omp parallel for
+          for (int64_t batch_idx = 0; batch_idx < static_cast<int64_t>(current_batch_size);
+               ++batch_idx) {
+            const auto row         = batch_start + static_cast<size_t>(batch_idx);
+            node_buffer[batch_idx] = static_cast<IdxT>(hierarchy.order[start_idx + row]);
+            auto* link_row         = link_buffer.data() + batch_idx * metadata.upper_link_row_bytes;
+            hnswlib::linklistsizeint list_count =
+              static_cast<hnswlib::linklistsizeint>(layer.degree);
+            std::memcpy(link_row, &list_count, sizeof(list_count));
+            auto* dst = reinterpret_cast<IdxT*>(link_row + sizeof(hnswlib::linklistsizeint));
+            if (layer.degree > 0) {
+              auto* src = host_neighbors.data_handle() + row * layer.degree;
+              for (size_t j = 0; j < layer.degree; ++j) {
+                dst[j] = static_cast<IdxT>(hierarchy.order[src[j] + start_idx]);
+              }
+            }
+          }
+          cuvs::util::write_large_file(
+            artifact_fd,
+            node_buffer.data(),
+            current_batch_size * sizeof(IdxT),
+            upper_nodes_offset + (layer.node_offset + batch_start) * sizeof(IdxT));
+          cuvs::util::write_large_file(
+            artifact_fd,
+            link_buffer.data(),
+            current_batch_size * metadata.upper_link_row_bytes,
+            upper_links_offset + (layer.link_offset + batch_start) * metadata.upper_link_row_bytes);
+        }
+        const auto layer_bytes = layer.row_count * (sizeof(IdxT) + metadata.upper_link_row_bytes);
+        upper_graph_bytes_written += layer_bytes;
+        const auto layer_write_elapsed_ms = elapsed_ms_since(layer_write_start_time);
+        RAFT_LOG_INFO("Layered HNSW: upper layer %zu written in %ld ms (%.2f GiB, %.2f GiB/s)",
+                      layer.level,
+                      layer_write_elapsed_ms,
+                      to_gib(layer_bytes),
+                      throughput_gib_per_s(layer_bytes, layer_write_elapsed_ms));
+      });
+    const auto upper_layers_elapsed_ms = elapsed_ms_since(upper_layers_start_time);
+    RAFT_LOG_INFO(
+      "Layered HNSW: upper layers generated and written in %ld ms (%.2f GiB written, %.2f "
+      "GiB/s effective)",
+      upper_layers_elapsed_ms,
+      to_gib(upper_graph_bytes_written),
+      throughput_gib_per_s(upper_graph_bytes_written, upper_layers_elapsed_ms));
+  }
+
+  const auto total_elapsed_ms = elapsed_ms_since(total_start_time);
+  RAFT_LOG_INFO(
+    "Layered HNSW index written to: %s in %ld ms (artifact %.2f GiB, %.2f GiB/s effective)",
+    artifact_file.string().c_str(),
+    total_elapsed_ms,
+    to_gib(final_file_size),
+    throughput_gib_per_s(final_file_size, total_elapsed_ms));
+  return artifact_file.string();
+}
+
 template <typename T, typename IdxT>
 void serialize_to_hnswlib_from_disk(raft::resources const& res,
                                     std::ostream& os_raw,
@@ -501,44 +1307,12 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res,
     hnsw_index->get_space(), 1, (graph_degree_int + 1) / 2, params.ef_construction);
 
   bool create_hierarchy = params.hierarchy != HnswHierarchy::NONE;
-
-  // create hierarchy order
-  // sort the points by levels
-  // roll dice & build histogram
-  std::vector<size_t> hist;
-  std::vector<size_t> order(n_rows);
-  std::vector<size_t> order_bw(n_rows);
-  std::vector<int> levels(n_rows);
-  std::vector<size_t> offsets;
-
-  if (create_hierarchy) {
-    RAFT_LOG_INFO("Sort points by levels");
-    for (int64_t i = 0; i < n_rows; i++) {
-      auto pt_level = appr_algo->getRandomLevel(appr_algo->mult_);
-      while (pt_level >= static_cast<int32_t>(hist.size()))
-        hist.push_back(0);
-      hist[pt_level]++;
-      levels[i] = pt_level;
-    }
-
-    // accumulate
-    offsets.resize(hist.size() + 1, 0);
-    for (size_t i = 0; i < hist.size() - 1; i++) {
-      offsets[i + 1] = offsets[i] + hist[i];
-      RAFT_LOG_INFO("Level %zu : %zu", i + 1, size_t(n_rows) - offsets[i + 1]);
-    }
-
-    // fw/bw indices
-    for (int64_t i = 0; i < n_rows; i++) {
-      auto pt_level              = levels[i];
-      order_bw[i]                = offsets[pt_level];
-      order[offsets[pt_level]++] = i;
-    }
-  }
+  hnsw_level_plan hierarchy;
+  if (create_hierarchy) { hierarchy = make_random_hnsw_level_plan(n_rows, *appr_algo, "Level "); }
 
   // set last point of the highest level as the entry point
-  appr_algo->enterpoint_node_ = create_hierarchy ? order.back() : n_rows / 2;
-  appr_algo->maxlevel_        = create_hierarchy ? hist.size() - 1 : 1;
+  appr_algo->enterpoint_node_ = create_hierarchy ? hierarchy.order.back() : n_rows / 2;
+  appr_algo->maxlevel_        = create_hierarchy ? hierarchy.max_level() : 1;
 
   // write header information
   RAFT_LOG_DEBUG("Writing HNSW header: offsetLevel0=%zu, n_rows=%zu, size_data_per_element=%zu",
@@ -582,7 +1356,7 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res,
 
   // host queries
   auto host_query_set =
-    raft::make_host_matrix<T, int64_t>(create_hierarchy ? n_rows - hist[0] : 0, dim);
+    raft::make_host_matrix<T, int64_t>(create_hierarchy ? hierarchy.promoted_count() : 0, dim);
 
   int64_t d_report_offset    = n_rows / 10;  // Report progress in 10% steps.
   int64_t next_report_offset = d_report_offset;
@@ -690,11 +1464,11 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res,
       const T* data_row = &dataset_buffer(batch_idx, 0);
       os.write(reinterpret_cast<const char*>(data_row), sizeof(T) * dim);
 
-      if (create_hierarchy && levels[i] > 0) {
+      if (create_hierarchy && hierarchy.levels[i] > 0) {
         // position in query: order_bw[i]-hist[0]
-        std::copy(data_row,
-                  data_row + dim,
-                  reinterpret_cast<char*>(&host_query_set(order_bw[i] - hist[0], 0)));
+        auto* dst = host_query_set.data_handle() +
+                    (hierarchy.order_bw[i] - hierarchy.hist[0]) * static_cast<size_t>(dim);
+        std::copy(data_row, data_row + dim, dst);
       }
 
       // assign original label
@@ -732,22 +1506,15 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res,
   // trigger knn builds for all levels
   std::vector<raft::host_matrix<IdxT, int64_t>> host_neighbors;
   if (create_hierarchy) {
-    for (size_t pt_level = 1; pt_level < hist.size(); pt_level++) {
-      auto num_pts       = n_rows - offsets[pt_level - 1];
-      auto neighbor_size = num_pts > appr_algo->M_ ? appr_algo->M_ : num_pts - 1;
-      host_neighbors.emplace_back(raft::make_host_matrix<IdxT, int64_t>(num_pts, neighbor_size));
-    }
-    for (size_t pt_level = 1; pt_level < hist.size(); pt_level++) {
-      RAFT_LOG_INFO("Compute hierarchy neighbors level %zu", pt_level);
-      auto removed_rows = offsets[pt_level - 1] - offsets[0];
-      raft::host_matrix_view<T, int64_t, raft::row_major> sub_query_view(
-        host_query_set.data_handle() + removed_rows * dim,
-        host_query_set.extent(0) - removed_rows,
-        dim);
-      auto neighbor_view = host_neighbors[pt_level - 1].view();
-      all_neighbors_graph(
-        res, raft::make_const_mdspan(sub_query_view), neighbor_view, index_.metric());
-    }
+    host_neighbors.reserve(hierarchy.hist.size() - 1);
+    build_hnsw_upper_layer_graphs<T, IdxT>(
+      res,
+      raft::make_host_matrix_view<const T, int64_t, raft::row_major>(
+        host_query_set.data_handle(), host_query_set.extent(0), host_query_set.extent(1)),
+      hierarchy,
+      appr_algo->M_,
+      index_.metric(),
+      [&](size_t, size_t, auto& neighbors) { host_neighbors.emplace_back(std::move(neighbors)); });
   }
 
   if (create_hierarchy) {
@@ -758,7 +1525,7 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res,
   start_clock   = std::chrono::system_clock::now();
 
   for (int64_t i = 0; i < n_rows; i++) {
-    size_t cur_level = create_hierarchy ? levels[i] : 0;
+    size_t cur_level = create_hierarchy ? hierarchy.levels[i] : 0;
     unsigned int linkListSize =
       create_hierarchy && cur_level > 0 ? appr_algo->size_links_per_element_ * cur_level : 0;
     os.write(reinterpret_cast<char*>(&linkListSize), sizeof(int));
@@ -766,14 +1533,16 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res,
     if (linkListSize) {
       for (size_t pt_level = 1; pt_level <= cur_level; pt_level++) {
         auto neighbor_view = host_neighbors[pt_level - 1].view();
-        auto my_row        = order_bw[i] - offsets[pt_level - 1];
+        auto my_row        = hierarchy.order_bw[i] - hierarchy.offsets[pt_level - 1];
 
-        IdxT* neighbors     = &neighbor_view(my_row, 0);
         unsigned int extent = neighbor_view.extent(1);
         os.write(reinterpret_cast<char*>(&extent), sizeof(int));
-        for (unsigned int j = 0; j < extent; j++) {
-          const IdxT converted = order[neighbors[j] + offsets[pt_level - 1]];
-          os.write(reinterpret_cast<const char*>(&converted), sizeof(IdxT));
+        if (extent > 0) {
+          IdxT* neighbors = &neighbor_view(my_row, 0);
+          for (unsigned int j = 0; j < extent; j++) {
+            const IdxT converted = hierarchy.order[neighbors[j] + hierarchy.offsets[pt_level - 1]];
+            os.write(reinterpret_cast<const char*>(&converted), sizeof(IdxT));
+          }
         }
         auto remainder = appr_algo->M_ - neighbor_view.extent(1);
         for (size_t j = 0; j < remainder; j++) {
@@ -1088,6 +1857,18 @@ std::unique_ptr<index<T>> from_cagra(
       std::filesystem::exists(index_directory) && std::filesystem::is_directory(index_directory),
       "Directory '%s' does not exist",
       index_directory.c_str());
+    if (params.hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK) {
+      RAFT_EXPECTS(dataset.has_value(),
+                   "Layered HNSW serialization requires the original-order dataset.");
+      auto artifact_path =
+        serialize_to_layered_hnsw_from_disk(res, params, cagra_index, dataset.value());
+
+      auto hnsw_index =
+        std::make_unique<index_impl<T>>(cagra_index.dim(), cagra_index.metric(), params.hierarchy);
+      hnsw_index->set_file_descriptor(cuvs::util::file_descriptor(artifact_path, O_RDONLY));
+      return hnsw_index;
+    }
+
     std::string index_filename =
       (std::filesystem::path(index_directory) / "hnsw_index.bin").string();
 
@@ -1118,6 +1899,8 @@ std::unique_ptr<index<T>> from_cagra(
     return from_cagra<T, HnswHierarchy::CPU>(res, params, cagra_index, dataset);
   } else if (params.hierarchy == HnswHierarchy::GPU) {
     return from_cagra<T, HnswHierarchy::GPU>(res, params, cagra_index, dataset);
+  } else if (params.hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK) {
+    RAFT_FAIL("GPU_LAYERED_ON_DISK requires disk-backed ACE build artifacts.");
   } else {
     RAFT_FAIL("Unsupported hierarchy type");
   }
@@ -1228,6 +2011,13 @@ void serialize(raft::resources const& res, const std::string& filename, const in
                  "Disk-based index file does not exist: %s",
                  source_path.c_str());
 
+    if (idx_impl->hierarchy() == HnswHierarchy::GPU_LAYERED_ON_DISK) {
+      copy_file_overwrite(source_path, filename);
+      RAFT_LOG_INFO(
+        "Copied layered HNSW index from %s to %s", source_path.c_str(), filename.c_str());
+      return;
+    }
+
     // Copy the file to the new location
     std::filesystem::copy_file(
       source_path, filename, std::filesystem::copy_options::overwrite_existing);
@@ -1241,6 +2031,202 @@ void serialize(raft::resources const& res, const std::string& filename, const in
   hnswlib_index->saveIndex(filename);
 }
 
+template <typename T>
+auto deserialize_layered_hnsw(raft::resources const& res,
+                              const index_params& params,
+                              const std::string& artifact_path,
+                              int dim,
+                              cuvs::distance::DistanceType metric) -> std::unique_ptr<index<T>>
+{
+  common::nvtx::range<common::nvtx::domain::cuvs> fun_scope("hnsw::deserialize_layered");
+  cuvs::util::file_descriptor artifact_fd(artifact_path, O_RDONLY);
+  layered_hnsw_file_header header{};
+  cuvs::util::read_large_file(artifact_fd, &header, sizeof(header), 0);
+  RAFT_EXPECTS(std::strncmp(header.magic, layered_hnsw_magic, sizeof(header.magic)) == 0,
+               "Invalid layered HNSW artifact magic: %s",
+               artifact_path.c_str());
+  RAFT_EXPECTS(header.version == layered_hnsw_version,
+               "Unsupported layered HNSW artifact version %u",
+               header.version);
+  RAFT_EXPECTS(header.metadata_offset >= sizeof(layered_hnsw_file_header),
+               "Invalid layered HNSW metadata offset");
+  RAFT_EXPECTS(header.metadata_size > 0, "Invalid layered HNSW metadata size");
+
+  std::string metadata_json(header.metadata_size, '\0');
+  cuvs::util::read_large_file(
+    artifact_fd, metadata_json.data(), metadata_json.size(), header.metadata_offset);
+  const auto metadata = parse_layered_hnsw_metadata(metadata_json);
+
+  RAFT_EXPECTS(metadata.n_rows > 0, "Layered HNSW artifact must contain at least one row");
+  RAFT_EXPECTS(static_cast<size_t>(dim) == metadata.dim,
+               "Layered HNSW artifact dim (%zu) does not match requested dim (%d)",
+               metadata.dim,
+               dim);
+  RAFT_EXPECTS(metadata.layers.size() == static_cast<size_t>(metadata.maxlevel),
+               "Layered HNSW artifact has %zu upper layers, expected %d",
+               metadata.layers.size(),
+               metadata.maxlevel);
+
+  RAFT_EXPECTS(!params.dataset_path.empty(),
+               "Layered HNSW deserialization requires index_params.dataset_path");
+
+  const auto payload_offset =
+    align_up(header.metadata_offset + header.metadata_size, layered_hnsw_alignment);
+  const auto levels_offset      = payload_offset;
+  const auto base_links_offset  = levels_offset + metadata.levels_bytes;
+  const auto upper_nodes_offset = base_links_offset + metadata.base_links_bytes;
+  const auto upper_links_offset = upper_nodes_offset + metadata.upper_nodes_bytes;
+  const auto expected_file_size = upper_links_offset + metadata.upper_links_bytes;
+  const auto artifact_size      = static_cast<size_t>(std::filesystem::file_size(artifact_path));
+  RAFT_EXPECTS(artifact_size >= expected_file_size,
+               "Layered HNSW artifact is truncated: expected at least %zu bytes, got %zu",
+               expected_file_size,
+               artifact_size);
+
+  auto dataset_file = open_layered_dataset_file<T>(params.dataset_path);
+
+  RAFT_EXPECTS(dataset_file.shape.size() == 2 && dataset_file.shape[0] == metadata.n_rows &&
+                 dataset_file.shape[1] == metadata.dim,
+               "Layered HNSW dataset shape mismatch");
+  RAFT_EXPECTS(metadata.levels_bytes == metadata.n_rows * sizeof(uint8_t),
+               "Layered HNSW levels section size mismatch");
+  RAFT_EXPECTS(metadata.base_links_bytes == metadata.n_rows * metadata.base_link_row_bytes,
+               "Layered HNSW base links section size mismatch");
+  RAFT_EXPECTS(metadata.upper_nodes_bytes == metadata.upper_nodes_count * sizeof(uint32_t),
+               "Layered HNSW upper node section size mismatch");
+  RAFT_EXPECTS(
+    metadata.upper_links_bytes == metadata.upper_nodes_count * metadata.upper_link_row_bytes,
+    "Layered HNSW upper link section size mismatch");
+  RAFT_EXPECTS(metadata.base_degree <= metadata.maxM0,
+               "Layered HNSW base degree (%zu) exceeds maxM0 (%zu)",
+               metadata.base_degree,
+               metadata.maxM0);
+
+  std::vector<uint8_t> levels_u8(metadata.n_rows);
+  cuvs::util::read_large_file(artifact_fd, levels_u8.data(), metadata.levels_bytes, levels_offset);
+  const auto max_level_in_levels = *std::max_element(levels_u8.begin(), levels_u8.end());
+  RAFT_EXPECTS(static_cast<int>(max_level_in_levels) == metadata.maxlevel,
+               "Layered HNSW levels max level (%d) does not match artifact maxlevel (%d)",
+               static_cast<int>(max_level_in_levels),
+               metadata.maxlevel);
+
+  auto hnsw_index = std::make_unique<index_impl<T>>(dim, metric, params.hierarchy);
+  auto appr_algo  = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
+    hnsw_index->get_space(), metadata.n_rows, metadata.M, metadata.ef_construction);
+  appr_algo->cur_element_count = metadata.n_rows;
+  appr_algo->maxlevel_         = metadata.maxlevel;
+  appr_algo->enterpoint_node_  = metadata.enterpoint_node;
+  std::fill_n(appr_algo->linkLists_, metadata.n_rows, nullptr);
+  RAFT_EXPECTS(appr_algo->size_links_level0_ == metadata.base_link_row_bytes,
+               "Layered HNSW base link row size mismatch");
+  RAFT_EXPECTS(appr_algo->size_links_per_element_ == metadata.upper_link_row_bytes,
+               "Layered HNSW upper link row size mismatch");
+  RAFT_EXPECTS(appr_algo->maxM0_ == metadata.maxM0 && appr_algo->maxM_ == metadata.maxM,
+               "Layered HNSW M parameter mismatch");
+
+  auto num_threads =
+    params.num_threads == 0 ? cuvs::core::omp::get_max_threads() : params.num_threads;
+
+  RAFT_LOG_INFO("Layered HNSW: initializing in-memory hnswlib index from local dataset");
+  const size_t target_batch_bytes = 64 * 1024 * 1024;
+  const size_t row_bytes          = metadata.dim * sizeof(T) + metadata.base_link_row_bytes;
+  const size_t batch_size         = std::max<size_t>(1, target_batch_bytes / row_bytes);
+  auto dataset_buffer =
+    raft::make_host_matrix<T, int64_t>(static_cast<int64_t>(batch_size), metadata.dim);
+  std::vector<char> base_link_buffer(batch_size * metadata.base_link_row_bytes);
+
+  for (size_t batch_start = 0; batch_start < metadata.n_rows; batch_start += batch_size) {
+    const auto current_batch_size = std::min(batch_size, metadata.n_rows - batch_start);
+    cuvs::util::read_large_file(dataset_file.fd,
+                                dataset_buffer.data_handle(),
+                                current_batch_size * metadata.dim * sizeof(T),
+                                dataset_file.header_size + batch_start * metadata.dim * sizeof(T));
+    cuvs::util::read_large_file(artifact_fd,
+                                base_link_buffer.data(),
+                                current_batch_size * metadata.base_link_row_bytes,
+                                base_links_offset + batch_start * metadata.base_link_row_bytes);
+
+    bool link_list_allocation_failed = false;
+#pragma omp parallel for num_threads(num_threads) reduction(|| : link_list_allocation_failed)
+    for (int64_t batch_idx = 0; batch_idx < static_cast<int64_t>(current_batch_size); ++batch_idx) {
+      const auto i                  = batch_start + static_cast<size_t>(batch_idx);
+      auto level                    = static_cast<int32_t>(levels_u8[i]);
+      appr_algo->element_levels_[i] = level;
+      auto ll0                      = appr_algo->get_linklist0(i);
+      memcpy(ll0,
+             base_link_buffer.data() + batch_idx * metadata.base_link_row_bytes,
+             metadata.base_link_row_bytes);
+      memcpy(appr_algo->getDataByInternalId(i),
+             dataset_buffer.data_handle() + batch_idx * metadata.dim,
+             appr_algo->data_size_);
+      *appr_algo->getExternalLabeLp(i) = static_cast<hnswlib::labeltype>(i);
+      if (level > 0) {
+        const auto link_list_size = appr_algo->size_links_per_element_ * level + 1;
+        appr_algo->linkLists_[i]  = static_cast<char*>(malloc(link_list_size));
+        if (appr_algo->linkLists_[i] == nullptr) {
+          link_list_allocation_failed = true;
+          continue;
+        }
+        memset(appr_algo->linkLists_[i], 0, link_list_size);
+      }
+    }
+    if (link_list_allocation_failed) {
+      for (size_t i = 0; i < metadata.n_rows; ++i) {
+        free(appr_algo->linkLists_[i]);
+        appr_algo->linkLists_[i] = nullptr;
+      }
+      throw std::runtime_error("Not enough memory to allocate HNSW upper linklists");
+    }
+  }
+
+  RAFT_LOG_INFO("Layered HNSW: loading upper-layer topology");
+  for (const auto& layer : metadata.layers) {
+    RAFT_LOG_INFO("Layered HNSW: loading layer %zu (%zu rows, degree %zu)",
+                  layer.level,
+                  layer.row_count,
+                  layer.degree);
+    const auto layer_batch_size = std::min(batch_size, layer.row_count);
+    std::vector<uint32_t> node_buffer(layer_batch_size);
+    std::vector<char> link_buffer(layer_batch_size * metadata.upper_link_row_bytes);
+    for (size_t batch_start = 0; batch_start < layer.row_count; batch_start += layer_batch_size) {
+      const auto current_batch_size = std::min(layer_batch_size, layer.row_count - batch_start);
+      cuvs::util::read_large_file(
+        artifact_fd,
+        node_buffer.data(),
+        current_batch_size * sizeof(uint32_t),
+        upper_nodes_offset + (layer.node_offset + batch_start) * sizeof(uint32_t));
+      cuvs::util::read_large_file(
+        artifact_fd,
+        link_buffer.data(),
+        current_batch_size * metadata.upper_link_row_bytes,
+        upper_links_offset + (layer.link_offset + batch_start) * metadata.upper_link_row_bytes);
+      for (int64_t batch_idx = 0; batch_idx < static_cast<int64_t>(current_batch_size);
+           ++batch_idx) {
+        const auto node_id = static_cast<size_t>(node_buffer[batch_idx]);
+        RAFT_EXPECTS(node_id < metadata.n_rows,
+                     "Invalid upper-layer node id %zu in layered HNSW artifact",
+                     node_id);
+        RAFT_EXPECTS(layer.level <= static_cast<size_t>(levels_u8[node_id]),
+                     "Layered HNSW artifact references node %zu at invalid level %zu",
+                     node_id,
+                     layer.level);
+      }
+#pragma omp parallel for num_threads(num_threads)
+      for (int64_t batch_idx = 0; batch_idx < static_cast<int64_t>(current_batch_size);
+           ++batch_idx) {
+        const auto node_id = static_cast<size_t>(node_buffer[batch_idx]);
+        auto ll            = appr_algo->get_linklist(node_id, layer.level);
+        memcpy(ll,
+               link_buffer.data() + batch_idx * metadata.upper_link_row_bytes,
+               metadata.upper_link_row_bytes);
+      }
+    }
+  }
+
+  hnsw_index->set_index(std::move(appr_algo));
+  return hnsw_index;
+}
+
 template <typename T>
 void deserialize(raft::resources const& res,
                  const index_params& params,
@@ -1249,6 +2235,12 @@ void deserialize(raft::resources const& res,
                  cuvs::distance::DistanceType metric,
                  index<T>** idx)
 {
+  if (params.hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK) {
+    auto hnsw_index = deserialize_layered_hnsw<T>(res, params, filename, dim, metric);
+    *idx            = hnsw_index.release();
+    return;
+  }
+
   try {
     auto hnsw_index = std::make_unique<index_impl<T>>(dim, metric, params.hierarchy);
     auto appr_algo  = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
@@ -1286,6 +2278,13 @@ std::unique_ptr<index<T>> build(raft::resources const& res,
       ? std::get<graph_build_params::ace_params>(params.graph_build_params)
       : graph_build_params::ace_params{};
 
+  if (params.hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK) {
+    RAFT_EXPECTS(ace_params.use_disk,
+                 "GPU_LAYERED_ON_DISK requires ACE disk mode (ace_params.use_disk = true)");
+    RAFT_EXPECTS(!ace_params.build_dir.empty(),
+                 "GPU_LAYERED_ON_DISK requires ace_params.build_dir to be set");
+  }
+
   // Create CAGRA index parameters from HNSW parameters
   cuvs::neighbors::cagra::index_params cagra_params;
   cagra_params.metric                    = params.metric;
diff --git a/cpp/tests/neighbors/ann_hnsw_ace.cuh b/cpp/tests/neighbors/ann_hnsw_ace.cuh
index 78ae54d9fc..46caef7ae1 100644
--- a/cpp/tests/neighbors/ann_hnsw_ace.cuh
+++ b/cpp/tests/neighbors/ann_hnsw_ace.cuh
@@ -7,9 +7,16 @@
 #include "ann_cagra.cuh"
 
 #include <cuvs/neighbors/hnsw.hpp>
+#include <cuvs/util/file_io.hpp>
 
+#include <array>
+#include <cstdint>
 #include <cstdio>
+#include <ctime>
+#include <exception>
 #include <filesystem>
+#include <fstream>
+#include <memory>
 
 namespace cuvs::neighbors::hnsw {
 
@@ -259,6 +266,238 @@ class AnnHnswAceTest : public ::testing::TestWithParam<AnnHnswAceInputs> {
     std::filesystem::remove_all(temp_dir);
   }
 
+  void testHnswAceLayeredBuildDeserializeSearch()
+  {
+    size_t queries_size = ps.n_queries * ps.k;
+    std::vector<IdxT> indexes_naive(queries_size);
+    std::vector<DistanceT> distances_naive(queries_size);
+
+    {
+      rmm::device_uvector<DistanceT> distances_naive_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indexes_naive_dev(queries_size, stream_);
+
+      cuvs::neighbors::naive_knn<DistanceT, DataT, IdxT>(handle_,
+                                                         distances_naive_dev.data(),
+                                                         indexes_naive_dev.data(),
+                                                         search_queries.data(),
+                                                         database_dev.data(),
+                                                         ps.n_queries,
+                                                         ps.n_rows,
+                                                         ps.dim,
+                                                         ps.k,
+                                                         ps.metric);
+      raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_);
+      raft::update_host(indexes_naive.data(), indexes_naive_dev.data(), queries_size, stream_);
+      raft::resource::sync_stream(handle_);
+    }
+
+    std::string temp_dir = std::string("/tmp/cuvs_hnsw_ace_layered_test_") +
+                           std::to_string(std::time(nullptr)) + "_" +
+                           std::to_string(reinterpret_cast<uintptr_t>(this));
+    std::filesystem::create_directories(temp_dir);
+    struct temp_dir_cleanup {
+      std::string path;
+      ~temp_dir_cleanup()
+      {
+        std::error_code ec;
+        std::filesystem::remove_all(path, ec);
+      }
+    } cleanup{temp_dir};
+
+    auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
+    raft::copy(database_host.data_handle(), database_dev.data(), ps.n_rows * ps.dim, stream_);
+    auto queries_host = raft::make_host_matrix<DataT, int64_t>(ps.n_queries, ps.dim);
+    raft::copy(queries_host.data_handle(), search_queries.data(), ps.n_queries * ps.dim, stream_);
+    raft::resource::sync_stream(handle_);
+
+    const auto dataset_file = (std::filesystem::path(temp_dir) / "dataset.npy").string();
+    auto [dataset_fd, dataset_header_size] = cuvs::util::create_numpy_file<DataT>(
+      dataset_file, {static_cast<size_t>(ps.n_rows), static_cast<size_t>(ps.dim)});
+    cuvs::util::write_large_file(dataset_fd,
+                                 database_host.data_handle(),
+                                 static_cast<size_t>(ps.n_rows) * ps.dim * sizeof(DataT),
+                                 dataset_header_size);
+
+    hnsw::index_params hnsw_params;
+    hnsw_params.metric          = ps.metric;
+    hnsw_params.hierarchy       = hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK;
+    hnsw_params.M               = 32;
+    hnsw_params.ef_construction = ps.ef_construction;
+    hnsw_params.dataset_path    = dataset_file;
+
+    auto ace_params                = graph_build_params::ace_params();
+    ace_params.npartitions         = ps.npartitions;
+    ace_params.build_dir           = temp_dir;
+    ace_params.use_disk            = true;
+    ace_params.max_host_memory_gb  = ps.max_host_memory_gb;
+    ace_params.max_gpu_memory_gb   = ps.max_gpu_memory_gb;
+    hnsw_params.graph_build_params = ace_params;
+
+    auto hnsw_index =
+      hnsw::build(handle_, hnsw_params, raft::make_const_mdspan(database_host.view()));
+    ASSERT_NE(hnsw_index, nullptr);
+
+    const auto artifact_path = hnsw_index->file_path();
+    ASSERT_FALSE(artifact_path.empty());
+    ASSERT_TRUE(std::filesystem::is_regular_file(artifact_path));
+    EXPECT_EQ(std::filesystem::path(artifact_path).filename().string(), "hnsw_index.cuvs");
+    size_t cuvs_artifact_count = 0;
+    for (const auto& entry : std::filesystem::directory_iterator(temp_dir)) {
+      if (entry.path().extension() == ".cuvs") { ++cuvs_artifact_count; }
+    }
+    EXPECT_EQ(cuvs_artifact_count, 1);
+    EXPECT_FALSE(std::filesystem::exists(std::filesystem::path(temp_dir) / "layered_hnsw"));
+
+    auto indexes_hnsw_host   = raft::make_host_matrix<uint64_t, int64_t>(ps.n_queries, ps.k);
+    auto distances_hnsw_host = raft::make_host_matrix<DistanceT, int64_t>(ps.n_queries, ps.k);
+
+    hnsw::search_params search_params;
+    search_params.ef          = std::max(ps.ef_construction, ps.k * 2);
+    search_params.num_threads = 1;
+
+    EXPECT_THROW(hnsw::search(handle_,
+                              search_params,
+                              *hnsw_index,
+                              queries_host.view(),
+                              indexes_hnsw_host.view(),
+                              distances_hnsw_host.view()),
+                 std::exception);
+
+    hnsw::index<DataT>* deserialized_index = nullptr;
+    hnsw::deserialize(handle_, hnsw_params, artifact_path, ps.dim, ps.metric, &deserialized_index);
+    ASSERT_NE(deserialized_index, nullptr);
+    std::unique_ptr<hnsw::index<DataT>> deserialized_guard(deserialized_index);
+
+    hnsw::search(handle_,
+                 search_params,
+                 *deserialized_guard,
+                 queries_host.view(),
+                 indexes_hnsw_host.view(),
+                 distances_hnsw_host.view());
+
+    std::vector<IdxT> indexes_hnsw_converted(queries_size);
+    std::vector<DistanceT> distances_hnsw(queries_size);
+    for (size_t i = 0; i < queries_size; i++) {
+      indexes_hnsw_converted[i] = static_cast<IdxT>(indexes_hnsw_host.data_handle()[i]);
+      distances_hnsw[i]         = distances_hnsw_host.data_handle()[i];
+    }
+
+    EXPECT_TRUE(cuvs::neighbors::eval_neighbours(indexes_naive,
+                                                 indexes_hnsw_converted,
+                                                 distances_naive,
+                                                 distances_hnsw,
+                                                 ps.n_queries,
+                                                 ps.k,
+                                                 0.003,
+                                                 ps.min_recall))
+      << "Layered HNSW deserialize and search failed recall check";
+
+    const auto copied_artifact =
+      (std::filesystem::path(temp_dir) / "copied_layered" / "hnsw_index.cuvs").string();
+    hnsw::serialize(handle_, copied_artifact, *hnsw_index);
+    EXPECT_TRUE(std::filesystem::is_regular_file(copied_artifact));
+    size_t copied_file_count = 0;
+    for (const auto& entry : std::filesystem::directory_iterator(
+           std::filesystem::path(copied_artifact).parent_path())) {
+      if (entry.is_regular_file()) { ++copied_file_count; }
+    }
+    EXPECT_EQ(copied_file_count, 1);
+
+    hnsw::index<DataT>* copied_index = nullptr;
+    hnsw::deserialize(handle_, hnsw_params, copied_artifact, ps.dim, ps.metric, &copied_index);
+    ASSERT_NE(copied_index, nullptr);
+    std::unique_ptr<hnsw::index<DataT>> copied_guard(copied_index);
+
+    hnsw::search(handle_,
+                 search_params,
+                 *copied_guard,
+                 queries_host.view(),
+                 indexes_hnsw_host.view(),
+                 distances_hnsw_host.view());
+
+    for (size_t i = 0; i < queries_size; i++) {
+      indexes_hnsw_converted[i] = static_cast<IdxT>(indexes_hnsw_host.data_handle()[i]);
+      distances_hnsw[i]         = distances_hnsw_host.data_handle()[i];
+    }
+    EXPECT_TRUE(cuvs::neighbors::eval_neighbours(indexes_naive,
+                                                 indexes_hnsw_converted,
+                                                 distances_naive,
+                                                 distances_hnsw,
+                                                 ps.n_queries,
+                                                 ps.k,
+                                                 0.003,
+                                                 ps.min_recall))
+      << "Copied layered HNSW artifact deserialize and search failed recall check";
+
+    const auto bad_artifact =
+      (std::filesystem::path(temp_dir) / "bad_layered" / "bad_magic.cuvs").string();
+    std::filesystem::create_directories(std::filesystem::path(bad_artifact).parent_path());
+    {
+      std::ofstream bad_file(bad_artifact, std::ios::binary);
+      std::array<char, 64> bad_bytes{};
+      bad_file.write(bad_bytes.data(), bad_bytes.size());
+    }
+    hnsw::index<DataT>* bad_index = nullptr;
+    EXPECT_THROW(
+      hnsw::deserialize(handle_, hnsw_params, bad_artifact, ps.dim, ps.metric, &bad_index),
+      std::exception);
+
+    const auto bad_version_artifact =
+      (std::filesystem::path(temp_dir) / "bad_layered" / "bad_version.cuvs").string();
+    std::filesystem::copy_file(
+      copied_artifact, bad_version_artifact, std::filesystem::copy_options::overwrite_existing);
+    constexpr std::streamoff layered_header_version_offset = 32;
+    {
+      std::fstream bad_version_file(bad_version_artifact,
+                                    std::ios::in | std::ios::out | std::ios::binary);
+      const uint32_t bad_version = 999;
+      bad_version_file.seekp(layered_header_version_offset);
+      bad_version_file.write(reinterpret_cast<const char*>(&bad_version), sizeof(bad_version));
+    }
+    hnsw::index<DataT>* bad_version_index = nullptr;
+    EXPECT_THROW(
+      hnsw::deserialize(
+        handle_, hnsw_params, bad_version_artifact, ps.dim, ps.metric, &bad_version_index),
+      std::exception);
+
+    auto missing_dataset_params = hnsw_params;
+    missing_dataset_params.dataset_path.clear();
+    hnsw::index<DataT>* missing_dataset_index = nullptr;
+    EXPECT_THROW(hnsw::deserialize(handle_,
+                                   missing_dataset_params,
+                                   copied_artifact,
+                                   ps.dim,
+                                   ps.metric,
+                                   &missing_dataset_index),
+                 std::exception);
+
+    const auto truncated_artifact =
+      (std::filesystem::path(temp_dir) / "bad_layered" / "truncated.cuvs").string();
+    std::filesystem::copy_file(
+      copied_artifact, truncated_artifact, std::filesystem::copy_options::overwrite_existing);
+    std::filesystem::resize_file(truncated_artifact, 128);
+    hnsw::index<DataT>* truncated_index = nullptr;
+    EXPECT_THROW(hnsw::deserialize(
+                   handle_, hnsw_params, truncated_artifact, ps.dim, ps.metric, &truncated_index),
+                 std::exception);
+
+    const auto wrong_dataset_file =
+      (std::filesystem::path(temp_dir) / "bad_layered" / "wrong_dataset.npy").string();
+    auto [wrong_dataset_fd, wrong_dataset_header_size] = cuvs::util::create_numpy_file<DataT>(
+      wrong_dataset_file, {static_cast<size_t>(ps.n_rows - 1), static_cast<size_t>(ps.dim)});
+    cuvs::util::write_large_file(wrong_dataset_fd,
+                                 database_host.data_handle(),
+                                 static_cast<size_t>(ps.n_rows - 1) * ps.dim * sizeof(DataT),
+                                 wrong_dataset_header_size);
+    auto wrong_dataset_params               = hnsw_params;
+    wrong_dataset_params.dataset_path       = wrong_dataset_file;
+    hnsw::index<DataT>* wrong_dataset_index = nullptr;
+    EXPECT_THROW(
+      hnsw::deserialize(
+        handle_, wrong_dataset_params, copied_artifact, ps.dim, ps.metric, &wrong_dataset_index),
+      std::exception);
+  }
+
   void SetUp() override
   {
     database_dev.resize(((size_t)ps.n_rows) * ps.dim, stream_);
@@ -323,8 +562,26 @@ inline std::vector<AnnHnswAceInputs> generate_hnsw_ace_memory_fallback_inputs()
   };
 }
 
+inline std::vector<AnnHnswAceInputs> generate_hnsw_ace_layered_inputs()
+{
+  return {
+    {10,    // n_queries
+     5000,  // n_rows
+     64,    // dim
+     10,    // k
+     2,     // npartitions
+     100,   // ef_construction
+     true,  // use_disk
+     cuvs::distance::DistanceType::L2Expanded,
+     0.9,  // min_recall
+     0.0,  // max_host_memory_gb
+     0.0}  // max_gpu_memory_gb
+  };
+}
+
 const std::vector<AnnHnswAceInputs> hnsw_ace_inputs = generate_hnsw_ace_inputs();
 const std::vector<AnnHnswAceInputs> hnsw_ace_memory_fallback_inputs =
   generate_hnsw_ace_memory_fallback_inputs();
+const std::vector<AnnHnswAceInputs> hnsw_ace_layered_inputs = generate_hnsw_ace_layered_inputs();
 
 }  // namespace cuvs::neighbors::hnsw
diff --git a/cpp/tests/neighbors/ann_hnsw_ace/test_float_uint32_t.cu b/cpp/tests/neighbors/ann_hnsw_ace/test_float_uint32_t.cu
index 57b16d74e9..97a6255300 100644
--- a/cpp/tests/neighbors/ann_hnsw_ace/test_float_uint32_t.cu
+++ b/cpp/tests/neighbors/ann_hnsw_ace/test_float_uint32_t.cu
@@ -23,4 +23,14 @@ INSTANTIATE_TEST_CASE_P(AnnHnswAceMemoryFallbackTest,
                         AnnHnswAceMemoryFallbackTest_float,
                         ::testing::ValuesIn(hnsw_ace_memory_fallback_inputs));
 
+typedef AnnHnswAceTest<float, float, uint32_t> AnnHnswAceLayeredTest_float;
+TEST_P(AnnHnswAceLayeredTest_float, AnnHnswAceLayeredBuildDeserializeSearch)
+{
+  this->testHnswAceLayeredBuildDeserializeSearch();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceLayeredTest,
+                        AnnHnswAceLayeredTest_float,
+                        ::testing::ValuesIn(hnsw_ace_layered_inputs));
+
 }  // namespace cuvs::neighbors::hnsw
diff --git a/cpp/tests/neighbors/ann_hnsw_ace/test_half_uint32_t.cu b/cpp/tests/neighbors/ann_hnsw_ace/test_half_uint32_t.cu
index 38f75b2afa..c77391ba28 100644
--- a/cpp/tests/neighbors/ann_hnsw_ace/test_half_uint32_t.cu
+++ b/cpp/tests/neighbors/ann_hnsw_ace/test_half_uint32_t.cu
@@ -12,4 +12,24 @@ TEST_P(AnnHnswAceTest_half, AnnHnswAceBuild) { this->testHnswAceBuild(); }
 
 INSTANTIATE_TEST_CASE_P(AnnHnswAceTest, AnnHnswAceTest_half, ::testing::ValuesIn(hnsw_ace_inputs));
 
+typedef AnnHnswAceTest<float, half, uint32_t> AnnHnswAceMemoryFallbackTest_half;
+TEST_P(AnnHnswAceMemoryFallbackTest_half, AnnHnswAceMemoryLimitFallback)
+{
+  this->testHnswAceMemoryLimitFallback();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceMemoryFallbackTest,
+                        AnnHnswAceMemoryFallbackTest_half,
+                        ::testing::ValuesIn(hnsw_ace_memory_fallback_inputs));
+
+typedef AnnHnswAceTest<float, half, uint32_t> AnnHnswAceLayeredTest_half;
+TEST_P(AnnHnswAceLayeredTest_half, AnnHnswAceLayeredBuildDeserializeSearch)
+{
+  this->testHnswAceLayeredBuildDeserializeSearch();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceLayeredTest,
+                        AnnHnswAceLayeredTest_half,
+                        ::testing::ValuesIn(hnsw_ace_layered_inputs));
+
 }  // namespace cuvs::neighbors::hnsw
diff --git a/cpp/tests/neighbors/ann_hnsw_ace/test_int8_t_uint32_t.cu b/cpp/tests/neighbors/ann_hnsw_ace/test_int8_t_uint32_t.cu
index 279df6555f..4ee6ecc5e5 100644
--- a/cpp/tests/neighbors/ann_hnsw_ace/test_int8_t_uint32_t.cu
+++ b/cpp/tests/neighbors/ann_hnsw_ace/test_int8_t_uint32_t.cu
@@ -14,4 +14,24 @@ INSTANTIATE_TEST_CASE_P(AnnHnswAceTest,
                         AnnHnswAceTest_int8_t,
                         ::testing::ValuesIn(hnsw_ace_inputs));
 
+typedef AnnHnswAceTest<float, int8_t, uint32_t> AnnHnswAceMemoryFallbackTest_int8_t;
+TEST_P(AnnHnswAceMemoryFallbackTest_int8_t, AnnHnswAceMemoryLimitFallback)
+{
+  this->testHnswAceMemoryLimitFallback();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceMemoryFallbackTest,
+                        AnnHnswAceMemoryFallbackTest_int8_t,
+                        ::testing::ValuesIn(hnsw_ace_memory_fallback_inputs));
+
+typedef AnnHnswAceTest<float, int8_t, uint32_t> AnnHnswAceLayeredTest_int8_t;
+TEST_P(AnnHnswAceLayeredTest_int8_t, AnnHnswAceLayeredBuildDeserializeSearch)
+{
+  this->testHnswAceLayeredBuildDeserializeSearch();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceLayeredTest,
+                        AnnHnswAceLayeredTest_int8_t,
+                        ::testing::ValuesIn(hnsw_ace_layered_inputs));
+
 }  // namespace cuvs::neighbors::hnsw
diff --git a/cpp/tests/neighbors/ann_hnsw_ace/test_uint8_t_uint32_t.cu b/cpp/tests/neighbors/ann_hnsw_ace/test_uint8_t_uint32_t.cu
index 7e68dc4b17..1ef791df8e 100644
--- a/cpp/tests/neighbors/ann_hnsw_ace/test_uint8_t_uint32_t.cu
+++ b/cpp/tests/neighbors/ann_hnsw_ace/test_uint8_t_uint32_t.cu
@@ -14,4 +14,24 @@ INSTANTIATE_TEST_CASE_P(AnnHnswAceTest,
                         AnnHnswAceTest_uint8_t,
                         ::testing::ValuesIn(hnsw_ace_inputs));
 
+typedef AnnHnswAceTest<float, uint8_t, uint32_t> AnnHnswAceMemoryFallbackTest_uint8_t;
+TEST_P(AnnHnswAceMemoryFallbackTest_uint8_t, AnnHnswAceMemoryLimitFallback)
+{
+  this->testHnswAceMemoryLimitFallback();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceMemoryFallbackTest,
+                        AnnHnswAceMemoryFallbackTest_uint8_t,
+                        ::testing::ValuesIn(hnsw_ace_memory_fallback_inputs));
+
+typedef AnnHnswAceTest<float, uint8_t, uint32_t> AnnHnswAceLayeredTest_uint8_t;
+TEST_P(AnnHnswAceLayeredTest_uint8_t, AnnHnswAceLayeredBuildDeserializeSearch)
+{
+  this->testHnswAceLayeredBuildDeserializeSearch();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceLayeredTest,
+                        AnnHnswAceLayeredTest_uint8_t,
+                        ::testing::ValuesIn(hnsw_ace_layered_inputs));
+
 }  // namespace cuvs::neighbors::hnsw
diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt
index 034b0b3d96..930f898266 100644
--- a/examples/cpp/CMakeLists.txt
+++ b/examples/cpp/CMakeLists.txt
@@ -35,6 +35,7 @@ add_executable(CAGRA_HNSW_ACE_EXAMPLE src/cagra_hnsw_ace_example.cu)
 add_executable(CAGRA_PERSISTENT_EXAMPLE src/cagra_persistent_example.cu)
 add_executable(DYNAMIC_BATCHING_EXAMPLE src/dynamic_batching_example.cu)
 add_executable(HNSW_ACE_EXAMPLE src/hnsw_ace_example.cu)
+add_executable(HNSW_ACE_LAYERED_EXAMPLE src/hnsw_ace_layered_example.cu)
 add_executable(IVF_FLAT_EXAMPLE src/ivf_flat_example.cu)
 add_executable(IVF_PQ_EXAMPLE src/ivf_pq_example.cu)
 add_executable(VAMANA_EXAMPLE src/vamana_example.cu)
@@ -52,6 +53,7 @@ target_link_libraries(
   DYNAMIC_BATCHING_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> Threads::Threads
 )
 target_link_libraries(HNSW_ACE_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
+target_link_libraries(HNSW_ACE_LAYERED_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
 target_link_libraries(IVF_PQ_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
 target_link_libraries(IVF_FLAT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
 target_link_libraries(VAMANA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
diff --git a/examples/cpp/src/hnsw_ace_layered_example.cu b/examples/cpp/src/hnsw_ace_layered_example.cu
new file mode 100644
index 0000000000..02f735ef37
--- /dev/null
+++ b/examples/cpp/src/hnsw_ace_layered_example.cu
@@ -0,0 +1,278 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Large graphs exceeding the memory capacity can be built using the Augmented Core Extraction (ACE)
+// algorithm, which partitions the dataset. The resulting HNSW index is too large to fit in memory
+// as well. Thus, the index needs to be transferred to a search server with enough memory.
+//
+// HNSWHierarchy::GPU_LAYERED_ON_DISK is a special hierarchy that builds a layered HNSW index on
+// disk. It emits one topology-only artifact, hnsw_index.cuvs. The dataset remains separate and does
+// not need to be transferred to the search server, which typically has the dataset locally.
+//
+// This example demonstrates how to build a layered HNSW index with ACE:
+//
+// 1. Optionally quantize the dataset and queries to int8.
+// 2. Build a single-file layered HNSW artifact with ACE using hnsw::build.
+// 3. Deserialize the layered HNSW artifact using hnsw::deserialize.
+// 4. Search the in-memory HNSW index.
+//
+// Layered-on-disk layout:
+//
+//   index_dir/hnsw_index.cuvs
+//     fixed header + metadata JSON
+//     levels: uint8 [N], max HNSW level for each original row id
+//     base links: hnswlib-ready [count + uint32 neighbors padded to maxM0]
+//     upper nodes + upper links: hnswlib-ready upper-layer topology
+//
+// The transferred index artifact is topology-only. The dataset is loaded locally during
+// deserialization from hnsw::index_params::dataset_path. The loader supports .npy and ANN benchmark
+// *.bin datasets; this example writes a local dataset .npy only to make the demo self-contained.
+//
+// Layer 0 rows and neighbor IDs are original dataset row IDs. Upper layers are generated with the
+// same level/order/KNN logic as serialize_to_hnswlib_from_disk, then stored as hnswlib-ready link
+// rows so deserialization does no graph remapping or link padding on the search node.
+
+#include <algorithm>
+#include <cstdint>
+#include <filesystem>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <utility>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/mdspan.hpp>
+#include <raft/random/make_blobs.cuh>
+
+#include <cuvs/neighbors/hnsw.hpp>
+#include <cuvs/preprocessing/quantize/scalar.hpp>
+#include <cuvs/util/file_io.hpp>
+
+#include <rmm/mr/pool_memory_resource.hpp>
+
+#include "common.cuh"
+
+// When 1, scalar-quantize the float dataset to int8.
+#define HNSW_ACE_LAYERED_USE_QUANTIZATION 1
+
+namespace {
+
+constexpr const char* kBuildDir = "/tmp/hnsw_ace_layered";
+
+template <typename T>
+std::string write_local_dataset(raft::host_matrix_view<const T, int64_t> dataset,
+                                const std::string& path)
+{
+  auto [fd, header_size] = cuvs::util::create_numpy_file<T>(
+    path, {static_cast<size_t>(dataset.extent(0)), static_cast<size_t>(dataset.extent(1))});
+  cuvs::util::write_large_file(
+    fd, dataset.data_handle(), dataset.extent(0) * dataset.extent(1) * sizeof(T), header_size);
+  return path;
+}
+
+template <typename T>
+struct quantized_pair {
+  raft::host_matrix<T, int64_t> dataset;
+  raft::host_matrix<T, int64_t> queries;
+};
+
+quantized_pair<int8_t> quantize_dataset(raft::device_resources const& dev_resources,
+                                        raft::host_matrix_view<const float, int64_t> dataset_float,
+                                        raft::host_matrix_view<const float, int64_t> queries_float)
+{
+  std::cout << "  quantize_dataset: training scalar quantizer (float -> int8)" << std::endl;
+  cuvs::preprocessing::quantize::scalar::params qp;
+  auto quantizer = cuvs::preprocessing::quantize::scalar::train(dev_resources, qp, dataset_float);
+
+  auto dataset_i8 =
+    raft::make_host_matrix<int8_t, int64_t>(dataset_float.extent(0), dataset_float.extent(1));
+  cuvs::preprocessing::quantize::scalar::transform(
+    dev_resources, quantizer, dataset_float, dataset_i8.view());
+
+  auto queries_i8 =
+    raft::make_host_matrix<int8_t, int64_t>(queries_float.extent(0), queries_float.extent(1));
+  cuvs::preprocessing::quantize::scalar::transform(
+    dev_resources, quantizer, queries_float, queries_i8.view());
+
+  return {std::move(dataset_i8), std::move(queries_i8)};
+}
+
+auto make_hnsw_ace_params(const std::string& build_dir, const std::string& dataset_path)
+  -> cuvs::neighbors::hnsw::index_params
+{
+  using namespace cuvs::neighbors;
+
+  hnsw::index_params hnsw_params;
+  hnsw_params.metric          = cuvs::distance::DistanceType::L2Expanded;
+  hnsw_params.hierarchy       = hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK;
+  hnsw_params.M               = 32;
+  hnsw_params.dataset_path    = dataset_path;  // Override this path on the search server.
+  hnsw_params.ef_construction = 120;
+
+  auto ace_params                = hnsw::graph_build_params::ace_params();
+  ace_params.npartitions         = 4;
+  ace_params.use_disk            = true;
+  ace_params.build_dir           = build_dir;
+  hnsw_params.graph_build_params = ace_params;
+
+  return hnsw_params;
+}
+
+template <typename T>
+auto hnsw_build(raft::device_resources const& dev_resources,
+                const cuvs::neighbors::hnsw::index_params& hnsw_params,
+                raft::host_matrix_view<const T, int64_t> dataset) -> std::string
+{
+  using namespace cuvs::neighbors;
+
+  auto hnsw_index          = hnsw::build(dev_resources, hnsw_params, dataset);
+  const auto artifact_path = hnsw_index->file_path();
+  if (artifact_path.empty()) {
+    throw std::runtime_error("Expected layered HNSW build to return an artifact path.");
+  }
+  std::cout << "  hnsw_build: layered artifact written to " << artifact_path << std::endl;
+  return artifact_path;
+}
+
+template <typename T>
+auto hnsw_deserialize(raft::device_resources const& dev_resources,
+                      const cuvs::neighbors::hnsw::index_params& hnsw_params,
+                      const std::string& artifact_path,
+                      int64_t dim) -> std::unique_ptr<cuvs::neighbors::hnsw::index<T>>
+{
+  using namespace cuvs::neighbors;
+
+  hnsw::index<T>* deserialized_index = nullptr;
+  // Set params.dataset_path to the local dataset path to load the dataset from disk.
+  // hnsw_params.dataset_path = "/tmp/dataset.npy";
+  hnsw::deserialize(
+    dev_resources, hnsw_params, artifact_path, dim, hnsw_params.metric, &deserialized_index);
+  return std::unique_ptr<hnsw::index<T>>(deserialized_index);
+}
+
+template <typename T>
+void hnsw_search(raft::device_resources const& dev_resources,
+                 const cuvs::neighbors::hnsw::index<T>& hnsw_index,
+                 raft::host_matrix_view<const T, int64_t> queries,
+                 int64_t topk = 12)
+{
+  using namespace cuvs::neighbors;
+
+  const int64_t n_queries  = queries.extent(0);
+  auto indices_hnsw_host   = raft::make_host_matrix<uint64_t, int64_t>(n_queries, topk);
+  auto distances_hnsw_host = raft::make_host_matrix<float, int64_t>(n_queries, topk);
+
+  hnsw::search_params search_params;
+  search_params.ef          = std::max(200, static_cast<int>(topk) * 2);
+  search_params.num_threads = 1;
+
+  hnsw::search(dev_resources,
+               search_params,
+               hnsw_index,
+               queries,
+               indices_hnsw_host.view(),
+               distances_hnsw_host.view());
+
+  auto neighbors      = raft::make_device_matrix<uint32_t>(dev_resources, n_queries, topk);
+  auto distances      = raft::make_device_matrix<float>(dev_resources, n_queries, topk);
+  auto neighbors_host = raft::make_host_matrix<uint32_t, int64_t>(n_queries, topk);
+  for (int64_t i = 0; i < n_queries; ++i) {
+    for (int64_t j = 0; j < topk; ++j) {
+      neighbors_host(i, j) = static_cast<uint32_t>(indices_hnsw_host(i, j));
+    }
+  }
+
+  raft::copy(neighbors.data_handle(),
+             neighbors_host.data_handle(),
+             n_queries * topk,
+             raft::resource::get_cuda_stream(dev_resources));
+  raft::copy(distances.data_handle(),
+             distances_hnsw_host.data_handle(),
+             n_queries * topk,
+             raft::resource::get_cuda_stream(dev_resources));
+  raft::resource::sync_stream(dev_resources);
+
+  print_results(dev_resources, neighbors.view(), distances.view());
+}
+
+}  // namespace
+
+int main()
+{
+  raft::device_resources dev_resources;
+
+  rmm::mr::pool_memory_resource pool_mr(rmm::mr::get_current_device_resource_ref(),
+                                        1024 * 1024 * 1024ull);
+  rmm::mr::set_current_device_resource(pool_mr);
+
+#if HNSW_ACE_LAYERED_USE_QUANTIZATION
+  std::cout << "[stage 1] Generate and quantize dataset (float -> int8)" << std::endl;
+#else
+  std::cout << "[stage 1] Generate dataset (float)" << std::endl;
+#endif
+
+  int64_t n_samples = 10000;
+  int64_t n_dim     = 90;
+  int64_t n_queries = 10;
+  auto dataset      = raft::make_device_matrix<float, int64_t>(dev_resources, n_samples, n_dim);
+  auto queries      = raft::make_device_matrix<float, int64_t>(dev_resources, n_queries, n_dim);
+  generate_dataset(dev_resources, dataset.view(), queries.view());
+
+  auto dataset_host = raft::make_host_matrix<float, int64_t>(n_samples, n_dim);
+  auto queries_host = raft::make_host_matrix<float, int64_t>(n_queries, n_dim);
+  raft::copy(dataset_host.data_handle(),
+             dataset.data_handle(),
+             dataset.extent(0) * dataset.extent(1),
+             raft::resource::get_cuda_stream(dev_resources));
+  raft::copy(queries_host.data_handle(),
+             queries.data_handle(),
+             queries.extent(0) * queries.extent(1),
+             raft::resource::get_cuda_stream(dev_resources));
+  raft::resource::sync_stream(dev_resources);
+
+  auto dataset_host_view = raft::make_host_matrix_view<const float, int64_t, raft::row_major>(
+    dataset_host.data_handle(), n_samples, n_dim);
+  auto queries_host_view = raft::make_host_matrix_view<const float, int64_t, raft::row_major>(
+    queries_host.data_handle(), n_queries, n_dim);
+
+  std::filesystem::create_directories(kBuildDir);
+
+#if HNSW_ACE_LAYERED_USE_QUANTIZATION
+  auto q               = quantize_dataset(dev_resources, dataset_host_view, queries_host_view);
+  auto dataset_i8_view = raft::make_host_matrix_view<const int8_t, int64_t, raft::row_major>(
+    q.dataset.data_handle(), n_samples, n_dim);
+  auto queries_i8_view = raft::make_host_matrix_view<const int8_t, int64_t, raft::row_major>(
+    q.queries.data_handle(), n_queries, n_dim);
+  auto dataset_path = write_local_dataset(dataset_i8_view, std::string{kBuildDir} + "/dataset.npy");
+  auto hnsw_params  = make_hnsw_ace_params(kBuildDir, dataset_path);
+
+  std::cout << "[stage 2] Build layered HNSW index with ACE" << std::endl;
+  auto artifact_path = hnsw_build<int8_t>(dev_resources, hnsw_params, dataset_i8_view);
+
+  std::cout << "[stage 3] Deserialize layered HNSW index" << std::endl;
+  auto hnsw_index = hnsw_deserialize<int8_t>(dev_resources, hnsw_params, artifact_path, n_dim);
+
+  std::cout << "[stage 4] Search HNSW index" << std::endl;
+  hnsw_search<int8_t>(dev_resources, *hnsw_index, queries_i8_view);
+#else
+  auto dataset_path =
+    write_local_dataset(dataset_host_view, std::string{kBuildDir} + "/dataset.npy");
+  auto hnsw_params = make_hnsw_ace_params(kBuildDir, dataset_path);
+
+  std::cout << "[stage 2] Build layered HNSW index with ACE" << std::endl;
+  auto artifact_path = hnsw_build<float>(dev_resources, hnsw_params, dataset_host_view);
+
+  std::cout << "[stage 3] Deserialize layered HNSW index" << std::endl;
+  auto hnsw_index = hnsw_deserialize<float>(dev_resources, hnsw_params, artifact_path, n_dim);
+
+  std::cout << "[stage 4] Search HNSW index" << std::endl;
+  hnsw_search<float>(dev_resources, *hnsw_index, queries_host_view);
+#endif
+
+  return 0;
+}

From a11d9ee6ae0ee1d00f666fc90b40df2de09b64e8 Mon Sep 17 00:00:00 2001
From: Julian Miller <julianmiller@nvidia.com>
Date: Wed, 27 May 2026 10:01:25 +0200
Subject: [PATCH 2/6] Improve deserialization logging

---
 cpp/src/neighbors/detail/hnsw.hpp | 101 ++++++++++++++++++++++++++++--
 1 file changed, 97 insertions(+), 4 deletions(-)

diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp
index 44dc4e688e..c5b86076c9 100644
--- a/cpp/src/neighbors/detail/hnsw.hpp
+++ b/cpp/src/neighbors/detail/hnsw.hpp
@@ -389,6 +389,11 @@ inline auto elapsed_ms_since(std::chrono::steady_clock::time_point start) -> int
     .count();
 }
 
+inline auto elapsed_ms(std::chrono::steady_clock::duration elapsed) -> double
+{
+  return std::chrono::duration<double, std::milli>(elapsed).count();
+}
+
 inline auto to_gib(size_t bytes) -> double
 {
   return static_cast<double>(bytes) / (1024.0 * 1024.0 * 1024.0);
@@ -399,6 +404,13 @@ inline auto throughput_gib_per_s(size_t bytes, int64_t elapsed_ms) -> double
   return elapsed_ms > 0 ? to_gib(bytes) / (elapsed_ms / 1000.0) : 0.0;
 }
 
+inline auto throughput_gib_per_s(size_t bytes, std::chrono::steady_clock::duration elapsed)
+  -> double
+{
+  const auto elapsed_s = std::chrono::duration<double>(elapsed).count();
+  return elapsed_s > 0.0 ? to_gib(bytes) / elapsed_s : 0.0;
+}
+
 inline auto make_hnsw_level_plan_from_levels(size_t n_rows,
                                              std::vector<uint8_t>&& levels,
                                              bool build_reverse_order,
@@ -2039,6 +2051,8 @@ auto deserialize_layered_hnsw(raft::resources const& res,
                               cuvs::distance::DistanceType metric) -> std::unique_ptr<index<T>>
 {
   common::nvtx::range<common::nvtx::domain::cuvs> fun_scope("hnsw::deserialize_layered");
+  const auto total_start_time    = std::chrono::steady_clock::now();
+  const auto metadata_start_time = std::chrono::steady_clock::now();
   cuvs::util::file_descriptor artifact_fd(artifact_path, O_RDONLY);
   layered_hnsw_file_header header{};
   cuvs::util::read_large_file(artifact_fd, &header, sizeof(header), 0);
@@ -2055,7 +2069,8 @@ auto deserialize_layered_hnsw(raft::resources const& res,
   std::string metadata_json(header.metadata_size, '\0');
   cuvs::util::read_large_file(
     artifact_fd, metadata_json.data(), metadata_json.size(), header.metadata_offset);
-  const auto metadata = parse_layered_hnsw_metadata(metadata_json);
+  const auto metadata            = parse_layered_hnsw_metadata(metadata_json);
+  const auto metadata_elapsed_ms = elapsed_ms_since(metadata_start_time);
 
   RAFT_EXPECTS(metadata.n_rows > 0, "Layered HNSW artifact must contain at least one row");
   RAFT_EXPECTS(static_cast<size_t>(dim) == metadata.dim,
@@ -2082,8 +2097,15 @@ auto deserialize_layered_hnsw(raft::resources const& res,
                "Layered HNSW artifact is truncated: expected at least %zu bytes, got %zu",
                expected_file_size,
                artifact_size);
+  RAFT_LOG_INFO("Layered HNSW load: metadata read in %ld ms (rows=%zu dim=%zu artifact=%.2f GiB)",
+                metadata_elapsed_ms,
+                metadata.n_rows,
+                metadata.dim,
+                to_gib(artifact_size));
 
-  auto dataset_file = open_layered_dataset_file<T>(params.dataset_path);
+  const auto dataset_open_start_time = std::chrono::steady_clock::now();
+  auto dataset_file                  = open_layered_dataset_file<T>(params.dataset_path);
+  const auto dataset_open_elapsed_ms = elapsed_ms_since(dataset_open_start_time);
 
   RAFT_EXPECTS(dataset_file.shape.size() == 2 && dataset_file.shape[0] == metadata.n_rows &&
                  dataset_file.shape[1] == metadata.dim,
@@ -2102,6 +2124,11 @@ auto deserialize_layered_hnsw(raft::resources const& res,
                metadata.base_degree,
                metadata.maxM0);
 
+  RAFT_LOG_INFO("Layered HNSW load: dataset header validated in %ld ms (%s)",
+                dataset_open_elapsed_ms,
+                params.dataset_path.c_str());
+
+  const auto levels_start_time = std::chrono::steady_clock::now();
   std::vector<uint8_t> levels_u8(metadata.n_rows);
   cuvs::util::read_large_file(artifact_fd, levels_u8.data(), metadata.levels_bytes, levels_offset);
   const auto max_level_in_levels = *std::max_element(levels_u8.begin(), levels_u8.end());
@@ -2109,9 +2136,14 @@ auto deserialize_layered_hnsw(raft::resources const& res,
                "Layered HNSW levels max level (%d) does not match artifact maxlevel (%d)",
                static_cast<int>(max_level_in_levels),
                metadata.maxlevel);
+  const auto levels_elapsed_ms = elapsed_ms_since(levels_start_time);
+  RAFT_LOG_INFO("Layered HNSW load: levels read in %ld ms (%.2f MiB)",
+                levels_elapsed_ms,
+                static_cast<double>(metadata.levels_bytes) / (1024.0 * 1024.0));
 
-  auto hnsw_index = std::make_unique<index_impl<T>>(dim, metric, params.hierarchy);
-  auto appr_algo  = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
+  const auto allocation_start_time = std::chrono::steady_clock::now();
+  auto hnsw_index                  = std::make_unique<index_impl<T>>(dim, metric, params.hierarchy);
+  auto appr_algo = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
     hnsw_index->get_space(), metadata.n_rows, metadata.M, metadata.ef_construction);
   appr_algo->cur_element_count = metadata.n_rows;
   appr_algo->maxlevel_         = metadata.maxlevel;
@@ -2123,6 +2155,12 @@ auto deserialize_layered_hnsw(raft::resources const& res,
                "Layered HNSW upper link row size mismatch");
   RAFT_EXPECTS(appr_algo->maxM0_ == metadata.maxM0 && appr_algo->maxM_ == metadata.maxM,
                "Layered HNSW M parameter mismatch");
+  const auto allocation_elapsed_ms = elapsed_ms_since(allocation_start_time);
+  RAFT_LOG_INFO(
+    "Layered HNSW load: allocated hnswlib storage in %ld ms (base=%.2f GiB upper=%.2f GiB)",
+    allocation_elapsed_ms,
+    to_gib(metadata.n_rows * appr_algo->size_data_per_element_),
+    to_gib(metadata.upper_nodes_count * metadata.upper_link_row_bytes));
 
   auto num_threads =
     params.num_threads == 0 ? cuvs::core::omp::get_max_threads() : params.num_threads;
@@ -2135,18 +2173,31 @@ auto deserialize_layered_hnsw(raft::resources const& res,
     raft::make_host_matrix<T, int64_t>(static_cast<int64_t>(batch_size), metadata.dim);
   std::vector<char> base_link_buffer(batch_size * metadata.base_link_row_bytes);
 
+  const auto base_start_time = std::chrono::steady_clock::now();
+  std::chrono::steady_clock::duration dataset_read_time{};
+  std::chrono::steady_clock::duration base_link_read_time{};
+  std::chrono::steady_clock::duration base_copy_time{};
+  size_t dataset_bytes_read   = 0;
+  size_t base_link_bytes_read = 0;
   for (size_t batch_start = 0; batch_start < metadata.n_rows; batch_start += batch_size) {
     const auto current_batch_size = std::min(batch_size, metadata.n_rows - batch_start);
+    auto batch_timer              = std::chrono::steady_clock::now();
     cuvs::util::read_large_file(dataset_file.fd,
                                 dataset_buffer.data_handle(),
                                 current_batch_size * metadata.dim * sizeof(T),
                                 dataset_file.header_size + batch_start * metadata.dim * sizeof(T));
+    dataset_read_time += std::chrono::steady_clock::now() - batch_timer;
+    dataset_bytes_read += current_batch_size * metadata.dim * sizeof(T);
+    batch_timer = std::chrono::steady_clock::now();
     cuvs::util::read_large_file(artifact_fd,
                                 base_link_buffer.data(),
                                 current_batch_size * metadata.base_link_row_bytes,
                                 base_links_offset + batch_start * metadata.base_link_row_bytes);
+    base_link_read_time += std::chrono::steady_clock::now() - batch_timer;
+    base_link_bytes_read += current_batch_size * metadata.base_link_row_bytes;
 
     bool link_list_allocation_failed = false;
+    batch_timer                      = std::chrono::steady_clock::now();
 #pragma omp parallel for num_threads(num_threads) reduction(|| : link_list_allocation_failed)
     for (int64_t batch_idx = 0; batch_idx < static_cast<int64_t>(current_batch_size); ++batch_idx) {
       const auto i                  = batch_start + static_cast<size_t>(batch_idx);
@@ -2177,10 +2228,30 @@ auto deserialize_layered_hnsw(raft::resources const& res,
       }
       throw std::runtime_error("Not enough memory to allocate HNSW upper linklists");
     }
+    base_copy_time += std::chrono::steady_clock::now() - batch_timer;
   }
+  const auto base_elapsed_ms = elapsed_ms_since(base_start_time);
+  RAFT_LOG_INFO(
+    "Layered HNSW load: base layer initialized in %ld ms "
+    "(dataset read %.2f ms %.2f GiB %.2f GiB/s, links read %.2f ms %.2f GiB %.2f GiB/s, "
+    "copy %.2f ms)",
+    base_elapsed_ms,
+    elapsed_ms(dataset_read_time),
+    to_gib(dataset_bytes_read),
+    throughput_gib_per_s(dataset_bytes_read, dataset_read_time),
+    elapsed_ms(base_link_read_time),
+    to_gib(base_link_bytes_read),
+    throughput_gib_per_s(base_link_bytes_read, base_link_read_time),
+    elapsed_ms(base_copy_time));
 
   RAFT_LOG_INFO("Layered HNSW: loading upper-layer topology");
+  const auto upper_start_time = std::chrono::steady_clock::now();
+  std::chrono::steady_clock::duration upper_read_time{};
+  std::chrono::steady_clock::duration upper_validate_time{};
+  std::chrono::steady_clock::duration upper_copy_time{};
+  size_t upper_bytes_read = 0;
   for (const auto& layer : metadata.layers) {
+    const auto layer_start_time = std::chrono::steady_clock::now();
     RAFT_LOG_INFO("Layered HNSW: loading layer %zu (%zu rows, degree %zu)",
                   layer.level,
                   layer.row_count,
@@ -2190,6 +2261,7 @@ auto deserialize_layered_hnsw(raft::resources const& res,
     std::vector<char> link_buffer(layer_batch_size * metadata.upper_link_row_bytes);
     for (size_t batch_start = 0; batch_start < layer.row_count; batch_start += layer_batch_size) {
       const auto current_batch_size = std::min(layer_batch_size, layer.row_count - batch_start);
+      auto batch_timer              = std::chrono::steady_clock::now();
       cuvs::util::read_large_file(
         artifact_fd,
         node_buffer.data(),
@@ -2200,6 +2272,9 @@ auto deserialize_layered_hnsw(raft::resources const& res,
         link_buffer.data(),
         current_batch_size * metadata.upper_link_row_bytes,
         upper_links_offset + (layer.link_offset + batch_start) * metadata.upper_link_row_bytes);
+      upper_read_time += std::chrono::steady_clock::now() - batch_timer;
+      upper_bytes_read += current_batch_size * (sizeof(uint32_t) + metadata.upper_link_row_bytes);
+      batch_timer = std::chrono::steady_clock::now();
       for (int64_t batch_idx = 0; batch_idx < static_cast<int64_t>(current_batch_size);
            ++batch_idx) {
         const auto node_id = static_cast<size_t>(node_buffer[batch_idx]);
@@ -2211,6 +2286,8 @@ auto deserialize_layered_hnsw(raft::resources const& res,
                      node_id,
                      layer.level);
       }
+      upper_validate_time += std::chrono::steady_clock::now() - batch_timer;
+      batch_timer = std::chrono::steady_clock::now();
 #pragma omp parallel for num_threads(num_threads)
       for (int64_t batch_idx = 0; batch_idx < static_cast<int64_t>(current_batch_size);
            ++batch_idx) {
@@ -2220,10 +2297,26 @@ auto deserialize_layered_hnsw(raft::resources const& res,
                link_buffer.data() + batch_idx * metadata.upper_link_row_bytes,
                metadata.upper_link_row_bytes);
       }
+      upper_copy_time += std::chrono::steady_clock::now() - batch_timer;
     }
+    RAFT_LOG_INFO("Layered HNSW load: layer %zu loaded in %ld ms",
+                  layer.level,
+                  elapsed_ms_since(layer_start_time));
   }
+  const auto upper_elapsed_ms = elapsed_ms_since(upper_start_time);
+  RAFT_LOG_INFO(
+    "Layered HNSW load: upper layers loaded in %ld ms "
+    "(read %.2f ms %.2f GiB %.2f GiB/s, validate %.2f ms, copy %.2f ms)",
+    upper_elapsed_ms,
+    elapsed_ms(upper_read_time),
+    to_gib(upper_bytes_read),
+    throughput_gib_per_s(upper_bytes_read, upper_read_time),
+    elapsed_ms(upper_validate_time),
+    elapsed_ms(upper_copy_time));
 
   hnsw_index->set_index(std::move(appr_algo));
+  RAFT_LOG_INFO("Layered HNSW load: total deserialize completed in %ld ms",
+                elapsed_ms_since(total_start_time));
   return hnsw_index;
 }
 

From a95b0e0c0014982f469e62e757a349798eaedb2b Mon Sep 17 00:00:00 2001
From: Julian Miller <julianmiller@nvidia.com>
Date: Wed, 27 May 2026 11:58:21 +0200
Subject: [PATCH 3/6] Use ace prefix in benchmarking consistently

---
 cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
index b535ced763..d4791dde11 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
+++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
@@ -64,16 +64,18 @@ auto parse_build_param(const nlohmann::json& conf) ->
         cuvs::neighbors::cagra::hnsw_heuristic_type::SAME_GRAPH_FOOTPRINT,
         dist_type);
       ps.metric = dist_type;
-      // Parse ACE parameters if provided
-      if (conf.contains("npartitions") || conf.contains("build_dir") ||
-          conf.contains("ef_construction") || conf.contains("use_disk")) {
+      // Parse ACE parameters if provided.
+      auto ace_conf = collect_conf_with_prefix(conf, "ace_");
+      if (!ace_conf.empty()) {
         auto ace_params = cuvs::neighbors::cagra::graph_build_params::ace_params();
-        if (conf.contains("npartitions")) { ace_params.npartitions = conf.at("npartitions"); }
-        if (conf.contains("build_dir")) { ace_params.build_dir = conf.at("build_dir"); }
-        if (conf.contains("ef_construction")) {
-          ace_params.ef_construction = conf.at("ef_construction");
+        if (ace_conf.contains("npartitions")) {
+          ace_params.npartitions = ace_conf.at("npartitions");
         }
-        if (conf.contains("use_disk")) { ace_params.use_disk = conf.at("use_disk"); }
+        if (ace_conf.contains("build_dir")) { ace_params.build_dir = ace_conf.at("build_dir"); }
+        if (ace_conf.contains("ef_construction")) {
+          ace_params.ef_construction = ace_conf.at("ef_construction");
+        }
+        if (ace_conf.contains("use_disk")) { ace_params.use_disk = ace_conf.at("use_disk"); }
         ps.graph_build_params = ace_params;
       }
       // NB: above, we only provide the defaults. Below we parse the explicit parameters as usual.

From 47e2d858a410030913bd587ed2c11661694ad95d Mon Sep 17 00:00:00 2001
From: Julian Miller <julianmiller@nvidia.com>
Date: Wed, 27 May 2026 11:59:10 +0200
Subject: [PATCH 4/6] Validate metadata before allocating

---
 cpp/src/neighbors/detail/hnsw.hpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp
index c5b86076c9..d048d322a2 100644
--- a/cpp/src/neighbors/detail/hnsw.hpp
+++ b/cpp/src/neighbors/detail/hnsw.hpp
@@ -2065,6 +2065,14 @@ auto deserialize_layered_hnsw(raft::resources const& res,
   RAFT_EXPECTS(header.metadata_offset >= sizeof(layered_hnsw_file_header),
                "Invalid layered HNSW metadata offset");
   RAFT_EXPECTS(header.metadata_size > 0, "Invalid layered HNSW metadata size");
+  const auto artifact_size = static_cast<size_t>(std::filesystem::file_size(artifact_path));
+  RAFT_EXPECTS(header.metadata_offset <= artifact_size &&
+                 header.metadata_size <= artifact_size - header.metadata_offset,
+               "Layered HNSW metadata range is outside artifact: offset=%zu size=%zu "
+               "artifact=%zu",
+               static_cast<size_t>(header.metadata_offset),
+               static_cast<size_t>(header.metadata_size),
+               artifact_size);
 
   std::string metadata_json(header.metadata_size, '\0');
   cuvs::util::read_large_file(
@@ -2092,7 +2100,6 @@ auto deserialize_layered_hnsw(raft::resources const& res,
   const auto upper_nodes_offset = base_links_offset + metadata.base_links_bytes;
   const auto upper_links_offset = upper_nodes_offset + metadata.upper_nodes_bytes;
   const auto expected_file_size = upper_links_offset + metadata.upper_links_bytes;
-  const auto artifact_size      = static_cast<size_t>(std::filesystem::file_size(artifact_path));
   RAFT_EXPECTS(artifact_size >= expected_file_size,
                "Layered HNSW artifact is truncated: expected at least %zu bytes, got %zu",
                expected_file_size,

From 21ce339abadb11c2bcbeca3c389614877f4e72d1 Mon Sep 17 00:00:00 2001
From: Julian Miller <julianmiller@nvidia.com>
Date: Mon, 1 Jun 2026 10:12:45 +0200
Subject: [PATCH 5/6] Store layered base topology by original node ID

- Add base node IDs for sequential access.
- Scattered writes happen only in deserialization step using host
memory.
---
 cpp/src/neighbors/detail/hnsw.hpp            | 351 ++++++++++---------
 examples/cpp/src/hnsw_ace_layered_example.cu |   8 +-
 2 files changed, 185 insertions(+), 174 deletions(-)

diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp
index d048d322a2..d090de07f8 100644
--- a/cpp/src/neighbors/detail/hnsw.hpp
+++ b/cpp/src/neighbors/detail/hnsw.hpp
@@ -40,8 +40,8 @@
 #include <random>
 #include <sstream>
 #include <stdexcept>
-#include <sys/mman.h>
 #include <thread>
+#include <type_traits>
 #include <unistd.h>
 #include <utility>
 #include <vector>
@@ -519,6 +519,7 @@ struct layered_hnsw_file_metadata {
   size_t ef_construction      = 0;
   size_t base_degree          = 0;
   size_t levels_bytes         = 0;
+  size_t base_nodes_bytes     = 0;
   size_t base_link_row_bytes  = 0;
   size_t base_links_bytes     = 0;
   size_t upper_nodes_count    = 0;
@@ -580,12 +581,13 @@ auto make_layered_hnsw_metadata_json(const layered_hnsw_file_metadata& metadata,
   std::stringstream os;
   os << "{\n";
   os << "  \"format\": \"cuvs_hnsw_layered\",\n";
-  os << "  \"version\": 1,\n";
+  os << "  \"version\": " << layered_hnsw_version << ",\n";
   os << "  \"n_rows\": " << metadata.n_rows << ",\n";
   os << "  \"dim\": " << metadata.dim << ",\n";
   os << "  \"dtype\": \"" << layered_dtype_name<T>() << "\",\n";
   os << "  \"metric\": \"" << metric_name(metric) << "\",\n";
   os << "  \"row_order\": \"original_id\",\n";
+  os << "  \"base_link_order\": \"base_nodes\",\n";
   os << "  \"M\": " << metadata.M << ",\n";
   os << "  \"maxM\": " << metadata.maxM << ",\n";
   os << "  \"maxM0\": " << metadata.maxM0 << ",\n";
@@ -595,6 +597,7 @@ auto make_layered_hnsw_metadata_json(const layered_hnsw_file_metadata& metadata,
   os << "  \"enterpoint_node\": " << metadata.enterpoint_node << ",\n";
   os << "  \"base_degree\": " << metadata.base_degree << ",\n";
   os << "  \"levels_bytes\": " << metadata.levels_bytes << ",\n";
+  os << "  \"base_nodes_bytes\": " << metadata.base_nodes_bytes << ",\n";
   os << "  \"base_link_row_bytes\": " << metadata.base_link_row_bytes << ",\n";
   os << "  \"base_links_bytes\": " << metadata.base_links_bytes << ",\n";
   os << "  \"upper_nodes_count\": " << metadata.upper_nodes_count << ",\n";
@@ -665,6 +668,7 @@ inline auto parse_layered_hnsw_metadata(const std::string& json) -> layered_hnsw
   metadata.enterpoint_node      = static_cast<int>(json_parse_size(json, "enterpoint_node"));
   metadata.base_degree          = json_parse_size(json, "base_degree");
   metadata.levels_bytes         = json_parse_size(json, "levels_bytes");
+  metadata.base_nodes_bytes     = json_parse_size(json, "base_nodes_bytes");
   metadata.base_link_row_bytes  = json_parse_size(json, "base_link_row_bytes");
   metadata.base_links_bytes     = json_parse_size(json, "base_links_bytes");
   metadata.upper_nodes_count    = json_parse_size(json, "upper_nodes_count");
@@ -766,13 +770,15 @@ inline auto open_layered_dataset_file(const std::string& path) -> npy_file
 template <typename T, typename IdxT>
 void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index<T, IdxT>& index_,
                                         const cuvs::util::file_descriptor& output_fd,
-                                        size_t output_offset,
+                                        size_t base_nodes_offset,
+                                        size_t base_links_offset,
                                         size_t base_link_row_bytes,
                                         size_t maxM0)
 {
-  const auto total_start_time = std::chrono::steady_clock::now();
-  const auto& graph_fd_opt    = index_.graph_fd();
-  const auto& mapping_fd_opt  = index_.mapping_fd();
+  static_assert(std::is_same_v<IdxT, uint32_t>,
+                "Layered HNSW artifacts store topology ids as uint32_t");
+  const auto& graph_fd_opt   = index_.graph_fd();
+  const auto& mapping_fd_opt = index_.mapping_fd();
   RAFT_EXPECTS(graph_fd_opt.has_value() && graph_fd_opt->is_valid(),
                "Graph file descriptor is not available");
   RAFT_EXPECTS(mapping_fd_opt.has_value() && mapping_fd_opt->is_valid(),
@@ -812,7 +818,6 @@ void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index<T, I
   const auto mapping_start_time = std::chrono::steady_clock::now();
   const auto mapping_bytes      = n_rows * sizeof(IdxT);
   std::vector<IdxT> reordered_to_original(n_rows);
-  std::vector<IdxT> original_to_reordered(n_rows);
   cuvs::util::read_large_file(
     mapping_npy.fd, reordered_to_original.data(), mapping_bytes, mapping_npy.header_size);
   for (size_t reordered_id = 0; reordered_id < n_rows; ++reordered_id) {
@@ -821,7 +826,6 @@ void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index<T, I
                  "Invalid original id %zu in ACE dataset mapping at row %zu",
                  original_id,
                  reordered_id);
-    original_to_reordered[original_id] = static_cast<IdxT>(reordered_id);
   }
   const auto mapping_elapsed_ms = elapsed_ms_since(mapping_start_time);
   RAFT_LOG_INFO("HNSW remap: mapping loaded in %ld ms (%.2f GiB, %.2f GiB/s)",
@@ -829,112 +833,80 @@ void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index<T, I
                 to_gib(mapping_bytes),
                 throughput_gib_per_s(mapping_bytes, mapping_elapsed_ms));
 
-  const size_t graph_row_bytes    = degree * sizeof(IdxT);
-  const size_t target_batch_bytes = 64 * 1024 * 1024;
-  const size_t batch_size         = std::max<size_t>(1, target_batch_bytes / base_link_row_bytes);
-  const size_t max_gap_rows       = std::max<size_t>(1, (128 * 1024) / graph_row_bytes);
-  const size_t max_span_rows      = std::max<size_t>(1, (8 * 1024 * 1024) / graph_row_bytes);
-
-  RAFT_LOG_INFO(
-    "HNSW remap: n_rows=%zu degree=%zu maxM0=%zu batch_size=%zu max_gap=%zu rows max_span=%zu rows",
-    n_rows,
-    degree,
-    maxM0,
-    batch_size,
-    max_gap_rows,
-    max_span_rows);
-
-  auto span_buffer = raft::make_host_matrix<IdxT, int64_t>(static_cast<int64_t>(max_span_rows),
-                                                           static_cast<int64_t>(degree));
-  std::vector<char> output_buffer(batch_size * base_link_row_bytes);
-  std::vector<IdxT> source_rows(batch_size);
-  std::vector<size_t> order(batch_size);
-
-  const auto graph_start_time = std::chrono::steady_clock::now();
-  size_t spans_read           = 0;
-  size_t bytes_read           = 0;
-  size_t bytes_written        = 0;
-
-  for (size_t batch_start = 0; batch_start < n_rows; batch_start += batch_size) {
-    const auto current_batch_size = std::min(batch_size, n_rows - batch_start);
-    std::fill(
-      output_buffer.begin(), output_buffer.begin() + current_batch_size * base_link_row_bytes, 0);
-    for (size_t i = 0; i < current_batch_size; ++i) {
-      source_rows[i] = original_to_reordered[batch_start + i];
-      order[i]       = i;
-    }
-    std::sort(order.begin(), order.begin() + current_batch_size, [&](size_t lhs, size_t rhs) {
-      return source_rows[lhs] < source_rows[rhs];
-    });
-
-    size_t sorted_pos = 0;
-    while (sorted_pos < current_batch_size) {
-      const auto span_start = static_cast<size_t>(source_rows[order[sorted_pos]]);
-      auto span_end         = span_start;
-      auto span_next        = sorted_pos + 1;
-      while (span_next < current_batch_size) {
-        const auto next_row = static_cast<size_t>(source_rows[order[span_next]]);
-        if (next_row - span_start + 1 > max_span_rows) { break; }
-        if (next_row > span_end + max_gap_rows + 1) { break; }
-        span_end = next_row;
-        ++span_next;
-      }
-
-      const auto span_rows  = span_end - span_start + 1;
-      const auto span_bytes = span_rows * graph_row_bytes;
-      cuvs::util::read_large_file(graph_npy.fd,
-                                  span_buffer.data_handle(),
-                                  span_bytes,
-                                  graph_npy.header_size + span_start * graph_row_bytes);
-      ++spans_read;
-      bytes_read += span_bytes;
-
-      for (size_t i = sorted_pos; i < span_next; ++i) {
-        const auto dst_row    = order[i];
-        const auto source_row = static_cast<size_t>(source_rows[dst_row]);
-        auto* src             = span_buffer.data_handle() + (source_row - span_start) * degree;
-        auto* dst_row_ptr     = output_buffer.data() + dst_row * base_link_row_bytes;
-        hnswlib::linklistsizeint list_count = static_cast<hnswlib::linklistsizeint>(degree);
-        std::memcpy(dst_row_ptr, &list_count, sizeof(list_count));
-        auto* dst = reinterpret_cast<IdxT*>(dst_row_ptr + sizeof(hnswlib::linklistsizeint));
-        for (size_t j = 0; j < degree; ++j) {
-          const auto neighbor = static_cast<size_t>(src[j]);
-          RAFT_EXPECTS(neighbor < n_rows,
-                       "Invalid reordered neighbor id %zu in ACE graph row %zu",
-                       neighbor,
-                       source_row);
-          dst[j] = reordered_to_original[neighbor];
+  const auto total_start_time       = std::chrono::steady_clock::now();
+  const auto graph_row_bytes        = degree * sizeof(IdxT);
+  const auto base_node_row_bytes    = sizeof(IdxT);
+  const auto base_topology_row_size = graph_row_bytes + base_node_row_bytes + base_link_row_bytes;
+  const size_t target_batch_bytes   = 64 * 1024 * 1024;
+  const size_t batch_size = std::max<size_t>(1, target_batch_bytes / base_topology_row_size);
+  auto graph_buffer       = raft::make_host_matrix<IdxT, int64_t>(static_cast<int64_t>(batch_size),
+                                                            static_cast<int64_t>(degree));
+  std::vector<IdxT> base_node_buffer(batch_size);
+  std::vector<char> base_link_buffer(batch_size * base_link_row_bytes);
+
+  size_t graph_bytes_read   = 0;
+  size_t node_bytes_written = 0;
+  size_t link_bytes_written = 0;
+  for (size_t source_start = 0; source_start < n_rows; source_start += batch_size) {
+    const auto current_batch_size = std::min(batch_size, n_rows - source_start);
+    const auto batch_bytes        = current_batch_size * graph_row_bytes;
+    cuvs::util::read_large_file(graph_npy.fd,
+                                graph_buffer.data_handle(),
+                                batch_bytes,
+                                graph_npy.header_size + source_start * graph_row_bytes);
+    graph_bytes_read += batch_bytes;
+
+    bool invalid_neighbor = false;
+#pragma omp parallel for reduction(|| : invalid_neighbor)
+    for (int64_t batch_idx = 0; batch_idx < static_cast<int64_t>(current_batch_size); ++batch_idx) {
+      const auto source_row       = source_start + static_cast<size_t>(batch_idx);
+      const auto original_id      = static_cast<size_t>(reordered_to_original[source_row]);
+      base_node_buffer[batch_idx] = static_cast<IdxT>(original_id);
+      auto* dst_row_ptr           = base_link_buffer.data() + batch_idx * base_link_row_bytes;
+      hnswlib::linklistsizeint list_count = static_cast<hnswlib::linklistsizeint>(degree);
+      std::memcpy(dst_row_ptr, &list_count, sizeof(list_count));
+      auto* dst = reinterpret_cast<IdxT*>(dst_row_ptr + sizeof(hnswlib::linklistsizeint));
+      auto* src = graph_buffer.data_handle() + batch_idx * degree;
+      for (size_t j = 0; j < degree; ++j) {
+        const auto neighbor = static_cast<size_t>(src[j]);
+        if (neighbor >= n_rows) {
+          invalid_neighbor = true;
+          continue;
         }
+        dst[j] = reordered_to_original[neighbor];
+      }
+      const auto written_bytes = sizeof(hnswlib::linklistsizeint) + degree * sizeof(IdxT);
+      if (written_bytes < base_link_row_bytes) {
+        std::memset(dst_row_ptr + written_bytes, 0, base_link_row_bytes - written_bytes);
       }
-      sorted_pos = span_next;
     }
+    RAFT_EXPECTS(!invalid_neighbor, "Invalid reordered neighbor id in ACE graph");
 
-    const auto batch_bytes = current_batch_size * base_link_row_bytes;
+    const auto current_node_bytes = current_batch_size * sizeof(IdxT);
+    const auto current_link_bytes = current_batch_size * base_link_row_bytes;
+    cuvs::util::write_large_file(output_fd,
+                                 base_node_buffer.data(),
+                                 current_node_bytes,
+                                 base_nodes_offset + source_start * sizeof(IdxT));
     cuvs::util::write_large_file(output_fd,
-                                 output_buffer.data(),
-                                 batch_bytes,
-                                 output_offset + batch_start * base_link_row_bytes);
-    bytes_written += batch_bytes;
+                                 base_link_buffer.data(),
+                                 current_link_bytes,
+                                 base_links_offset + source_start * base_link_row_bytes);
+    node_bytes_written += current_node_bytes;
+    link_bytes_written += current_link_bytes;
   }
 
-  const auto graph_elapsed_ms = elapsed_ms_since(graph_start_time);
   const auto total_elapsed_ms = elapsed_ms_since(total_start_time);
-  const auto read_gib         = to_gib(bytes_read);
-  const auto written_gib      = to_gib(bytes_written);
-  const auto ideal_read_bytes = n_rows * graph_row_bytes;
-  const auto amplification =
-    ideal_read_bytes == 0 ? 0.0 : static_cast<double>(bytes_read) / ideal_read_bytes;
   RAFT_LOG_INFO(
-    "HNSW remap: base links written in %ld ms: read %.2f GiB at %.2f GiB/s across %zu spans "
-    "(%.2fx amplification), wrote %.2f GiB at %.2f GiB/s",
-    graph_elapsed_ms,
-    read_gib,
-    throughput_gib_per_s(bytes_read, graph_elapsed_ms),
-    spans_read,
-    amplification,
-    written_gib,
-    throughput_gib_per_s(bytes_written, graph_elapsed_ms));
-  RAFT_LOG_INFO("HNSW remap: completed in %ld ms total", total_elapsed_ms);
+    "HNSW remap: base topology written in %ld ms (graph_read=%.2f GiB %.2f GiB/s, "
+    "artifact_write=%.2f GiB %.2f GiB/s, nodes=%.2f GiB links=%.2f GiB)",
+    total_elapsed_ms,
+    to_gib(graph_bytes_read),
+    throughput_gib_per_s(graph_bytes_read, total_elapsed_ms),
+    to_gib(node_bytes_written + link_bytes_written),
+    throughput_gib_per_s(node_bytes_written + link_bytes_written, total_elapsed_ms),
+    to_gib(node_bytes_written),
+    to_gib(link_bytes_written));
 }
 
 template <typename T, typename IdxT>
@@ -975,7 +947,7 @@ auto serialize_to_layered_hnsw_from_disk(
 
   RAFT_LOG_INFO("Layered HNSW: generating hierarchy levels");
   const auto hierarchy_start_time = std::chrono::steady_clock::now();
-  auto hierarchy = make_random_hnsw_level_plan(n_rows, *appr_algo, "Layered HNSW: Level ");
+  auto hierarchy                  = make_random_hnsw_level_plan(n_rows, *appr_algo, nullptr);
   const auto hierarchy_elapsed_ms = elapsed_ms_since(hierarchy_start_time);
   RAFT_LOG_INFO("Layered HNSW: hierarchy levels generated in %ld ms (max_level=%d, promoted=%zu)",
                 hierarchy_elapsed_ms,
@@ -994,6 +966,7 @@ auto serialize_to_layered_hnsw_from_disk(
   metadata.enterpoint_node      = static_cast<int>(hierarchy.order.back());
   metadata.base_degree          = static_cast<size_t>(graph_degree_int);
   metadata.levels_bytes         = n_rows * sizeof(uint8_t);
+  metadata.base_nodes_bytes     = n_rows * sizeof(IdxT);
   metadata.base_link_row_bytes  = appr_algo->size_links_level0_;
   metadata.base_links_bytes     = n_rows * metadata.base_link_row_bytes;
   metadata.upper_link_row_bytes = appr_algo->size_links_per_element_;
@@ -1022,7 +995,8 @@ auto serialize_to_layered_hnsw_from_disk(
   const auto payload_offset =
     align_up(header.metadata_offset + header.metadata_size, layered_hnsw_alignment);
   const auto levels_offset      = payload_offset;
-  const auto base_links_offset  = levels_offset + metadata.levels_bytes;
+  const auto base_nodes_offset  = levels_offset + metadata.levels_bytes;
+  const auto base_links_offset  = base_nodes_offset + metadata.base_nodes_bytes;
   const auto upper_nodes_offset = base_links_offset + metadata.base_links_bytes;
   const auto upper_links_offset = upper_nodes_offset + metadata.upper_nodes_bytes;
   const auto final_file_size    = upper_links_offset + metadata.upper_links_bytes;
@@ -1048,8 +1022,12 @@ auto serialize_to_layered_hnsw_from_disk(
 
   RAFT_LOG_INFO("Layered HNSW: writing hnswlib-ready base links");
   const auto layer0_start_time = std::chrono::steady_clock::now();
-  write_layered_base_links_from_disk<T, IdxT>(
-    index_, artifact_fd, base_links_offset, metadata.base_link_row_bytes, metadata.maxM0);
+  write_layered_base_links_from_disk<T, IdxT>(index_,
+                                              artifact_fd,
+                                              base_nodes_offset,
+                                              base_links_offset,
+                                              metadata.base_link_row_bytes,
+                                              metadata.maxM0);
   const auto layer0_elapsed_ms = elapsed_ms_since(layer0_start_time);
   RAFT_LOG_INFO("Layered HNSW: base links written in %ld ms (%.2f GiB, %.2f GiB/s effective)",
                 layer0_elapsed_ms,
@@ -2081,6 +2059,7 @@ auto deserialize_layered_hnsw(raft::resources const& res,
   const auto metadata_elapsed_ms = elapsed_ms_since(metadata_start_time);
 
   RAFT_EXPECTS(metadata.n_rows > 0, "Layered HNSW artifact must contain at least one row");
+  RAFT_EXPECTS(metadata.dim > 0, "Layered HNSW artifact must contain at least one dimension");
   RAFT_EXPECTS(static_cast<size_t>(dim) == metadata.dim,
                "Layered HNSW artifact dim (%zu) does not match requested dim (%d)",
                metadata.dim,
@@ -2096,7 +2075,8 @@ auto deserialize_layered_hnsw(raft::resources const& res,
   const auto payload_offset =
     align_up(header.metadata_offset + header.metadata_size, layered_hnsw_alignment);
   const auto levels_offset      = payload_offset;
-  const auto base_links_offset  = levels_offset + metadata.levels_bytes;
+  const auto base_nodes_offset  = levels_offset + metadata.levels_bytes;
+  const auto base_links_offset  = base_nodes_offset + metadata.base_nodes_bytes;
   const auto upper_nodes_offset = base_links_offset + metadata.base_links_bytes;
   const auto upper_links_offset = upper_nodes_offset + metadata.upper_nodes_bytes;
   const auto expected_file_size = upper_links_offset + metadata.upper_links_bytes;
@@ -2119,6 +2099,8 @@ auto deserialize_layered_hnsw(raft::resources const& res,
                "Layered HNSW dataset shape mismatch");
   RAFT_EXPECTS(metadata.levels_bytes == metadata.n_rows * sizeof(uint8_t),
                "Layered HNSW levels section size mismatch");
+  RAFT_EXPECTS(metadata.base_nodes_bytes == metadata.n_rows * sizeof(uint32_t),
+               "Layered HNSW base node section size mismatch");
   RAFT_EXPECTS(metadata.base_links_bytes == metadata.n_rows * metadata.base_link_row_bytes,
                "Layered HNSW base links section size mismatch");
   RAFT_EXPECTS(metadata.upper_nodes_bytes == metadata.upper_nodes_count * sizeof(uint32_t),
@@ -2172,36 +2154,26 @@ auto deserialize_layered_hnsw(raft::resources const& res,
   auto num_threads =
     params.num_threads == 0 ? cuvs::core::omp::get_max_threads() : params.num_threads;
 
-  RAFT_LOG_INFO("Layered HNSW: initializing in-memory hnswlib index from local dataset");
   const size_t target_batch_bytes = 64 * 1024 * 1024;
-  const size_t row_bytes          = metadata.dim * sizeof(T) + metadata.base_link_row_bytes;
-  const size_t batch_size         = std::max<size_t>(1, target_batch_bytes / row_bytes);
+  const size_t dataset_row_bytes  = metadata.dim * sizeof(T);
+  const size_t batch_size         = std::max<size_t>(1, target_batch_bytes / dataset_row_bytes);
   auto dataset_buffer =
     raft::make_host_matrix<T, int64_t>(static_cast<int64_t>(batch_size), metadata.dim);
-  std::vector<char> base_link_buffer(batch_size * metadata.base_link_row_bytes);
 
   const auto base_start_time = std::chrono::steady_clock::now();
   std::chrono::steady_clock::duration dataset_read_time{};
-  std::chrono::steady_clock::duration base_link_read_time{};
   std::chrono::steady_clock::duration base_copy_time{};
-  size_t dataset_bytes_read   = 0;
-  size_t base_link_bytes_read = 0;
+  size_t dataset_bytes_read = 0;
   for (size_t batch_start = 0; batch_start < metadata.n_rows; batch_start += batch_size) {
-    const auto current_batch_size = std::min(batch_size, metadata.n_rows - batch_start);
-    auto batch_timer              = std::chrono::steady_clock::now();
+    const auto current_batch_size    = std::min(batch_size, metadata.n_rows - batch_start);
+    const auto current_dataset_bytes = current_batch_size * metadata.dim * sizeof(T);
+    auto batch_timer                 = std::chrono::steady_clock::now();
     cuvs::util::read_large_file(dataset_file.fd,
                                 dataset_buffer.data_handle(),
-                                current_batch_size * metadata.dim * sizeof(T),
+                                current_dataset_bytes,
                                 dataset_file.header_size + batch_start * metadata.dim * sizeof(T));
     dataset_read_time += std::chrono::steady_clock::now() - batch_timer;
-    dataset_bytes_read += current_batch_size * metadata.dim * sizeof(T);
-    batch_timer = std::chrono::steady_clock::now();
-    cuvs::util::read_large_file(artifact_fd,
-                                base_link_buffer.data(),
-                                current_batch_size * metadata.base_link_row_bytes,
-                                base_links_offset + batch_start * metadata.base_link_row_bytes);
-    base_link_read_time += std::chrono::steady_clock::now() - batch_timer;
-    base_link_bytes_read += current_batch_size * metadata.base_link_row_bytes;
+    dataset_bytes_read += current_dataset_bytes;
 
     bool link_list_allocation_failed = false;
     batch_timer                      = std::chrono::steady_clock::now();
@@ -2210,10 +2182,6 @@ auto deserialize_layered_hnsw(raft::resources const& res,
       const auto i                  = batch_start + static_cast<size_t>(batch_idx);
       auto level                    = static_cast<int32_t>(levels_u8[i]);
       appr_algo->element_levels_[i] = level;
-      auto ll0                      = appr_algo->get_linklist0(i);
-      memcpy(ll0,
-             base_link_buffer.data() + batch_idx * metadata.base_link_row_bytes,
-             metadata.base_link_row_bytes);
       memcpy(appr_algo->getDataByInternalId(i),
              dataset_buffer.data_handle() + batch_idx * metadata.dim,
              appr_algo->data_size_);
@@ -2225,7 +2193,6 @@ auto deserialize_layered_hnsw(raft::resources const& res,
           link_list_allocation_failed = true;
           continue;
         }
-        memset(appr_algo->linkLists_[i], 0, link_list_size);
       }
     }
     if (link_list_allocation_failed) {
@@ -2239,36 +2206,83 @@ auto deserialize_layered_hnsw(raft::resources const& res,
   }
   const auto base_elapsed_ms = elapsed_ms_since(base_start_time);
   RAFT_LOG_INFO(
-    "Layered HNSW load: base layer initialized in %ld ms "
-    "(dataset read %.2f ms %.2f GiB %.2f GiB/s, links read %.2f ms %.2f GiB %.2f GiB/s, "
-    "copy %.2f ms)",
+    "Layered HNSW load: hnswlib data and levels initialized in %ld ms "
+    "(dataset read %.2f ms %.2f GiB %.2f GiB/s, copy %.2f ms)",
     base_elapsed_ms,
     elapsed_ms(dataset_read_time),
     to_gib(dataset_bytes_read),
     throughput_gib_per_s(dataset_bytes_read, dataset_read_time),
-    elapsed_ms(base_link_read_time),
-    to_gib(base_link_bytes_read),
-    throughput_gib_per_s(base_link_bytes_read, base_link_read_time),
     elapsed_ms(base_copy_time));
 
-  RAFT_LOG_INFO("Layered HNSW: loading upper-layer topology");
+  const auto base_topology_start_time = std::chrono::steady_clock::now();
+  std::chrono::steady_clock::duration base_topology_read_time{};
+  std::chrono::steady_clock::duration base_topology_copy_time{};
+  size_t base_topology_bytes_read      = 0;
+  const size_t base_topology_row_bytes = sizeof(uint32_t) + metadata.base_link_row_bytes;
+  const size_t base_topology_batch_size =
+    std::max<size_t>(1, target_batch_bytes / base_topology_row_bytes);
+  std::vector<uint32_t> base_node_buffer(base_topology_batch_size);
+  std::vector<char> base_link_buffer(base_topology_batch_size * metadata.base_link_row_bytes);
+  for (size_t batch_start = 0; batch_start < metadata.n_rows;
+       batch_start += base_topology_batch_size) {
+    const auto current_batch_size =
+      std::min(base_topology_batch_size, metadata.n_rows - batch_start);
+    const auto current_base_topology_bytes =
+      current_batch_size * (sizeof(uint32_t) + metadata.base_link_row_bytes);
+    auto batch_timer = std::chrono::steady_clock::now();
+    cuvs::util::read_large_file(artifact_fd,
+                                base_node_buffer.data(),
+                                current_batch_size * sizeof(uint32_t),
+                                base_nodes_offset + batch_start * sizeof(uint32_t));
+    cuvs::util::read_large_file(artifact_fd,
+                                base_link_buffer.data(),
+                                current_batch_size * metadata.base_link_row_bytes,
+                                base_links_offset + batch_start * metadata.base_link_row_bytes);
+    base_topology_read_time += std::chrono::steady_clock::now() - batch_timer;
+    base_topology_bytes_read += current_base_topology_bytes;
+
+    batch_timer          = std::chrono::steady_clock::now();
+    bool invalid_node_id = false;
+#pragma omp parallel for num_threads(num_threads) reduction(|| : invalid_node_id)
+    for (int64_t batch_idx = 0; batch_idx < static_cast<int64_t>(current_batch_size); ++batch_idx) {
+      const auto node_id = static_cast<size_t>(base_node_buffer[batch_idx]);
+      if (node_id >= metadata.n_rows) {
+        invalid_node_id = true;
+        continue;
+      }
+      auto ll0 = appr_algo->get_linklist0(node_id);
+      memcpy(ll0,
+             base_link_buffer.data() + batch_idx * metadata.base_link_row_bytes,
+             metadata.base_link_row_bytes);
+    }
+    RAFT_EXPECTS(!invalid_node_id, "Invalid base-layer node id in layered HNSW artifact");
+    base_topology_copy_time += std::chrono::steady_clock::now() - batch_timer;
+  }
+  const auto base_topology_elapsed_ms = elapsed_ms_since(base_topology_start_time);
+  RAFT_LOG_INFO(
+    "Layered HNSW load: base-layer topology loaded in %ld ms "
+    "(read %.2f ms %.2f GiB %.2f GiB/s, scatter-copy %.2f ms)",
+    base_topology_elapsed_ms,
+    elapsed_ms(base_topology_read_time),
+    to_gib(base_topology_bytes_read),
+    throughput_gib_per_s(base_topology_bytes_read, base_topology_read_time),
+    elapsed_ms(base_topology_copy_time));
+
   const auto upper_start_time = std::chrono::steady_clock::now();
   std::chrono::steady_clock::duration upper_read_time{};
-  std::chrono::steady_clock::duration upper_validate_time{};
   std::chrono::steady_clock::duration upper_copy_time{};
   size_t upper_bytes_read = 0;
   for (const auto& layer : metadata.layers) {
-    const auto layer_start_time = std::chrono::steady_clock::now();
-    RAFT_LOG_INFO("Layered HNSW: loading layer %zu (%zu rows, degree %zu)",
-                  layer.level,
-                  layer.row_count,
-                  layer.degree);
-    const auto layer_batch_size = std::min(batch_size, layer.row_count);
+    const auto layer_row_bytes = sizeof(uint32_t) + metadata.upper_link_row_bytes;
+    const auto layer_batch_size =
+      std::min(std::max<size_t>(1, target_batch_bytes / layer_row_bytes), layer.row_count);
     std::vector<uint32_t> node_buffer(layer_batch_size);
     std::vector<char> link_buffer(layer_batch_size * metadata.upper_link_row_bytes);
     for (size_t batch_start = 0; batch_start < layer.row_count; batch_start += layer_batch_size) {
       const auto current_batch_size = std::min(layer_batch_size, layer.row_count - batch_start);
-      auto batch_timer              = std::chrono::steady_clock::now();
+      const auto current_upper_bytes =
+        current_batch_size * (sizeof(uint32_t) + metadata.upper_link_row_bytes);
+      auto batch_timer = std::chrono::steady_clock::now();
       cuvs::util::read_large_file(
         artifact_fd,
         node_buffer.data(),
@@ -2280,45 +2294,42 @@ auto deserialize_layered_hnsw(raft::resources const& res,
         current_batch_size * metadata.upper_link_row_bytes,
         upper_links_offset + (layer.link_offset + batch_start) * metadata.upper_link_row_bytes);
       upper_read_time += std::chrono::steady_clock::now() - batch_timer;
-      upper_bytes_read += current_batch_size * (sizeof(uint32_t) + metadata.upper_link_row_bytes);
-      batch_timer = std::chrono::steady_clock::now();
+      upper_bytes_read += current_upper_bytes;
+      batch_timer             = std::chrono::steady_clock::now();
+      bool invalid_node_id    = false;
+      bool invalid_node_level = false;
+#pragma omp parallel for num_threads(num_threads) \
+  reduction(|| : invalid_node_id, invalid_node_level)
       for (int64_t batch_idx = 0; batch_idx < static_cast<int64_t>(current_batch_size);
            ++batch_idx) {
         const auto node_id = static_cast<size_t>(node_buffer[batch_idx]);
-        RAFT_EXPECTS(node_id < metadata.n_rows,
-                     "Invalid upper-layer node id %zu in layered HNSW artifact",
-                     node_id);
-        RAFT_EXPECTS(layer.level <= static_cast<size_t>(levels_u8[node_id]),
-                     "Layered HNSW artifact references node %zu at invalid level %zu",
-                     node_id,
-                     layer.level);
-      }
-      upper_validate_time += std::chrono::steady_clock::now() - batch_timer;
-      batch_timer = std::chrono::steady_clock::now();
-#pragma omp parallel for num_threads(num_threads)
-      for (int64_t batch_idx = 0; batch_idx < static_cast<int64_t>(current_batch_size);
-           ++batch_idx) {
-        const auto node_id = static_cast<size_t>(node_buffer[batch_idx]);
-        auto ll            = appr_algo->get_linklist(node_id, layer.level);
+        if (node_id >= metadata.n_rows) {
+          invalid_node_id = true;
+          continue;
+        }
+        if (layer.level > static_cast<size_t>(levels_u8[node_id])) {
+          invalid_node_level = true;
+          continue;
+        }
+        auto ll = appr_algo->get_linklist(node_id, layer.level);
         memcpy(ll,
                link_buffer.data() + batch_idx * metadata.upper_link_row_bytes,
                metadata.upper_link_row_bytes);
       }
+      RAFT_EXPECTS(!invalid_node_id, "Invalid upper-layer node id in layered HNSW artifact");
+      RAFT_EXPECTS(!invalid_node_level,
+                   "Layered HNSW artifact references a node at an invalid upper level");
       upper_copy_time += std::chrono::steady_clock::now() - batch_timer;
     }
-    RAFT_LOG_INFO("Layered HNSW load: layer %zu loaded in %ld ms",
-                  layer.level,
-                  elapsed_ms_since(layer_start_time));
   }
   const auto upper_elapsed_ms = elapsed_ms_since(upper_start_time);
   RAFT_LOG_INFO(
     "Layered HNSW load: upper layers loaded in %ld ms "
-    "(read %.2f ms %.2f GiB %.2f GiB/s, validate %.2f ms, copy %.2f ms)",
+    "(read %.2f ms %.2f GiB %.2f GiB/s, validate+copy %.2f ms)",
     upper_elapsed_ms,
     elapsed_ms(upper_read_time),
     to_gib(upper_bytes_read),
     throughput_gib_per_s(upper_bytes_read, upper_read_time),
-    elapsed_ms(upper_validate_time),
     elapsed_ms(upper_copy_time));
 
   hnsw_index->set_index(std::move(appr_algo));
diff --git a/examples/cpp/src/hnsw_ace_layered_example.cu b/examples/cpp/src/hnsw_ace_layered_example.cu
index 02f735ef37..f11551df20 100644
--- a/examples/cpp/src/hnsw_ace_layered_example.cu
+++ b/examples/cpp/src/hnsw_ace_layered_example.cu
@@ -23,16 +23,16 @@
 //   index_dir/hnsw_index.cuvs
 //     fixed header + metadata JSON
 //     levels: uint8 [N], max HNSW level for each original row id
-//     base links: hnswlib-ready [count + uint32 neighbors padded to maxM0]
+//     base nodes + base links: uint32 node ids with hnswlib-ready link rows
 //     upper nodes + upper links: hnswlib-ready upper-layer topology
 //
 // The transferred index artifact is topology-only. The dataset is loaded locally during
 // deserialization from hnsw::index_params::dataset_path. The loader supports .npy and ANN benchmark
 // *.bin datasets; this example writes a local dataset .npy only to make the demo self-contained.
 //
-// Layer 0 rows and neighbor IDs are original dataset row IDs. Upper layers are generated with the
-// same level/order/KNN logic as serialize_to_hnswlib_from_disk, then stored as hnswlib-ready link
-// rows so deserialization does no graph remapping or link padding on the search node.
+// Layer 0 node IDs and neighbor IDs are original dataset row IDs. Upper layers are generated with
+// the same level/order/KNN logic as serialize_to_hnswlib_from_disk, then stored as hnswlib-ready
+// link rows so deserialization does no graph remapping or link padding on the search node.
 
 #include <algorithm>
 #include <cstdint>

From 4514bc85b1c1743ee05af9eb4b2465967925c390 Mon Sep 17 00:00:00 2001
From: Julian Miller <julianmiller@nvidia.com>
Date: Mon, 1 Jun 2026 10:15:11 +0200
Subject: [PATCH 6/6] Unify the ACE logging format

---
 .../neighbors/detail/cagra/cagra_build.cuh    | 234 +++++++++---------
 cpp/src/neighbors/detail/hnsw.hpp             | 160 ++++++++----
 2 files changed, 238 insertions(+), 156 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 96ff8344d3..3edbec46af 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -51,6 +51,13 @@ namespace cuvs::neighbors::cagra::detail {
 constexpr double to_mib(size_t bytes) { return static_cast<double>(bytes) / (1 << 20); }
 constexpr double to_gib(size_t bytes) { return static_cast<double>(bytes) / (1 << 30); }
 
+inline auto progress_step_10(size_t completed, size_t total) -> size_t
+{
+  if (total == 0) { return 100; }
+  const auto percent = (std::min(completed, total) * 100) / total;
+  return completed >= total ? 100 : (percent / 10) * 10;
+}
+
 template <typename T, typename IdxT>
 void check_graph_degree(size_t& intermediate_degree, size_t& graph_degree, size_t dataset_size)
 {
@@ -99,7 +106,7 @@ void ace_get_partition_labels(
   const size_t min_samples = 100 * n_partitions;
   n_samples                = std::max(n_samples, min_samples);
   n_samples                = std::min(n_samples, dataset_size);
-  RAFT_LOG_DEBUG("ACE: n_samples: %lu", n_samples);
+  RAFT_LOG_DEBUG("ACE build: partition labeling uses %lu sampled vectors", n_samples);
 
   auto sample_db = raft::make_host_matrix<float, int64_t>(n_samples, dataset_dim);
 #pragma omp parallel for
@@ -123,18 +130,10 @@ void ace_get_partition_labels(
   auto _sub_distances     = raft::make_host_matrix<float, int64_t>(chunk_size, n_partitions);
   auto _sub_dataset_dev   = raft::make_device_matrix<float, int64_t>(res, chunk_size, dataset_dim);
   auto _sub_distances_dev = raft::make_device_matrix<float, int64_t>(res, chunk_size, n_partitions);
-  size_t report_interval  = dataset_size / 10;
-  report_interval         = (report_interval / chunk_size) * chunk_size;
-  report_interval         = std::max(report_interval, chunk_size);
+  size_t next_progress_percent = 10;
 
   for (size_t i_base = 0; i_base < dataset_size; i_base += chunk_size) {
     const size_t sub_dataset_size = std::min(chunk_size, dataset_size - i_base);
-    if (i_base % report_interval == 0) {
-      RAFT_LOG_INFO("ACE: Processing chunk %lu / %lu (%.1f%%)",
-                    i_base,
-                    dataset_size,
-                    static_cast<double>(100 * i_base) / dataset_size);
-    }
 
     auto sub_dataset = raft::make_host_matrix_view<float, int64_t>(
       _sub_dataset.data_handle(), sub_dataset_size, dataset_dim);
@@ -191,6 +190,16 @@ void ace_get_partition_labels(
 #pragma omp atomic update
       partition_histogram(augmented_label, 1) += 1;
     }
+
+    const auto completed_rows = i_base + sub_dataset_size;
+    const auto report_percent = progress_step_10(completed_rows, dataset_size);
+    if (report_percent >= next_progress_percent) {
+      RAFT_LOG_INFO("ACE build: partition labeling progress %zu%% (%zu/%zu rows)",
+                    report_percent,
+                    completed_rows,
+                    dataset_size);
+      next_progress_percent = report_percent + 10;
+    }
   }
 }
 
@@ -261,15 +270,15 @@ void ace_check_partition_sizes(
 
     if (total_count > 0 && total_count < very_small_threshold) {
       RAFT_LOG_WARN(
-        "ACE: Partition %lu is very small (%lu vectors, expected ~%.1f). This may affect graph "
-        "quality.",
+        "ACE build: partition %lu is very small (%lu vectors, expected ~%.1f); graph quality may "
+        "be affected",
         c,
         total_count,
         expected_avg_vectors);
     } else if (total_count > very_large_threshold) {
       RAFT_LOG_WARN(
-        "ACE: Partition %lu is very large (%lu vectors, expected ~%.1f, threshold: %lu). This may "
-        "indicate imbalance and can lead to memory issues in restricted environments.",
+        "ACE build: partition %lu is very large (%lu vectors, expected ~%.1f, threshold=%lu); "
+        "partition imbalance may increase memory pressure",
         c,
         total_count,
         expected_avg_vectors,
@@ -482,11 +491,10 @@ void ace_reorder_and_store_dataset(
   size_t dataset_dim  = dataset.extent(1);
   size_t n_partitions = partition_histogram.extent(0);
 
-  RAFT_LOG_DEBUG(
-    "ACE: Reordering and storing dataset to disk (%lu vectors, %lu dimensions, %lu partitions)",
-    dataset_size,
-    dataset_dim,
-    n_partitions);
+  RAFT_LOG_DEBUG("ACE build: reordering dataset to disk (rows=%lu dim=%lu partitions=%lu)",
+                 dataset_size,
+                 dataset_dim,
+                 n_partitions);
 
   // Calculate total sizes for pre-allocation
   size_t total_core_vectors      = 0;
@@ -509,10 +517,10 @@ void ace_reorder_and_store_dataset(
   size_t reordered_file_size = total_core_vectors * vector_size;
   size_t augmented_file_size = total_augmented_vectors * vector_size;
 
-  RAFT_LOG_DEBUG("ACE: Reordered dataset: %lu core vectors (%.2f GiB)",
+  RAFT_LOG_DEBUG("ACE build: reordered dataset section rows=%lu size=%.2f GiB",
                  total_core_vectors,
                  reordered_file_size / (1024.0 * 1024.0 * 1024.0));
-  RAFT_LOG_DEBUG("ACE: Augmented dataset: %lu secondary vectors (%.2f GiB)",
+  RAFT_LOG_DEBUG("ACE build: augmented dataset section rows=%lu size=%.2f GiB",
                  total_augmented_vectors,
                  augmented_file_size / (1024.0 * 1024.0 * 1024.0));
 
@@ -541,7 +549,7 @@ void ace_reorder_and_store_dataset(
   disk_write_size           = std::min<size_t>(disk_write_size, 64 * 1024 * 1024);
   size_t vectors_per_buffer = std::max<size_t>(64, disk_write_size / vector_size);
 
-  RAFT_LOG_DEBUG("ACE: Reorder buffers: %lu vectors per buffer (%.2f MiB)",
+  RAFT_LOG_DEBUG("ACE build: reorder buffer rows=%lu size=%.2f MiB",
                  vectors_per_buffer,
                  to_mib(vectors_per_buffer * vector_size));
 
@@ -593,8 +601,8 @@ void ace_reorder_and_store_dataset(
     }
   };
 
-  size_t vectors_processed  = 0;
-  const size_t log_interval = std::max(dataset_size / 10, size_t(1));
+  size_t vectors_processed     = 0;
+  size_t next_progress_percent = 10;
   for (size_t i = 0; i < dataset_size; i++) {
     size_t core_partition      = partition_labels(i, 0);
     size_t secondary_partition = partition_labels(i, 1);
@@ -623,16 +631,18 @@ void ace_reorder_and_store_dataset(
     }
 
     vectors_processed++;
-    if (vectors_processed % log_interval == 0) {
-      RAFT_LOG_INFO("ACE: Processed %lu/%lu vectors (%.1f%%)",
+    const auto report_percent = progress_step_10(vectors_processed, dataset_size);
+    if (report_percent >= next_progress_percent) {
+      RAFT_LOG_INFO("ACE build: dataset reorder progress %zu%% (%zu/%zu rows)",
+                    report_percent,
                     vectors_processed,
-                    dataset_size,
-                    100.0 * vectors_processed / dataset_size);
+                    dataset_size);
+      next_progress_percent = report_percent + 10;
     }
   }
 
   // Flush all remaining buffers
-  RAFT_LOG_DEBUG("ACE: Flushing remaining buffers...");
+  RAFT_LOG_DEBUG("ACE build: flushing remaining reorder buffers");
 #pragma omp parallel sections
   {
 #pragma omp section
@@ -662,12 +672,12 @@ void ace_reorder_and_store_dataset(
     elapsed_ms > 0 ? to_mib(total_bytes_written) / (elapsed_ms / 1000.0) : 0.0;
 
   RAFT_LOG_INFO(
-    "ACE: Dataset (%.2f GiB reordered, %.2f GiB augmented, %.2f GiB mapping) reordering completed "
-    "in %ld ms (%.1f MiB/s)",
+    "ACE build: dataset reorder completed in %ld ms (reordered=%.2f GiB augmented=%.2f GiB "
+    "mapping=%.2f GiB, %.1f MiB/s)",
+    elapsed_ms,
     reordered_file_size / (1024.0 * 1024.0 * 1024.0),
     augmented_file_size / (1024.0 * 1024.0 * 1024.0),
     mapping_file_size / (1024.0 * 1024.0 * 1024.0),
-    elapsed_ms,
     throughput_mb_s);
 }
 
@@ -685,17 +695,17 @@ void ace_load_partition_dataset_from_disk(
 {
   size_t n_partitions = partition_histogram.extent(0);
 
-  RAFT_LOG_DEBUG("ACE: Loading partition %lu dataset from disk", partition_id);
+  RAFT_LOG_DEBUG("ACE build: loading partition %lu dataset from disk", partition_id);
 
   size_t core_size            = partition_histogram(partition_id, 0);
   size_t augmented_size       = partition_histogram(partition_id, 1);
   size_t total_partition_size = core_size + augmented_size;
 
-  RAFT_LOG_DEBUG("ACE: Partition %lu: %lu core + %lu augmented = %lu total vectors",
+  RAFT_LOG_DEBUG("ACE build: partition %lu rows=%lu (core=%lu augmented=%lu)",
                  partition_id,
+                 total_partition_size,
                  core_size,
-                 augmented_size,
-                 total_partition_size);
+                 augmented_size);
 
   RAFT_EXPECTS(static_cast<size_t>(sub_dataset.extent(0)) == total_partition_size,
                "sub_dataset rows (%lu) must match total partition size (%lu)",
@@ -712,10 +722,10 @@ void ace_load_partition_dataset_from_disk(
   const std::string augmented_dataset_path = build_dir + "/augmented_dataset.npy";
 
   if (!std::filesystem::exists(reordered_dataset_path)) {
-    RAFT_FAIL("ACE: Required file does not exist: %s", reordered_dataset_path.c_str());
+    RAFT_FAIL("ACE build: required file does not exist: %s", reordered_dataset_path.c_str());
   }
   if (!std::filesystem::exists(augmented_dataset_path)) {
-    RAFT_FAIL("ACE: Required file does not exist: %s", augmented_dataset_path.c_str());
+    RAFT_FAIL("ACE build: required file does not exist: %s", augmented_dataset_path.c_str());
   }
 
   size_t core_header_size      = 0;
@@ -748,7 +758,7 @@ void ace_load_partition_dataset_from_disk(
   core_file_offset += core_header_size;
   augmented_file_offset += augmented_header_size;
 
-  RAFT_LOG_DEBUG("ACE: Core file offset: %lu bytes, Augmented file offset: %lu bytes",
+  RAFT_LOG_DEBUG("ACE build: core file offset=%lu bytes augmented file offset=%lu bytes",
                  core_file_offset,
                  augmented_file_offset);
 
@@ -763,7 +773,7 @@ void ace_load_partition_dataset_from_disk(
       try {
         if (core_size > 0) {
           RAFT_LOG_DEBUG(
-            "ACE: Reading %lu core vectors from offset %lu", core_size, core_file_offset);
+            "ACE build: reading %lu core vectors from offset %lu", core_size, core_file_offset);
           cuvs::util::file_descriptor reordered_fd(reordered_dataset_path, O_RDONLY);
           const size_t core_bytes = core_size * vector_size;
           cuvs::util::read_large_file(
@@ -777,7 +787,7 @@ void ace_load_partition_dataset_from_disk(
     {
       try {
         if (augmented_size > 0) {
-          RAFT_LOG_DEBUG("ACE: Reading %lu augmented vectors from offset %lu",
+          RAFT_LOG_DEBUG("ACE build: reading %lu augmented vectors from offset %lu",
                          augmented_size,
                          augmented_file_offset);
           cuvs::util::file_descriptor augmented_fd(augmented_dataset_path, O_RDONLY);
@@ -879,7 +889,8 @@ bool ace_check_use_disk_mode(bool use_disk,
   // Use overridden memory limits if provided (> 0), otherwise query actual system memory
   if (max_host_memory_gb.has_value() && max_host_memory_gb.value() > 0) {
     mem.available_host_memory = static_cast<size_t>(max_host_memory_gb.value() * (1ULL << 30));
-    RAFT_LOG_INFO("ACE: Using overridden host memory limit: %.2f GiB", max_host_memory_gb.value());
+    RAFT_LOG_INFO("ACE build: using overridden host memory limit %.2f GiB",
+                  max_host_memory_gb.value());
   } else {
     mem.available_host_memory = cuvs::util::get_free_host_memory();
   }
@@ -903,7 +914,7 @@ bool ace_check_use_disk_mode(bool use_disk,
   mem.total_size       = mem.partition_labels_size + mem.id_mapping_size + mem.sub_dataset_size +
                    mem.sub_graph_size + mem.cagra_graph_size;
 
-  RAFT_LOG_INFO("ACE: Estimated host memory required: %.2f GiB, available: %.2f GiB",
+  RAFT_LOG_INFO("ACE build: host memory estimate required=%.2f GiB available=%.2f GiB",
                 to_gib(mem.total_size),
                 to_gib(mem.available_host_memory));
 
@@ -915,7 +926,8 @@ bool ace_check_use_disk_mode(bool use_disk,
   // TODO: Extend model or use managed memory if running out of GPU memory.
   if (max_gpu_memory_gb.has_value() && max_gpu_memory_gb.value() > 0) {
     mem.available_gpu_memory = static_cast<size_t>(max_gpu_memory_gb.value() * (1ULL << 30));
-    RAFT_LOG_INFO("ACE: Using overridden GPU memory limit: %.2f GiB", max_gpu_memory_gb.value());
+    RAFT_LOG_INFO("ACE build: using overridden GPU memory limit %.2f GiB",
+                  max_gpu_memory_gb.value());
   } else {
     mem.available_gpu_memory = rmm::available_device_memory().second;
   }
@@ -923,7 +935,7 @@ bool ace_check_use_disk_mode(bool use_disk,
     static_cast<size_t>(usable_gpu_memory_fraction * mem.available_gpu_memory) <
     std::max(mem.sub_graph_size, mem.sub_dataset_size);
 
-  RAFT_LOG_INFO("ACE: Estimated GPU memory required: %.2f GiB, available: %.2f GiB",
+  RAFT_LOG_INFO("ACE build: GPU memory estimate required=%.2f GiB available=%.2f GiB",
                 to_gib(mem.cagra_graph_size),
                 to_gib(mem.available_gpu_memory));
 
@@ -934,7 +946,7 @@ bool ace_check_use_disk_mode(bool use_disk,
     valid_build_dir &= build_dir.find('\0') == std::string::npos;
     valid_build_dir &= build_dir.find("//") == std::string::npos;
     if (!valid_build_dir) {
-      RAFT_LOG_WARN("ACE: Invalid build_dir path, resetting to default: /tmp/ace_build");
+      RAFT_LOG_WARN("ACE build: invalid build_dir path, resetting to /tmp/ace_build");
       build_dir = "/tmp/ace_build";
     }
     if (mkdir(build_dir.c_str(), 0755) != 0 && errno != EEXIST) {
@@ -943,24 +955,19 @@ bool ace_check_use_disk_mode(bool use_disk,
   }
 
   if (host_memory_limited && gpu_memory_limited) {
-    RAFT_LOG_INFO(
-      "ACE: Graph does not fit in host and GPU memory. Using disk-mode with temporary storage %s",
-      build_dir.c_str());
+    RAFT_LOG_INFO("ACE build: graph does not fit in host or GPU memory; using disk mode at %s",
+                  build_dir.c_str());
   } else if (host_memory_limited) {
-    RAFT_LOG_INFO(
-      "ACE: Graph does not fit in host memory. Using disk-mode with temporary storage %s",
-      build_dir.c_str());
+    RAFT_LOG_INFO("ACE build: graph does not fit in host memory; using disk mode at %s",
+                  build_dir.c_str());
   } else if (gpu_memory_limited) {
-    RAFT_LOG_INFO(
-      "ACE: Graph does not fit in GPU memory. Using disk-mode with temporary storage %s",
-      build_dir.c_str());
+    RAFT_LOG_INFO("ACE build: graph does not fit in GPU memory; using disk mode at %s",
+                  build_dir.c_str());
   } else if (use_disk) {
-    RAFT_LOG_INFO(
-      "ACE: Graph fits in host and GPU memory but disk mode is forced. Using disk-mode with "
-      "temporary storage %s",
-      build_dir.c_str());
+    RAFT_LOG_INFO("ACE build: graph fits in host and GPU memory but disk mode is forced; using %s",
+                  build_dir.c_str());
   } else {
-    RAFT_LOG_INFO("ACE: Graph fits in host and GPU memory. Using in-memory mode.");
+    RAFT_LOG_INFO("ACE build: graph fits in host and GPU memory; using in-memory mode");
   }
 
   return use_disk_mode;
@@ -1005,8 +1012,8 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions,
       disk_mode_host_required) {
     host_memory_insufficient = true;
     RAFT_LOG_WARN(
-      "ACE: Host memory insufficient for disk mode. Required: %.2f GiB, available: %.2f GiB. "
-      "Per-partition breakdown: dataset %.2f GiB, graph %.2f GiB, workspace %.2f GiB",
+      "ACE build: host memory is insufficient for disk mode: required=%.2f GiB available=%.2f "
+      "GiB (partition dataset=%.2f GiB graph=%.2f GiB workspace=%.2f GiB)",
       to_gib(disk_mode_host_required),
       to_gib(mem.available_host_memory),
       to_gib(mem.sub_dataset_size),
@@ -1017,8 +1024,8 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions,
     double available_for_scaling = usable_cpu_memory_fraction * mem.available_host_memory -
                                    mem.partition_labels_size - mem.id_mapping_size;
     RAFT_EXPECTS(available_for_scaling > 0,
-                 "ACE: Host memory insufficient even for constant overhead (labels + id_mapping). "
-                 "Required: %.2f GiB, available: %.2f GiB",
+                 "ACE build: host memory is insufficient even for constant overhead "
+                 "(labels + id_mapping): required=%.2f GiB available=%.2f GiB",
                  to_gib(mem.partition_labels_size + mem.id_mapping_size),
                  to_gib(usable_cpu_memory_fraction * mem.available_host_memory));
     host_suggested_partitions = static_cast<size_t>(
@@ -1040,9 +1047,8 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions,
       disk_mode_gpu_required) {
     gpu_memory_insufficient = true;
     RAFT_LOG_WARN(
-      "ACE: GPU memory insufficient for per-partition processing. Required: %.2f GiB, "
-      "available: %.2f GiB. Per-partition breakdown: dataset %.2f GiB, graph %.2f GiB, "
-      "workspace %.2f GiB",
+      "ACE build: GPU memory is insufficient for partition processing: required=%.2f GiB "
+      "available=%.2f GiB (partition dataset=%.2f GiB graph=%.2f GiB workspace=%.2f GiB)",
       to_gib(disk_mode_gpu_required),
       to_gib(mem.available_gpu_memory),
       to_gib(mem.sub_dataset_size),
@@ -1060,8 +1066,7 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions,
     size_t new_n_partitions = std::max(host_suggested_partitions, gpu_suggested_partitions);
 
     RAFT_LOG_WARN(
-      "ACE: Automatically increasing number of partitions from %zu to %zu to satisfy memory "
-      "constraints.%s%s",
+      "ACE build: increasing partitions from %zu to %zu to satisfy memory constraints.%s%s",
       original_n_partitions,
       new_n_partitions,
       host_memory_insufficient
@@ -1088,8 +1093,8 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions,
                                                                      guarantee_connectivity);
 
     RAFT_LOG_INFO(
-      "ACE: Updated per-partition memory estimates: dataset %.2f GiB, graph %.2f GiB, "
-      "host workspace %.2f GiB, GPU workspace %.2f GiB",
+      "ACE build: updated partition memory estimates dataset=%.2f GiB graph=%.2f GiB host "
+      "workspace=%.2f GiB GPU workspace=%.2f GiB",
       to_gib(mem.sub_dataset_size),
       to_gib(mem.sub_graph_size),
       to_gib(new_opt_host_ws),
@@ -1131,15 +1136,15 @@ index<T, IdxT> build_ace(raft::resources const& res,
   size_t dataset_size = dataset.extent(0);
   size_t dataset_dim  = dataset.extent(1);
 
-  RAFT_EXPECTS(dataset_size > 0, "ACE: Dataset must not be empty");
+  RAFT_EXPECTS(dataset_size > 0, "ACE build: dataset must not be empty");
   if (dataset_size < 1000) {
-    RAFT_LOG_WARN("ACE: Very small dataset size (%zu), consider using regular CAGRA build instead.",
+    RAFT_LOG_WARN("ACE build: very small dataset size (%zu); regular CAGRA may be simpler",
                   dataset_size);
   }
-  RAFT_EXPECTS(dataset_dim > 0, "ACE: Dataset dimension must be greater than 0");
+  RAFT_EXPECTS(dataset_dim > 0, "ACE build: dataset dimension must be greater than 0");
   RAFT_EXPECTS(params.intermediate_graph_degree > 0,
-               "ACE: Intermediate graph degree must be greater than 0");
-  RAFT_EXPECTS(params.graph_degree > 0, "ACE: Graph degree must be greater than 0");
+               "ACE build: intermediate graph degree must be greater than 0");
+  RAFT_EXPECTS(params.graph_degree > 0, "ACE build: graph degree must be greater than 0");
 
   size_t n_partitions = npartitions;
   if (n_partitions == 0) {
@@ -1152,17 +1157,22 @@ index<T, IdxT> build_ace(raft::resources const& res,
     n_partitions = dataset_size / min_required_per_partition;
     if (n_partitions < 2) {
       RAFT_LOG_WARN(
-        "ACE: Reduced number of partitions to the minimum of 2 to avoid tiny partitions. Consider "
-        "using regular CAGRA build instead.");
+        "ACE build: reduced partitions to the minimum of 2 to avoid tiny partitions; regular "
+        "CAGRA may be simpler");
       n_partitions = 2;
     } else {
-      RAFT_LOG_WARN("ACE: Reduced number of partitions to %zu to avoid tiny partitions",
-                    n_partitions);
+      RAFT_LOG_WARN("ACE build: reduced partitions to %zu to avoid tiny partitions", n_partitions);
     }
   }
 
   auto total_start = std::chrono::high_resolution_clock::now();
-  RAFT_LOG_INFO("ACE: Starting partitioned CAGRA build with %zu partitions", n_partitions);
+  RAFT_LOG_INFO(
+    "ACE build: start rows=%zu dim=%zu partitions=%zu graph_degree=%zu intermediate_degree=%zu",
+    dataset_size,
+    dataset_dim,
+    n_partitions,
+    static_cast<size_t>(params.graph_degree),
+    static_cast<size_t>(params.intermediate_graph_degree));
 
   size_t intermediate_degree = params.intermediate_graph_degree;
   size_t graph_degree        = params.graph_degree;
@@ -1228,7 +1238,7 @@ index<T, IdxT> build_ace(raft::resources const& res,
         build_dir + "/cagra_graph.npy", {dataset_size, graph_degree});
 
       RAFT_LOG_DEBUG(
-        "ACE: Wrote numpy headers (reordered: %zu, augmented: %zu, mapping: %zu, graph: %zu bytes)",
+        "ACE build: wrote numpy headers (reordered=%zu augmented=%zu mapping=%zu graph=%zu bytes)",
         reordered_header_size,
         augmented_header_size,
         mapping_header_size,
@@ -1259,11 +1269,9 @@ index<T, IdxT> build_ace(raft::resources const& res,
     auto partition_elapsed =
       std::chrono::duration_cast<std::chrono::milliseconds>(partition_end - partition_start)
         .count();
-    RAFT_LOG_INFO(
-      "ACE: Partition labeling completed in %ld ms (min_partition_size: "
-      "%lu)",
-      partition_elapsed,
-      min_partition_size);
+    RAFT_LOG_INFO("ACE build: partition labeling completed in %ld ms (min_partition_size=%lu)",
+                  partition_elapsed,
+                  min_partition_size);
 
     // Create vector lists for each partition
     auto vectorlist_start      = std::chrono::high_resolution_clock::now();
@@ -1288,7 +1296,7 @@ index<T, IdxT> build_ace(raft::resources const& res,
     auto vectorlist_elapsed =
       std::chrono::duration_cast<std::chrono::milliseconds>(vectorlist_end - vectorlist_start)
         .count();
-    RAFT_LOG_INFO("ACE: Vector list creation completed in %ld ms", vectorlist_elapsed);
+    RAFT_LOG_INFO("ACE build: partition mapping completed in %ld ms", vectorlist_elapsed);
 
     // Reorder the dataset based on partitions and store to disk. Uses write buffers to improve
     // performance.
@@ -1319,7 +1327,7 @@ index<T, IdxT> build_ace(raft::resources const& res,
     // Process each partition
     auto partition_processing_start = std::chrono::high_resolution_clock::now();
     for (size_t partition_id = 0; partition_id < n_partitions; partition_id++) {
-      RAFT_LOG_DEBUG("ACE: Processing partition %lu/%lu", partition_id + 1, n_partitions);
+      RAFT_LOG_DEBUG("ACE build: processing partition %lu/%lu", partition_id + 1, n_partitions);
       auto start = std::chrono::high_resolution_clock::now();
 
       // Extract vectors for this partition
@@ -1328,10 +1336,11 @@ index<T, IdxT> build_ace(raft::resources const& res,
       size_t sub_dataset_size           = core_sub_dataset_size + augmented_sub_dataset_size;
 
       if (sub_dataset_size == 0) {
-        RAFT_LOG_WARN("ACE: Skipping empty partition %lu", partition_id);
+        RAFT_LOG_WARN("ACE build: Skipping empty partition %lu", partition_id);
         continue;
       }
-      RAFT_LOG_DEBUG("ACE: Sub-dataset size: %lu (%lu + %lu)",
+      RAFT_LOG_DEBUG("ACE build: partition %lu rows=%lu (core=%lu augmented=%lu)",
+                     partition_id,
                      sub_dataset_size,
                      core_sub_dataset_size,
                      augmented_sub_dataset_size);
@@ -1444,12 +1453,12 @@ index<T, IdxT> build_ace(raft::resources const& res,
           ? to_mib(core_sub_dataset_size * dataset_dim * sizeof(T)) / (write_elapsed / 1000.0)
           : 0.0;
       RAFT_LOG_INFO(
-        "ACE: Partition %4lu (%8lu + %8lu) completed in %6ld ms: read %6ld ms (%7.1f MiB/s), "
-        "optimize %6ld ms, adjust %6ld ms, write %6ld ms (%7.1f MiB/s)",
+        "ACE build: partition %4lu completed in %6ld ms (core=%8lu augmented=%8lu read=%6ld ms "
+        "%.1f MiB/s optimize=%6ld ms adjust=%6ld ms write=%6ld ms %.1f MiB/s)",
         partition_id,
+        elapsed_ms,
         core_sub_dataset_size,
         augmented_sub_dataset_size,
-        elapsed_ms,
         read_elapsed,
         read_throughput,
         optimize_elapsed,
@@ -1462,7 +1471,7 @@ index<T, IdxT> build_ace(raft::resources const& res,
     auto partition_processing_elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
                                           partition_processing_end - partition_processing_start)
                                           .count();
-    RAFT_LOG_INFO("ACE: All partition processing completed in %ld ms (%zu partitions)",
+    RAFT_LOG_INFO("ACE build: partition graph build completed in %ld ms (partitions=%zu)",
                   partition_processing_elapsed,
                   n_partitions);
 
@@ -1472,7 +1481,7 @@ index<T, IdxT> build_ace(raft::resources const& res,
       const std::string augmented_dataset_path = build_dir + "/augmented_dataset.npy";
       if (std::filesystem::exists(augmented_dataset_path)) {
         std::filesystem::remove(augmented_dataset_path);
-        RAFT_LOG_INFO("ACE: Removed augmented dataset file to save disk space");
+        RAFT_LOG_INFO("ACE build: removed temporary augmented dataset");
       }
     }
 
@@ -1488,12 +1497,12 @@ index<T, IdxT> build_ace(raft::resources const& res,
           idx.update_dataset(res, dataset);
         } catch (std::bad_alloc& e) {
           RAFT_LOG_WARN(
-            "Insufficient GPU memory to attach dataset to ACE index. Only the graph will be "
-            "stored.");
+            "ACE build: insufficient GPU memory to attach dataset to index; only the graph will "
+            "be stored");
         } catch (raft::logic_error& e) {
           RAFT_LOG_WARN(
-            "Insufficient GPU memory to attach dataset to ACE index. Only the graph will be "
-            "stored.");
+            "ACE build: insufficient GPU memory to attach dataset to index; only the graph will "
+            "be stored");
         }
       }
     } else {
@@ -1501,37 +1510,36 @@ index<T, IdxT> build_ace(raft::resources const& res,
       idx.update_graph(res, std::move(graph_fd));
       idx.update_mapping(res, std::move(mapping_fd));
 
-      RAFT_LOG_INFO(
-        "ACE: Set disk storage at %s (dataset shape [%zu, %zu], graph shape [%zu, %zu])",
-        build_dir.c_str(),
-        idx.size(),
-        idx.dim(),
-        idx.size(),
-        idx.graph_degree());
+      RAFT_LOG_INFO("ACE build: disk artifacts ready at %s (dataset=[%zu,%zu] graph=[%zu,%zu])",
+                    build_dir.c_str(),
+                    idx.size(),
+                    idx.dim(),
+                    idx.size(),
+                    idx.graph_degree());
     }
 
     auto index_creation_end     = std::chrono::high_resolution_clock::now();
     auto index_creation_elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
                                     index_creation_end - index_creation_start)
                                     .count();
-    RAFT_LOG_INFO("ACE: Final index creation completed in %ld ms", index_creation_elapsed);
+    RAFT_LOG_INFO("ACE build: final index initialized in %ld ms", index_creation_elapsed);
 
     auto total_end = std::chrono::high_resolution_clock::now();
     auto total_elapsed =
       std::chrono::duration_cast<std::chrono::milliseconds>(total_end - total_start).count();
-    RAFT_LOG_INFO("ACE: Partitioned CAGRA build completed in %ld ms total", total_elapsed);
+    RAFT_LOG_INFO("ACE build: completed in %ld ms", total_elapsed);
 
     return idx;
   } catch (const std::exception& e) {
     // Clean up build directory on failure if we created it
-    RAFT_LOG_ERROR("ACE: Build failed with exception: %s", e.what());
+    RAFT_LOG_ERROR("ACE build: failed with exception: %s", e.what());
     if (cleanup_on_failure && !build_dir.empty()) {
-      RAFT_LOG_INFO("ACE: Cleaning up build directory: %s", build_dir.c_str());
+      RAFT_LOG_INFO("ACE build: cleaning up build directory %s", build_dir.c_str());
       try {
         std::filesystem::remove_all(build_dir);
-        RAFT_LOG_INFO("ACE: Successfully removed build directory");
+        RAFT_LOG_INFO("ACE build: removed build directory");
       } catch (const std::exception& cleanup_error) {
-        RAFT_LOG_WARN("ACE: Failed to clean up build directory: %s", cleanup_error.what());
+        RAFT_LOG_WARN("ACE build: failed to clean up build directory: %s", cleanup_error.what());
       }
     }
     // Re-throw the original exception
diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp
index d090de07f8..5be875d804 100644
--- a/cpp/src/neighbors/detail/hnsw.hpp
+++ b/cpp/src/neighbors/detail/hnsw.hpp
@@ -411,6 +411,13 @@ inline auto throughput_gib_per_s(size_t bytes, std::chrono::steady_clock::durati
   return elapsed_s > 0.0 ? to_gib(bytes) / elapsed_s : 0.0;
 }
 
+inline auto progress_step_10(size_t completed, size_t total) -> size_t
+{
+  if (total == 0) { return 100; }
+  const auto percent = (std::min(completed, total) * 100) / total;
+  return completed >= total ? 100 : (percent / 10) * 10;
+}
+
 inline auto make_hnsw_level_plan_from_levels(size_t n_rows,
                                              std::vector<uint8_t>&& levels,
                                              bool build_reverse_order,
@@ -486,7 +493,7 @@ void build_hnsw_upper_layer_graphs(
     const auto row_count     = plan.n_rows - start_idx;
     const auto neighbor_size = hnsw_upper_layer_degree(row_count, M);
 
-    RAFT_LOG_INFO("Compute hierarchy neighbors level %zu", pt_level);
+    RAFT_LOG_INFO("Layered HNSW artifact: computing upper-layer neighbors level=%zu", pt_level);
     auto host_neighbors = raft::make_host_matrix<IdxT, int64_t>(
       static_cast<int64_t>(row_count), static_cast<int64_t>(neighbor_size));
 
@@ -814,7 +821,7 @@ void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index<T, I
   RAFT_EXPECTS(base_link_row_bytes >= sizeof(hnswlib::linklistsizeint) + maxM0 * sizeof(IdxT),
                "Base link row size is too small");
 
-  RAFT_LOG_INFO("HNSW remap: loading ACE reordered-to-original mapping (%zu rows)", n_rows);
+  RAFT_LOG_INFO("Layered HNSW artifact: loading ACE row mapping (%zu rows)", n_rows);
   const auto mapping_start_time = std::chrono::steady_clock::now();
   const auto mapping_bytes      = n_rows * sizeof(IdxT);
   std::vector<IdxT> reordered_to_original(n_rows);
@@ -828,11 +835,18 @@ void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index<T, I
                  reordered_id);
   }
   const auto mapping_elapsed_ms = elapsed_ms_since(mapping_start_time);
-  RAFT_LOG_INFO("HNSW remap: mapping loaded in %ld ms (%.2f GiB, %.2f GiB/s)",
+  RAFT_LOG_INFO("Layered HNSW artifact: ACE row mapping loaded in %ld ms (%.2f GiB, %.2f GiB/s)",
                 mapping_elapsed_ms,
                 to_gib(mapping_bytes),
                 throughput_gib_per_s(mapping_bytes, mapping_elapsed_ms));
 
+  RAFT_LOG_INFO(
+    "Layered HNSW artifact: writing base topology with source-sequential graph reads "
+    "(rows=%zu degree=%zu maxM0=%zu)",
+    n_rows,
+    degree,
+    maxM0);
+
   const auto total_start_time       = std::chrono::steady_clock::now();
   const auto graph_row_bytes        = degree * sizeof(IdxT);
   const auto base_node_row_bytes    = sizeof(IdxT);
@@ -847,6 +861,14 @@ void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index<T, I
   size_t graph_bytes_read   = 0;
   size_t node_bytes_written = 0;
   size_t link_bytes_written = 0;
+  RAFT_LOG_INFO(
+    "Layered HNSW artifact: base topology batch_size=%zu graph_row=%zu bytes node_row=%zu bytes "
+    "link_row=%zu bytes",
+    batch_size,
+    graph_row_bytes,
+    base_node_row_bytes,
+    base_link_row_bytes);
+  size_t next_report_percent = 10;
   for (size_t source_start = 0; source_start < n_rows; source_start += batch_size) {
     const auto current_batch_size = std::min(batch_size, n_rows - source_start);
     const auto batch_bytes        = current_batch_size * graph_row_bytes;
@@ -894,11 +916,27 @@ void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index<T, I
                                  base_links_offset + source_start * base_link_row_bytes);
     node_bytes_written += current_node_bytes;
     link_bytes_written += current_link_bytes;
+
+    const auto completed_rows = source_start + current_batch_size;
+    const auto report_percent = progress_step_10(completed_rows, n_rows);
+    if (report_percent >= next_report_percent) {
+      const auto elapsed = elapsed_ms_since(total_start_time);
+      RAFT_LOG_INFO(
+        "Layered HNSW artifact: base topology progress %zu%% (%zu/%zu rows, read=%.2f GiB "
+        "write=%.2f GiB, %ld ms)",
+        report_percent,
+        completed_rows,
+        n_rows,
+        to_gib(graph_bytes_read),
+        to_gib(node_bytes_written + link_bytes_written),
+        elapsed);
+      next_report_percent = report_percent + 10;
+    }
   }
 
   const auto total_elapsed_ms = elapsed_ms_since(total_start_time);
   RAFT_LOG_INFO(
-    "HNSW remap: base topology written in %ld ms (graph_read=%.2f GiB %.2f GiB/s, "
+    "Layered HNSW artifact: base topology written in %ld ms (graph_read=%.2f GiB %.2f GiB/s, "
     "artifact_write=%.2f GiB %.2f GiB/s, nodes=%.2f GiB links=%.2f GiB)",
     total_elapsed_ms,
     to_gib(graph_bytes_read),
@@ -945,14 +983,15 @@ auto serialize_to_layered_hnsw_from_disk(
   auto appr_algo = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
     hnsw_index->get_space(), 1, (graph_degree_int + 1) / 2, params.ef_construction);
 
-  RAFT_LOG_INFO("Layered HNSW: generating hierarchy levels");
+  RAFT_LOG_INFO("Layered HNSW artifact: generating hierarchy levels");
   const auto hierarchy_start_time = std::chrono::steady_clock::now();
   auto hierarchy                  = make_random_hnsw_level_plan(n_rows, *appr_algo, nullptr);
   const auto hierarchy_elapsed_ms = elapsed_ms_since(hierarchy_start_time);
-  RAFT_LOG_INFO("Layered HNSW: hierarchy levels generated in %ld ms (max_level=%d, promoted=%zu)",
-                hierarchy_elapsed_ms,
-                hierarchy.max_level(),
-                hierarchy.promoted_count());
+  RAFT_LOG_INFO(
+    "Layered HNSW artifact: hierarchy levels generated in %ld ms (max_level=%d promoted=%zu)",
+    hierarchy_elapsed_ms,
+    hierarchy.max_level(),
+    hierarchy.promoted_count());
 
   layered_hnsw_file_metadata metadata;
   metadata.n_rows               = n_rows;
@@ -1015,12 +1054,12 @@ auto serialize_to_layered_hnsw_from_disk(
   cuvs::util::write_large_file(
     artifact_fd, hierarchy.levels.data(), metadata.levels_bytes, levels_offset);
   const auto levels_elapsed_ms = elapsed_ms_since(levels_start_time);
-  RAFT_LOG_INFO("Layered HNSW: wrote levels section in %ld ms (%.2f GiB, %.2f GiB/s)",
+  RAFT_LOG_INFO("Layered HNSW artifact: levels section written in %ld ms (%.2f GiB, %.2f GiB/s)",
                 levels_elapsed_ms,
                 to_gib(metadata.levels_bytes),
                 throughput_gib_per_s(metadata.levels_bytes, levels_elapsed_ms));
 
-  RAFT_LOG_INFO("Layered HNSW: writing hnswlib-ready base links");
+  RAFT_LOG_INFO("Layered HNSW artifact: writing hnswlib-ready base topology section");
   const auto layer0_start_time = std::chrono::steady_clock::now();
   write_layered_base_links_from_disk<T, IdxT>(index_,
                                               artifact_fd,
@@ -1029,14 +1068,16 @@ auto serialize_to_layered_hnsw_from_disk(
                                               metadata.base_link_row_bytes,
                                               metadata.maxM0);
   const auto layer0_elapsed_ms = elapsed_ms_since(layer0_start_time);
-  RAFT_LOG_INFO("Layered HNSW: base links written in %ld ms (%.2f GiB, %.2f GiB/s effective)",
-                layer0_elapsed_ms,
-                to_gib(metadata.base_links_bytes),
-                throughput_gib_per_s(metadata.base_links_bytes, layer0_elapsed_ms));
+  static_cast<void>(layer0_elapsed_ms);
+  RAFT_LOG_INFO(
+    "Layered HNSW artifact: base topology section written in %ld ms (%.2f GiB, %.2f GiB/s)",
+    layer0_elapsed_ms,
+    to_gib(metadata.base_nodes_bytes + metadata.base_links_bytes),
+    throughput_gib_per_s(metadata.base_nodes_bytes + metadata.base_links_bytes, layer0_elapsed_ms));
 
   size_t upper_graph_bytes_written = 0;
   if (hierarchy.hist.size() > 1) {
-    RAFT_LOG_INFO("Layered HNSW: gathering promoted vectors");
+    RAFT_LOG_INFO("Layered HNSW artifact: gathering promoted vectors");
     const auto gather_start_time = std::chrono::steady_clock::now();
     auto host_query_set =
       raft::make_host_matrix<T, int64_t>(static_cast<int64_t>(hierarchy.promoted_count()), dim);
@@ -1050,10 +1091,11 @@ auto serialize_to_layered_hnsw_from_disk(
     }
     const auto gather_elapsed_ms = elapsed_ms_since(gather_start_time);
     const auto gathered_bytes    = hierarchy.promoted_count() * dim * sizeof(T);
-    RAFT_LOG_INFO("Layered HNSW: gathered promoted vectors in %ld ms (%.2f GiB copied, %.2f GiB/s)",
-                  gather_elapsed_ms,
-                  to_gib(gathered_bytes),
-                  throughput_gib_per_s(gathered_bytes, gather_elapsed_ms));
+    RAFT_LOG_INFO(
+      "Layered HNSW artifact: promoted vectors gathered in %ld ms (%.2f GiB copied, %.2f GiB/s)",
+      gather_elapsed_ms,
+      to_gib(gathered_bytes),
+      throughput_gib_per_s(gathered_bytes, gather_elapsed_ms));
 
     auto promoted_dataset = raft::make_host_matrix_view<const T, int64_t, raft::row_major>(
       host_query_set.data_handle(), host_query_set.extent(0), host_query_set.extent(1));
@@ -1066,10 +1108,10 @@ auto serialize_to_layered_hnsw_from_disk(
       index_.metric(),
       [&](size_t pt_level, size_t start_idx, auto& host_neighbors) {
         const auto& layer = metadata.layers[pt_level - 1];
-        RAFT_LOG_INFO("Layered HNSW: writing upper layer %zu (%zu rows, degree %zu)",
-                      layer.level,
-                      layer.row_count,
-                      layer.degree);
+        RAFT_LOG_DEBUG("Layered HNSW artifact: writing upper layer level=%zu rows=%zu degree=%zu",
+                       layer.level,
+                       layer.row_count,
+                       layer.degree);
         const auto layer_write_start_time = std::chrono::steady_clock::now();
         const size_t target_batch_bytes   = 64 * 1024 * 1024;
         const size_t row_bytes            = sizeof(IdxT) + metadata.upper_link_row_bytes;
@@ -1112,28 +1154,29 @@ auto serialize_to_layered_hnsw_from_disk(
         const auto layer_bytes = layer.row_count * (sizeof(IdxT) + metadata.upper_link_row_bytes);
         upper_graph_bytes_written += layer_bytes;
         const auto layer_write_elapsed_ms = elapsed_ms_since(layer_write_start_time);
-        RAFT_LOG_INFO("Layered HNSW: upper layer %zu written in %ld ms (%.2f GiB, %.2f GiB/s)",
-                      layer.level,
-                      layer_write_elapsed_ms,
-                      to_gib(layer_bytes),
-                      throughput_gib_per_s(layer_bytes, layer_write_elapsed_ms));
+        static_cast<void>(layer_write_elapsed_ms);
+        RAFT_LOG_INFO(
+          "Layered HNSW artifact: upper layer level=%zu written in %ld ms (%.2f GiB, %.2f GiB/s)",
+          layer.level,
+          layer_write_elapsed_ms,
+          to_gib(layer_bytes),
+          throughput_gib_per_s(layer_bytes, layer_write_elapsed_ms));
       });
     const auto upper_layers_elapsed_ms = elapsed_ms_since(upper_layers_start_time);
     RAFT_LOG_INFO(
-      "Layered HNSW: upper layers generated and written in %ld ms (%.2f GiB written, %.2f "
-      "GiB/s effective)",
+      "Layered HNSW artifact: upper layers generated and written in %ld ms (%.2f GiB, %.2f "
+      "GiB/s)",
       upper_layers_elapsed_ms,
       to_gib(upper_graph_bytes_written),
       throughput_gib_per_s(upper_graph_bytes_written, upper_layers_elapsed_ms));
   }
 
   const auto total_elapsed_ms = elapsed_ms_since(total_start_time);
-  RAFT_LOG_INFO(
-    "Layered HNSW index written to: %s in %ld ms (artifact %.2f GiB, %.2f GiB/s effective)",
-    artifact_file.string().c_str(),
-    total_elapsed_ms,
-    to_gib(final_file_size),
-    throughput_gib_per_s(final_file_size, total_elapsed_ms));
+  RAFT_LOG_INFO("Layered HNSW artifact: wrote %s in %ld ms (artifact=%.2f GiB, %.2f GiB/s)",
+                artifact_file.string().c_str(),
+                total_elapsed_ms,
+                to_gib(final_file_size),
+                throughput_gib_per_s(final_file_size, total_elapsed_ms));
   return artifact_file.string();
 }
 
@@ -2096,7 +2139,13 @@ auto deserialize_layered_hnsw(raft::resources const& res,
 
   RAFT_EXPECTS(dataset_file.shape.size() == 2 && dataset_file.shape[0] == metadata.n_rows &&
                  dataset_file.shape[1] == metadata.dim,
-               "Layered HNSW dataset shape mismatch");
+               "Layered HNSW dataset shape mismatch: artifact rows=%zu dim=%zu, dataset rows=%zu "
+               "dim=%zu path=%s",
+               metadata.n_rows,
+               metadata.dim,
+               dataset_file.shape.size() > 0 ? dataset_file.shape[0] : 0,
+               dataset_file.shape.size() > 1 ? dataset_file.shape[1] : 0,
+               params.dataset_path.c_str());
   RAFT_EXPECTS(metadata.levels_bytes == metadata.n_rows * sizeof(uint8_t),
                "Layered HNSW levels section size mismatch");
   RAFT_EXPECTS(metadata.base_nodes_bytes == metadata.n_rows * sizeof(uint32_t),
@@ -2117,9 +2166,32 @@ auto deserialize_layered_hnsw(raft::resources const& res,
                 dataset_open_elapsed_ms,
                 params.dataset_path.c_str());
 
+  const auto dataset_total_bytes = metadata.n_rows * metadata.dim * sizeof(T);
+  const auto deserialize_progress_total_bytes =
+    metadata.levels_bytes + dataset_total_bytes + metadata.base_nodes_bytes +
+    metadata.base_links_bytes + metadata.upper_nodes_bytes + metadata.upper_links_bytes;
+  size_t deserialize_progress_bytes        = 0;
+  size_t deserialize_next_progress_percent = 10;
+  auto log_deserialize_progress            = [&](size_t bytes_loaded) {
+    if (deserialize_progress_total_bytes == 0) { return; }
+    deserialize_progress_bytes =
+      std::min(deserialize_progress_total_bytes, deserialize_progress_bytes + bytes_loaded);
+    const auto progress_percent =
+      (deserialize_progress_bytes * 100) / deserialize_progress_total_bytes;
+    const auto progress_step = std::min<size_t>(100, (progress_percent / 10) * 10);
+    if (progress_step >= deserialize_next_progress_percent) {
+      RAFT_LOG_INFO("Layered HNSW load: deserialize progress %zu%% (%.2f/%.2f GiB)",
+                    progress_step,
+                    to_gib(deserialize_progress_bytes),
+                    to_gib(deserialize_progress_total_bytes));
+      deserialize_next_progress_percent = progress_step + 10;
+    }
+  };
+
   const auto levels_start_time = std::chrono::steady_clock::now();
   std::vector<uint8_t> levels_u8(metadata.n_rows);
   cuvs::util::read_large_file(artifact_fd, levels_u8.data(), metadata.levels_bytes, levels_offset);
+  log_deserialize_progress(metadata.levels_bytes);
   const auto max_level_in_levels = *std::max_element(levels_u8.begin(), levels_u8.end());
   RAFT_EXPECTS(static_cast<int>(max_level_in_levels) == metadata.maxlevel,
                "Layered HNSW levels max level (%d) does not match artifact maxlevel (%d)",
@@ -2203,6 +2275,7 @@ auto deserialize_layered_hnsw(raft::resources const& res,
       throw std::runtime_error("Not enough memory to allocate HNSW upper linklists");
     }
     base_copy_time += std::chrono::steady_clock::now() - batch_timer;
+    log_deserialize_progress(current_dataset_bytes);
   }
   const auto base_elapsed_ms = elapsed_ms_since(base_start_time);
   RAFT_LOG_INFO(
@@ -2257,6 +2330,7 @@ auto deserialize_layered_hnsw(raft::resources const& res,
     }
     RAFT_EXPECTS(!invalid_node_id, "Invalid base-layer node id in layered HNSW artifact");
     base_topology_copy_time += std::chrono::steady_clock::now() - batch_timer;
+    log_deserialize_progress(current_base_topology_bytes);
   }
   const auto base_topology_elapsed_ms = elapsed_ms_since(base_topology_start_time);
   RAFT_LOG_INFO(
@@ -2320,6 +2394,7 @@ auto deserialize_layered_hnsw(raft::resources const& res,
       RAFT_EXPECTS(!invalid_node_level,
                    "Layered HNSW artifact references a node at an invalid upper level");
       upper_copy_time += std::chrono::steady_clock::now() - batch_timer;
+      log_deserialize_progress(current_upper_bytes);
     }
   }
   const auto upper_elapsed_ms = elapsed_ms_since(upper_start_time);
@@ -2412,15 +2487,14 @@ std::unique_ptr<index<T>> build(raft::resources const& res,
   cagra_ace_params.max_gpu_memory_gb  = ace_params.max_gpu_memory_gb;
   cagra_params.graph_build_params     = cagra_ace_params;
 
-  RAFT_LOG_INFO(
-    "hnsw::build - Building HNSW index using ACE with %zu partitions, ef_construction=%zu",
-    ace_params.npartitions,
-    ace_params.ef_construction);
+  RAFT_LOG_INFO("HNSW ACE build: building CAGRA graph (partitions=%zu ef_construction=%zu)",
+                cagra_ace_params.npartitions,
+                cagra_ace_params.ef_construction);
 
   // Build CAGRA index using ACE
   auto cagra_index = cuvs::neighbors::cagra::build(res, cagra_params, dataset);
 
-  RAFT_LOG_INFO("hnsw::build - Converting CAGRA index to HNSW format");
+  RAFT_LOG_INFO("HNSW ACE build: converting CAGRA graph to HNSW hierarchy");
 
   // Convert CAGRA index to HNSW index
   return from_cagra<T>(res, params, cagra_index, dataset);