From cce66e66360d2a387b017c4566ce8666d5158580 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Sun, 3 Aug 2025 16:50:24 +0900
Subject: [PATCH 001/119] Add pq_len=8 instances

---
 cpp/CMakeLists.txt                            |  12 ++
 .../detail/cagra/compute_distance-ext.cuh     | 134 +++++++++++++++++-
 .../detail/cagra/compute_distance.cu          |  14 +-
 .../cagra/compute_distance_00_generate.py     |   6 +-
 ...d_float_uint32_dim128_t8_8pq_8subd_half.cu |  41 ++++++
 ..._float_uint32_dim256_t16_8pq_8subd_half.cu |  41 ++++++
 ..._float_uint32_dim512_t32_8pq_8subd_half.cu |  41 ++++++
 ...ed_half_uint32_dim128_t8_8pq_8subd_half.cu |  41 ++++++
 ...d_half_uint32_dim256_t16_8pq_8subd_half.cu |  41 ++++++
 ...d_half_uint32_dim512_t32_8pq_8subd_half.cu |  41 ++++++
 ...ed_int8_uint32_dim128_t8_8pq_8subd_half.cu |  41 ++++++
 ...d_int8_uint32_dim256_t16_8pq_8subd_half.cu |  41 ++++++
 ...d_int8_uint32_dim512_t32_8pq_8subd_half.cu |  41 ++++++
 ...d_uint8_uint32_dim128_t8_8pq_8subd_half.cu |  41 ++++++
 ..._uint8_uint32_dim256_t16_8pq_8subd_half.cu |  41 ++++++
 ..._uint8_uint32_dim512_t32_8pq_8subd_half.cu |  41 ++++++
 16 files changed, 653 insertions(+), 5 deletions(-)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 50ee1a0ce2..ed78fcaca5 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -237,28 +237,40 @@ if(BUILD_SHARED_LIBS)
     src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu
     src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu
     src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu
     src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
index faca960808..3957abc6b5 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -62,6 +62,15 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            float,
                                            uint32_t,
                                            float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           8,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
 extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
                                                 16,
                                                 256,
@@ -92,6 +101,15 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            float,
                                            uint32_t,
                                            float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           8,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
 extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
                                                 32,
                                                 512,
@@ -122,6 +140,15 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            float,
                                            uint32_t,
                                            float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           8,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
 extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
                                                 8,
                                                 128,
@@ -152,6 +179,15 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            uint32_t,
                                            float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           8,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
 extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
                                                 16,
                                                 256,
@@ -182,6 +218,15 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            uint32_t,
                                            float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           8,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
 extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
                                                 32,
                                                 512,
@@ -212,6 +257,15 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            uint32_t,
                                            float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           8,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
 extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
                                                 8,
                                                 128,
@@ -242,6 +296,15 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            int8_t,
                                            uint32_t,
                                            float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           8,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
 extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
                                                 16,
                                                 256,
@@ -272,6 +335,15 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            int8_t,
                                            uint32_t,
                                            float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           8,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
 extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
                                                 32,
                                                 512,
@@ -302,6 +374,15 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            int8_t,
                                            uint32_t,
                                            float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           8,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
 extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
                                                 8,
                                                 128,
@@ -332,6 +413,15 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            uint8_t,
                                            uint32_t,
                                            float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           8,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
 extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
                                                 16,
                                                 256,
@@ -362,6 +452,15 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            uint8_t,
                                            uint32_t,
                                            float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           8,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
 extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
                                                 32,
                                                 512,
@@ -392,6 +491,15 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            uint8_t,
                                            uint32_t,
                                            float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           8,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
 extern template struct standard_descriptor_spec<DistanceType::BitwiseHamming,
                                                 8,
                                                 128,
@@ -416,50 +524,62 @@ extern template struct instance_selector<
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 32, 512, uint8_t, uint32_t, float>>;
@@ -469,50 +589,62 @@ using descriptor_instances = instance_selector<
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 32, 512, uint8_t, uint32_t, float>>;
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu
index 4158863465..cbb6f4540b 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,50 +34,62 @@ template struct instance_selector<
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 32, 512, uint8_t, uint32_t, float>>;
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
index 1c813f1a50..9834ca696f 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
 import glob
 
 template = """/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -54,7 +54,7 @@
 #mxdim_team = [(32, 8), (64, 16), (128, 32)]
 
 pq_bits = [8]
-pq_lens = [2, 4]
+pq_lens = [2, 4, 8]
 
 # rblock = [(256, 4), (512, 2), (1024, 1)]
 # rcandidates = [32]
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu
new file mode 100644
index 0000000000..52fb9140fe
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    8,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu
new file mode 100644
index 0000000000..41cea6b5b1
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    8,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu
new file mode 100644
index 0000000000..b0f650f45a
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    8,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu
new file mode 100644
index 0000000000..82d5d738ab
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    8,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu
new file mode 100644
index 0000000000..a8b1472cff
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    8,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu
new file mode 100644
index 0000000000..dc56563595
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    8,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu
new file mode 100644
index 0000000000..4cb61e18ad
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    8,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu
new file mode 100644
index 0000000000..5463f42ab5
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    8,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu
new file mode 100644
index 0000000000..64d436115b
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    8,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu
new file mode 100644
index 0000000000..eb9bc63041
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    8,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu
new file mode 100644
index 0000000000..3825658388
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    8,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu
new file mode 100644
index 0000000000..f4fc937e25
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    8,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail

From 43e6145e21ce04013f18c01dafb698ba4414475c Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Fri, 15 Aug 2025 00:46:01 +0900
Subject: [PATCH 002/119] Update CAGRA-Q test

---
 cpp/tests/neighbors/ann_cagra.cuh | 48 ++++++++++++++++++-
 cpp/tests/neighbors/ann_utils.cuh | 35 +++++++++++---
 cpp/tests/neighbors/vpq_utils.cuh | 77 +++++++++++++++++++++++++++++++
 3 files changed, 151 insertions(+), 9 deletions(-)
 create mode 100644 cpp/tests/neighbors/vpq_utils.cuh

diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh
index e10e1300ca..156b55ad24 100644
--- a/cpp/tests/neighbors/ann_cagra.cuh
+++ b/cpp/tests/neighbors/ann_cagra.cuh
@@ -17,12 +17,14 @@
 
 #include "../test_utils.cuh"
 #include "ann_utils.cuh"
+#include "vpq_utils.cuh"
 #include <raft/core/resource/cuda_stream.hpp>
 
 #include "naive_knn.cuh"
 
 #include <cuvs/distance/distance.hpp>
 #include <cuvs/neighbors/cagra.hpp>
+#include <cuvs/neighbors/common.hpp>
 #include <cuvs/neighbors/composite/merge.hpp>
 #include <cuvs/neighbors/index_wrappers.hpp>
 #include <raft/core/device_mdspan.hpp>
@@ -446,6 +448,46 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
         raft::update_host(indices_Cagra.data(), indices_dev.data(), queries_size, stream_);
 
         raft::resource::sync_stream(handle_);
+
+        reference_recall = 1;
+        if (ps.compression.has_value()) {
+          auto decoded_dataset =
+            raft::make_device_matrix<DataT, int64_t>(handle_, ps.n_rows, ps.dim);
+          cuvs::neighbors::decode_vpq_dataset<DataT, half>(
+            decoded_dataset.view(),
+            dynamic_cast<const cuvs::neighbors::vpq_dataset<half, int64_t>&>(index.data()),
+            raft::resource::get_cuda_stream(handle_));
+          auto indices_out_view = raft::make_device_matrix_view<SearchIdxT, int64_t>(
+            indices_dev.data(), ps.n_queries, ps.k);
+          auto dists_out_view = raft::make_device_matrix_view<DistanceT, int64_t>(
+            distances_dev.data(), ps.n_queries, ps.k);
+
+          cuvs::neighbors::naive_knn<DistanceT, DataT, SearchIdxT>(handle_,
+                                                                   dists_out_view.data_handle(),
+                                                                   indices_out_view.data_handle(),
+                                                                   search_queries.data(),
+                                                                   decoded_dataset.data_handle(),
+                                                                   ps.n_queries,
+                                                                   ps.n_rows,
+                                                                   ps.dim,
+                                                                   ps.k,
+                                                                   ps.metric);
+          std::vector<SearchIdxT> indices_vpq_dataset(queries_size);
+          std::vector<DistanceT> distances_vpq_dataset(queries_size);
+          raft::update_host(
+            distances_vpq_dataset.data(), dists_out_view.data_handle(), queries_size, stream_);
+          raft::update_host(
+            indices_vpq_dataset.data(), indices_out_view.data_handle(), queries_size, stream_);
+
+          reference_recall = std::get<1>(calc_recall(indices_naive,
+                                                     indices_vpq_dataset,
+                                                     distances_naive,
+                                                     distances_vpq_dataset,
+                                                     ps.n_queries,
+                                                     ps.k,
+                                                     0));
+          printf("reference_recall = %e\n", reference_recall);
+        }
       }
 
       // for (int i = 0; i < min(ps.n_queries, 10); i++) {
@@ -455,7 +497,7 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
       //   print_vector("T", distances_naive.data() + i * ps.k, ps.k, std::cout);
       //   print_vector("C", distances_Cagra.data() + i * ps.k, ps.k, std::cout);
       // }
-      double min_recall = ps.min_recall;
+      double min_recall = ps.min_recall * reference_recall;
       EXPECT_TRUE(eval_neighbours(indices_naive,
                                   indices_Cagra,
                                   distances_naive,
@@ -504,6 +546,7 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
   AnnCagraInputs ps;
   rmm::device_uvector<DataT> database;
   rmm::device_uvector<DataT> search_queries;
+  double reference_recall = 1.0;  // scales min_recall; 1.0 keeps the threshold as-is if unset
 };
 
 template <typename DistanceT, typename DataT, typename IdxT>
@@ -1325,7 +1368,8 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL,
      cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL});  // don't demand high recall
                                                                 // without refinement
-  for (uint32_t pq_len : {2}) {  // for now, only pq_len = 2 is supported, more options coming  soon
+  for (uint32_t pq_len :
+       {2, 4, 8}) {  // PQ subspace lengths exercised by the VPQ test inputs
     for (uint32_t vq_n_centers : {100, 1000}) {
       for (auto input : inputs2) {
         vpq_params ps{};
diff --git a/cpp/tests/neighbors/ann_utils.cuh b/cpp/tests/neighbors/ann_utils.cuh
index 0c01c48c9c..dbf5c1f6b3 100644
--- a/cpp/tests/neighbors/ann_utils.cuh
+++ b/cpp/tests/neighbors/ann_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -207,7 +207,7 @@ auto eval_recall(const std::vector<T>& expected_idx,
                  double min_recall,
                  bool test_unique = true) -> testing::AssertionResult
 {
-  auto [actual_recall, match_count, total_count] =
+  auto [actual_recall, index_based_actual_recall, match_count, total_count] =
     calc_recall(expected_idx, actual_idx, rows, cols);
   double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps);
   RAFT_LOG_INFO("Recall = %f (%zu/%zu), the error is %2.1f%% %s the threshold (eps = %f).",
@@ -239,8 +239,9 @@ auto calc_recall(const std::vector<T>& expected_idx,
                  size_t cols,
                  double eps)
 {
-  size_t match_count = 0;
-  size_t total_count = static_cast<size_t>(rows) * static_cast<size_t>(cols);
+  size_t match_count       = 0;
+  size_t index_match_count = 0;
+  size_t total_count       = static_cast<size_t>(rows) * static_cast<size_t>(cols);
   for (size_t i = 0; i < rows; ++i) {
     for (size_t k = 0; k < cols; ++k) {
       size_t idx_k  = i * cols + k;  // row major assumption!
@@ -259,8 +260,28 @@ auto calc_recall(const std::vector<T>& expected_idx,
       }
     }
   }
-  return std::make_tuple(
-    static_cast<double>(match_count) / static_cast<double>(total_count), match_count, total_count);
+
+  // Index based recall
+  for (size_t i = 0; i < rows; ++i) {
+    for (size_t k = 0; k < cols; ++k) {
+      size_t idx_k = i * cols + k;  // row major assumption!
+      auto act_idx = actual_idx[idx_k];
+      for (size_t j = 0; j < cols; ++j) {
+        size_t idx   = i * cols + j;  // row major assumption!
+        auto exp_idx = expected_idx[idx];
+
+        if (act_idx == exp_idx) {
+          index_match_count++;
+          break;
+        }
+      }
+    }
+  }
+
+  return std::make_tuple(static_cast<double>(match_count) / static_cast<double>(total_count),
+                         static_cast<double>(index_match_count) / static_cast<double>(total_count),
+                         match_count,
+                         total_count);
 }
 
 /** same as eval_recall, but in case indices do not match,
@@ -277,7 +298,7 @@ auto eval_neighbours(const std::vector<T>& expected_idx,
                      bool test_unique      = true,
                      size_t max_duplicates = 0) -> testing::AssertionResult
 {
-  auto [actual_recall, match_count, total_count] =
+  auto [actual_recall, index_based_actual_recall, match_count, total_count] =
     calc_recall(expected_idx, actual_idx, expected_dist, actual_dist, rows, cols, eps);
   double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps);
 
diff --git a/cpp/tests/neighbors/vpq_utils.cuh b/cpp/tests/neighbors/vpq_utils.cuh
new file mode 100644
index 0000000000..383e5ef063
--- /dev/null
+++ b/cpp/tests/neighbors/vpq_utils.cuh
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuvs/neighbors/common.hpp>
+
+#include <raft/util/cuda_rt_essentials.hpp>
+
+#include <cstddef>
+#include <cstdint>
+
+namespace cuvs::neighbors {
+/**
+ * @brief Reconstruct dataset rows from their VPQ encoding, one warp per row.
+ *
+ * An encoded row starts with a 32-bit VQ code followed by one 8-bit PQ code per group of
+ * `pq_subspace_dim` consecutive dimensions. Output element `i` is the sum of element `i` of the
+ * selected VQ codebook row and the matching element of the selected PQ codebook entry.
+ *
+ * @tparam data_t element type of the decoded output
+ * @tparam math_t element type of both codebooks
+ *
+ * @param decoded_dataset_ptr output buffer; row `r` starts at `decoded_dataset_ptr + r * ldd`
+ * @param ldd row stride of the output, in elements
+ * @param vq_codebook_ptr VQ codebook; entry `c` starts at `vq_codebook_ptr + c * ldv`
+ * @param ldv row stride of the VQ codebook, in elements
+ * @param pq_codebook_ptr PQ codebook; entry `c` starts at `pq_codebook_ptr + c * pq_subspace_dim`
+ * @param pq_subspace_dim number of consecutive dimensions covered by one PQ code
+ * @param pq_table_size presumably the number of PQ codebook entries; not read by the kernel
+ * @param dataset_dim number of elements written per output row
+ * @param dataset_size number of rows to decode
+ * @param data_ptr encoded rows; row `r` starts at `data_ptr + r * ldi`
+ * @param ldi row stride of the encoded data, in bytes
+ */
+template <class data_t, class math_t>
+__global__ void decode_vpq_dataset_kernel(data_t* const decoded_dataset_ptr,
+                                          const uint32_t ldd,
+                                          const math_t* const vq_codebook_ptr,
+                                          const uint32_t ldv,
+                                          const math_t* const pq_codebook_ptr,
+                                          const uint32_t pq_subspace_dim,
+                                          const uint32_t pq_table_size,
+                                          const uint32_t dataset_dim,
+                                          const size_t dataset_size,
+                                          const uint8_t* const data_ptr,
+                                          const uint32_t ldi)
+{
+  constexpr uint32_t warp_size = 32;
+  // Widen before multiplying so the global thread index cannot wrap in 32-bit arithmetic.
+  const size_t batch_id =
+    (static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x) / warp_size;
+  if (batch_id >= dataset_size) { return; }
+
+  // NOTE(review): the 32-bit load below assumes every encoded row is 4-byte aligned - confirm.
+  const auto local_data_ptr = data_ptr + ldi * batch_id;
+  const auto vq_code        = *reinterpret_cast<const uint32_t*>(local_data_ptr);
+  const auto pq_code_ptr    = local_data_ptr + sizeof(uint32_t);
+  const auto vq_vec_ptr     = vq_codebook_ptr + static_cast<size_t>(vq_code) * ldv;
+  auto local_dst_ptr        = decoded_dataset_ptr + batch_id * ldd;
+
+  // Lanes of the warp stride over the dimensions of the row.
+  const auto lane_id = threadIdx.x % warp_size;
+  for (uint32_t i = lane_id; i < dataset_dim; i += warp_size) {
+    const auto pq_code = pq_code_ptr[i / pq_subspace_dim];
+    const auto pq_v    = pq_codebook_ptr[pq_code * pq_subspace_dim + (i % pq_subspace_dim)];
+
+    // NOTE(review): each term is converted to data_t before the addition; for integer data_t this
+    // drops the fractional parts separately - confirm this is the intended reconstruction.
+    local_dst_ptr[i] = static_cast<data_t>(vq_vec_ptr[i]) + static_cast<data_t>(pq_v);
+  }
+}
+
+/**
+ * @brief Decode a whole VPQ dataset into a dense device matrix.
+ *
+ * Launches `decode_vpq_dataset_kernel` on `cuda_stream` with one warp per row. The stream is not
+ * synchronized here. Nothing is launched when the dataset has no rows.
+ *
+ * @param decoded_dataset output matrix; must have as many rows as `vpq_dataset.data`
+ * @param vpq_dataset encoded dataset together with its VQ and PQ codebooks
+ * @param cuda_stream stream on which the kernel is launched
+ */
+template <class data_t, class math_t>
+void decode_vpq_dataset(raft::device_matrix_view<data_t, int64_t> decoded_dataset,
+                        const cuvs::neighbors::vpq_dataset<math_t, int64_t>& vpq_dataset,
+                        cudaStream_t cuda_stream)
+{
+  const auto dataset_size = decoded_dataset.extent(0);
+  RAFT_EXPECTS(vpq_dataset.data.extent(0) == dataset_size, "Dataset sizes mismatch");
+  // An empty grid is not a valid launch configuration, and there is nothing to decode anyway.
+  if (dataset_size == 0) { return; }
+
+  constexpr uint32_t block_size  = 256;
+  constexpr uint32_t warp_size   = 32;
+  constexpr int64_t vecs_per_cta = block_size / warp_size;
+  const auto grid_size = raft::div_rounding_up_safe(decoded_dataset.extent(0), vecs_per_cta);
+
+  decode_vpq_dataset_kernel<data_t, math_t>
+    <<<grid_size, block_size, 0, cuda_stream>>>(decoded_dataset.data_handle(),
+                                                decoded_dataset.stride(0),
+                                                vpq_dataset.vq_code_book.data_handle(),
+                                                vpq_dataset.vq_code_book.stride(0),
+                                                vpq_dataset.pq_code_book.data_handle(),
+                                                vpq_dataset.pq_len(),
+                                                1u << vpq_dataset.pq_bits(),
+                                                vpq_dataset.dim(),
+                                                dataset_size,
+                                                vpq_dataset.data.data_handle(),
+                                                vpq_dataset.data.stride(0));
+  // Surface launch-configuration errors here instead of at some later, unrelated CUDA call.
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+}  // namespace cuvs::neighbors

From 16321bcd48bef9b967abf7dc3859ee53d2db8078 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 3 Sep 2025 12:36:54 +0900
Subject: [PATCH 003/119] Update CAGRA-Q distance kernel

---
 .../detail/cagra/compute_distance_vpq-impl.cuh       | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 6caa173f2c..3c11a8d30f 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -234,10 +234,11 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
   constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim;
   constexpr auto PQ_BITS         = DescriptorT::kPqBits;
   constexpr auto PQ_LEN          = DescriptorT::kPqLen;
+  using PQ_CODEBOOK_LOAD_T       = uint32_t;
 
   const uint32_t query_ptr = pq_codebook_ptr + DescriptorT::kSMemCodeBookSizeInBytes;
   static_assert(PQ_BITS == 8, "Only pq_bits == 8 is supported at the moment.");
-  constexpr uint32_t vlen = 4;  // **** DO NOT CHANGE ****
+  constexpr uint32_t vlen = utils::size_of<PQ_CODEBOOK_LOAD_T>() / utils::size_of<uint8_t>();
   constexpr uint32_t nelem =
     raft::div_rounding_up_unsafe<uint32_t>(DatasetBlockDim / PQ_LEN, TeamSize * vlen);
 
@@ -250,13 +251,12 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
   for (uint32_t elem_offset = 0; elem_offset * PQ_LEN < dim;
        elem_offset += DatasetBlockDim / PQ_LEN) {
     // Loading PQ codes
-    uint32_t pq_codes[nelem];
+    PQ_CODEBOOK_LOAD_T pq_codes[nelem];
 #pragma unroll
     for (std::uint32_t e = 0; e < nelem; e++) {
       const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen;
       if (k >= n_subspace) break;
-      // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory)
-      device::ldg_cg(pq_codes[e], reinterpret_cast<const std::uint32_t*>(dataset_ptr + 4 + k));
+      device::ldg_cg(pq_codes[e], reinterpret_cast<const PQ_CODEBOOK_LOAD_T*>(dataset_ptr + 4 + k));
     }
     //
     if constexpr (PQ_LEN % 2 == 0) {
@@ -274,7 +274,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
           device::ldg_ca(vq_vals[m], vq_code_book_ptr + d);
         }
         // Compute distance
-        std::uint32_t pq_code = pq_codes[e];
+        PQ_CODEBOOK_LOAD_T pq_code = pq_codes[e];
 #pragma unroll
         for (std::uint32_t v = 0; v < vlen; v++) {
           if (PQ_LEN * (v + k) >= dim) break;

From bfdc2d4fc802ec83c4d70ca4c60ad7cf1cff54e7 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 3 Sep 2025 15:35:48 +0900
Subject: [PATCH 004/119] Add DatasetBlockDim check

---
 cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 3c11a8d30f..cbd8c71b08 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -241,6 +241,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
   constexpr uint32_t vlen = utils::size_of<PQ_CODEBOOK_LOAD_T>() / utils::size_of<uint8_t>();
   constexpr uint32_t nelem =
     raft::div_rounding_up_unsafe<uint32_t>(DatasetBlockDim / PQ_LEN, TeamSize * vlen);
+  static_assert(DatasetBlockDim / PQ_LEN >= TeamSize * vlen, "DatasetBlockDim is too small");
 
   constexpr auto kTeamMask = DescriptorT::kTeamSize - 1;
   constexpr auto kTeamVLen = TeamSize * vlen;

From 23c02e16689c53d56580b4ffc90d4ae01f41f3ea Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 3 Sep 2025 16:33:32 +0900
Subject: [PATCH 005/119] Update VPQ compute distance kernel

---
 cpp/CMakeLists.txt                            |  40 +-
 .../detail/cagra/compute_distance-ext.cuh     | 649 ++++++++++++------
 .../detail/cagra/compute_distance.cu          | 104 ++-
 .../cagra/compute_distance_00_generate.py     |  30 +-
 .../cagra/compute_distance_vpq-impl.cuh       |   8 +-
 ...float_uint32_dim1024_t32_8pq_8subd_half.cu |  41 ++
 ..._float_uint32_dim128_t4_8pq_8subd_half.cu} |   2 +-
 ..._float_uint32_dim256_t8_8pq_8subd_half.cu} |   2 +-
 ...float_uint32_dim512_t16_8pq_8subd_half.cu} |   2 +-
 ...ed_float_uint32_dim64_t4_8pq_2subd_half.cu |  41 ++
 ...ed_float_uint32_dim64_t4_8pq_4subd_half.cu |  41 ++
 ..._half_uint32_dim1024_t32_8pq_8subd_half.cu |  41 ++
 ...d_half_uint32_dim128_t4_8pq_8subd_half.cu} |   2 +-
 ...d_half_uint32_dim256_t8_8pq_8subd_half.cu} |   2 +-
 ..._half_uint32_dim512_t16_8pq_8subd_half.cu} |   2 +-
 ...ded_half_uint32_dim64_t4_8pq_2subd_half.cu |  41 ++
 ...ded_half_uint32_dim64_t4_8pq_4subd_half.cu |  41 ++
 ..._int8_uint32_dim1024_t32_8pq_8subd_half.cu |  41 ++
 ...d_int8_uint32_dim128_t4_8pq_8subd_half.cu} |   2 +-
 ...d_int8_uint32_dim256_t8_8pq_8subd_half.cu} |   2 +-
 ..._int8_uint32_dim512_t16_8pq_8subd_half.cu} |   2 +-
 ...ded_int8_uint32_dim64_t4_8pq_2subd_half.cu |  41 ++
 ...ded_int8_uint32_dim64_t4_8pq_4subd_half.cu |  41 ++
 ...uint8_uint32_dim1024_t32_8pq_8subd_half.cu |  41 ++
 ..._uint8_uint32_dim128_t4_8pq_8subd_half.cu} |   2 +-
 ..._uint8_uint32_dim256_t8_8pq_8subd_half.cu} |   2 +-
 ...uint8_uint32_dim512_t16_8pq_8subd_half.cu} |   2 +-
 ...ed_uint8_uint32_dim64_t4_8pq_2subd_half.cu |  41 ++
 ...ed_uint8_uint32_dim64_t4_8pq_4subd_half.cu |  41 ++
 ..._L2Expanded_dim128_t8_uint32_t_uint64_t.cu |  41 ++
 ...L2Expanded_dim256_t16_uint32_t_uint64_t.cu |  41 ++
 ...L2Expanded_dim512_t32_uint32_t_uint64_t.cu |  41 ++
 ...q_L2Expanded_dim64_t4_uint32_t_uint64_t.cu |  41 ++
 33 files changed, 1251 insertions(+), 260 deletions(-)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu} (97%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu} (97%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu} (97%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu} (97%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu} (97%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu} (97%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu} (97%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu} (97%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu} (97%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu} (97%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu} (97%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu} (97%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 23aa794004..3d154917bd 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -235,42 +235,58 @@ if(BUILD_SHARED_LIBS)
     src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu
     src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu
     src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu
+    src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu
+    src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu
+    src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu
     src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu
     src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu
     src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
index 3957abc6b5..45078a91d6 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
@@ -27,6 +27,7 @@
 
 #include "compute_distance_standard.hpp"
 #include "compute_distance_vpq.hpp"
+#include "compute_distance_vrabitq.hpp"
 
 namespace cuvs::neighbors::cagra::detail {
 
@@ -44,6 +45,39 @@ extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
                                                 float,
                                                 uint32_t,
                                                 float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                16,
+                                                256,
+                                                float,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                16,
+                                                256,
+                                                float,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                32,
+                                                512,
+                                                float,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                32,
+                                                512,
+                                                float,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           64,
+                                           8,
+                                           2,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            128,
@@ -54,8 +88,26 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
                                            8,
-                                           128,
+                                           2,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           2,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           64,
                                            8,
                                            4,
                                            half,
@@ -66,35 +118,23 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            128,
                                            8,
-                                           8,
+                                           4,
                                            half,
                                            float,
                                            uint32_t,
                                            float>;
-extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
-                                                16,
-                                                256,
-                                                float,
-                                                uint32_t,
-                                                float>;
-extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
-                                                16,
-                                                256,
-                                                float,
-                                                uint32_t,
-                                                float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            16,
                                            256,
                                            8,
-                                           2,
+                                           4,
                                            half,
                                            float,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                           16,
-                                           256,
+                                           32,
+                                           512,
                                            8,
                                            4,
                                            half,
@@ -102,47 +142,35 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                           16,
-                                           256,
+                                           4,
+                                           128,
                                            8,
                                            8,
                                            half,
                                            float,
                                            uint32_t,
                                            float>;
-extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
-                                                32,
-                                                512,
-                                                float,
-                                                uint32_t,
-                                                float>;
-extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
-                                                32,
-                                                512,
-                                                float,
-                                                uint32_t,
-                                                float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                           32,
-                                           512,
                                            8,
-                                           2,
+                                           256,
+                                           8,
+                                           8,
                                            half,
                                            float,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                           32,
+                                           16,
                                            512,
                                            8,
-                                           4,
+                                           8,
                                            half,
                                            float,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            32,
-                                           512,
+                                           1024,
                                            8,
                                            8,
                                            half,
@@ -161,6 +189,39 @@ extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
                                                 half,
                                                 uint32_t,
                                                 float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                16,
+                                                256,
+                                                half,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                16,
+                                                256,
+                                                half,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                32,
+                                                512,
+                                                half,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                32,
+                                                512,
+                                                half,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           64,
+                                           8,
+                                           2,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            128,
@@ -171,8 +232,26 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
                                            8,
-                                           128,
+                                           2,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           2,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           64,
                                            8,
                                            4,
                                            half,
@@ -183,35 +262,23 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            128,
                                            8,
-                                           8,
+                                           4,
                                            half,
                                            half,
                                            uint32_t,
                                            float>;
-extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
-                                                16,
-                                                256,
-                                                half,
-                                                uint32_t,
-                                                float>;
-extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
-                                                16,
-                                                256,
-                                                half,
-                                                uint32_t,
-                                                float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            16,
                                            256,
                                            8,
-                                           2,
+                                           4,
                                            half,
                                            half,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                           16,
-                                           256,
+                                           32,
+                                           512,
                                            8,
                                            4,
                                            half,
@@ -219,47 +286,35 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                           16,
-                                           256,
+                                           4,
+                                           128,
                                            8,
                                            8,
                                            half,
                                            half,
                                            uint32_t,
                                            float>;
-extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
-                                                32,
-                                                512,
-                                                half,
-                                                uint32_t,
-                                                float>;
-extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
-                                                32,
-                                                512,
-                                                half,
-                                                uint32_t,
-                                                float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                           32,
-                                           512,
                                            8,
-                                           2,
+                                           256,
+                                           8,
+                                           8,
                                            half,
                                            half,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                           32,
+                                           16,
                                            512,
                                            8,
-                                           4,
+                                           8,
                                            half,
                                            half,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            32,
-                                           512,
+                                           1024,
                                            8,
                                            8,
                                            half,
@@ -278,6 +333,39 @@ extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
                                                 int8_t,
                                                 uint32_t,
                                                 float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                16,
+                                                256,
+                                                int8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                16,
+                                                256,
+                                                int8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                32,
+                                                512,
+                                                int8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                32,
+                                                512,
+                                                int8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           64,
+                                           8,
+                                           2,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            128,
@@ -288,8 +376,26 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
                                            8,
-                                           128,
+                                           2,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           2,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           64,
                                            8,
                                            4,
                                            half,
@@ -300,35 +406,23 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            128,
                                            8,
-                                           8,
+                                           4,
                                            half,
                                            int8_t,
                                            uint32_t,
                                            float>;
-extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
-                                                16,
-                                                256,
-                                                int8_t,
-                                                uint32_t,
-                                                float>;
-extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
-                                                16,
-                                                256,
-                                                int8_t,
-                                                uint32_t,
-                                                float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            16,
                                            256,
                                            8,
-                                           2,
+                                           4,
                                            half,
                                            int8_t,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                           16,
-                                           256,
+                                           32,
+                                           512,
                                            8,
                                            4,
                                            half,
@@ -336,47 +430,35 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                           16,
-                                           256,
+                                           4,
+                                           128,
                                            8,
                                            8,
                                            half,
                                            int8_t,
                                            uint32_t,
                                            float>;
-extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
-                                                32,
-                                                512,
-                                                int8_t,
-                                                uint32_t,
-                                                float>;
-extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
-                                                32,
-                                                512,
-                                                int8_t,
-                                                uint32_t,
-                                                float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                           32,
-                                           512,
                                            8,
-                                           2,
+                                           256,
+                                           8,
+                                           8,
                                            half,
                                            int8_t,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                           32,
+                                           16,
                                            512,
                                            8,
-                                           4,
+                                           8,
                                            half,
                                            int8_t,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            32,
-                                           512,
+                                           1024,
                                            8,
                                            8,
                                            half,
@@ -395,6 +477,39 @@ extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
                                                 uint8_t,
                                                 uint32_t,
                                                 float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                16,
+                                                256,
+                                                uint8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                16,
+                                                256,
+                                                uint8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                32,
+                                                512,
+                                                uint8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                32,
+                                                512,
+                                                uint8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           64,
+                                           8,
+                                           2,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            128,
@@ -405,8 +520,26 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           2,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
                                            8,
-                                           128,
+                                           2,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           64,
                                            8,
                                            4,
                                            half,
@@ -417,35 +550,23 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            128,
                                            8,
-                                           8,
+                                           4,
                                            half,
                                            uint8_t,
                                            uint32_t,
                                            float>;
-extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
-                                                16,
-                                                256,
-                                                uint8_t,
-                                                uint32_t,
-                                                float>;
-extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
-                                                16,
-                                                256,
-                                                uint8_t,
-                                                uint32_t,
-                                                float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            16,
                                            256,
                                            8,
-                                           2,
+                                           4,
                                            half,
                                            uint8_t,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                           16,
-                                           256,
+                                           32,
+                                           512,
                                            8,
                                            4,
                                            half,
@@ -453,53 +574,77 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                           16,
-                                           256,
+                                           4,
+                                           128,
                                            8,
                                            8,
                                            half,
                                            uint8_t,
                                            uint32_t,
                                            float>;
-extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
-                                                32,
-                                                512,
-                                                uint8_t,
-                                                uint32_t,
-                                                float>;
-extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
-                                                32,
-                                                512,
-                                                uint8_t,
-                                                uint32_t,
-                                                float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                           32,
-                                           512,
                                            8,
-                                           2,
+                                           256,
+                                           8,
+                                           8,
                                            half,
                                            uint8_t,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                           32,
+                                           16,
                                            512,
                                            8,
-                                           4,
+                                           8,
                                            half,
                                            uint8_t,
                                            uint32_t,
                                            float>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            32,
-                                           512,
+                                           1024,
                                            8,
                                            8,
                                            half,
                                            uint8_t,
                                            uint32_t,
                                            float>;
+extern template struct vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                                               4,
+                                               64,
+                                               float,
+                                               uint32_t,
+                                               uint32_t,
+                                               uint64_t,
+                                               float,
+                                               float>;
+extern template struct vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                                               8,
+                                               128,
+                                               float,
+                                               uint32_t,
+                                               uint32_t,
+                                               uint64_t,
+                                               float,
+                                               float>;
+extern template struct vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                                               16,
+                                               256,
+                                               float,
+                                               uint32_t,
+                                               uint32_t,
+                                               uint64_t,
+                                               float,
+                                               float>;
+extern template struct vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                                               32,
+                                               512,
+                                               float,
+                                               uint32_t,
+                                               uint32_t,
+                                               uint64_t,
+                                               float,
+                                               float>;
 extern template struct standard_descriptor_spec<DistanceType::BitwiseHamming,
                                                 8,
                                                 128,
@@ -522,64 +667,112 @@ extern template struct standard_descriptor_spec<DistanceType::BitwiseHamming,
 extern template struct instance_selector<
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, uint8_t, uint32_t, float>,
+  vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                          4,
+                          64,
+                          float,
+                          uint32_t,
+                          uint32_t,
+                          uint64_t,
+                          float,
+                          float>,
+  vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                          8,
+                          128,
+                          float,
+                          uint32_t,
+                          uint32_t,
+                          uint64_t,
+                          float,
+                          float>,
+  vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                          16,
+                          256,
+                          float,
+                          uint32_t,
+                          uint32_t,
+                          uint64_t,
+                          float,
+                          float>,
+  vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                          32,
+                          512,
+                          float,
+                          uint32_t,
+                          uint32_t,
+                          uint64_t,
+                          float,
+                          float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 32, 512, uint8_t, uint32_t, float>>;
@@ -587,64 +780,112 @@ extern template struct instance_selector<
 using descriptor_instances = instance_selector<
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, uint8_t, uint32_t, float>,
+  vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                          4,
+                          64,
+                          float,
+                          uint32_t,
+                          uint32_t,
+                          uint64_t,
+                          float,
+                          float>,
+  vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                          8,
+                          128,
+                          float,
+                          uint32_t,
+                          uint32_t,
+                          uint64_t,
+                          float,
+                          float>,
+  vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                          16,
+                          256,
+                          float,
+                          uint32_t,
+                          uint32_t,
+                          uint64_t,
+                          float,
+                          float>,
+  vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                          32,
+                          512,
+                          float,
+                          uint32_t,
+                          uint32_t,
+                          uint64_t,
+                          float,
+                          float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 32, 512, uint8_t, uint32_t, float>>;
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu
index cbb6f4540b..d3fc849147 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu
@@ -32,64 +32,112 @@ using namespace cuvs::distance;
 template struct instance_selector<
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 8, half, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 8, half, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 8, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, uint8_t, uint32_t, float>,
+  vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                          4,
+                          64,
+                          float,
+                          uint32_t,
+                          uint32_t,
+                          uint64_t,
+                          float,
+                          float>,
+  vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                          8,
+                          128,
+                          float,
+                          uint32_t,
+                          uint32_t,
+                          uint64_t,
+                          float,
+                          float>,
+  vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                          16,
+                          256,
+                          float,
+                          uint32_t,
+                          uint32_t,
+                          uint64_t,
+                          float,
+                          float>,
+  vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                          32,
+                          512,
+                          float,
+                          uint32_t,
+                          uint32_t,
+                          uint64_t,
+                          float,
+                          float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 32, 512, uint8_t, uint32_t, float>>;
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
index 9834ca696f..0b877f889a 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
@@ -50,6 +50,9 @@
 """
 
 mxdim_team = [(128, 8), (256, 16), (512, 32)]
+vpq_2_4_mxdim_team = [(64, 4), (128, 8), (256, 16), (512, 32)]  # (mxdim, team) pairs for VPQ when pq_len != 8
+vpq_8_mxdim_team = [(128, 4), (256, 8), (512, 16), (1024, 32)]  # (mxdim, team) pairs for VPQ when pq_len == 8
+vrq_mxdim_team = [(64, 4), (128, 8), (256, 16), (512, 32)]  # (mxdim, team) pairs for the CAGRA-RaBitQ loop
 #mxdim_team = [(64, 8), (128, 16), (256, 32)]
 #mxdim_team = [(32, 8), (64, 16), (128, 32)]
 
@@ -98,9 +101,11 @@
                 f.write(template.format(includes=includes, content=content))
                 cmake_list.append(f"  src/neighbors/detail/cagra/{path}")
 
-        # CAGRA-Q
-        for code_book_t in code_book_types:
-            for pq_len in pq_lens:
+    for pq_len in pq_lens:
+        vpq_mxdim_team = vpq_8_mxdim_team if pq_len == 8 else vpq_2_4_mxdim_team  # table is chosen per pq_len
+        for (mxdim, team) in vpq_mxdim_team:
+            # CAGRA-Q: VPQ instances for the current (pq_len, mxdim, team) combination
+            for code_book_t in code_book_types:
                 for pq_bit in pq_bits:
                     for metric in ['L2Expanded']:
                         path = f"compute_distance_vpq_{metric}_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}.cu"
@@ -113,6 +118,24 @@
                             f.write(template.format(includes=includes, content=content))
                             cmake_list.append(f"  src/neighbors/detail/cagra/{path}")
 
+# CAGRA-RaBitQ: one explicit-instantiation file per (mxdim, team, vq_code_t, rabitq_code_t, metric)
+for (mxdim, team) in vrq_mxdim_team:
+    for vq_code_t in ['uint32_t']:
+        for rabitq_code_t in ['uint64_t']:
+            for metric in ['L2Expanded']:
+                data_t = 'float'  # element types are fixed for these instances
+                idx_t = 'uint32_t'
+                distance_t = 'float'
+                path = f"compute_distance_vrabitq_{metric}_dim{mxdim}_t{team}_{vq_code_t}_{rabitq_code_t}.cu"  # data_t is not part of the file name
+                includes = '#include "compute_distance_vrabitq-impl.cuh"'
+                params = f"{metric_prefix}{metric}, {team}, {mxdim}, {data_t}, {idx_t}, {vq_code_t}, {rabitq_code_t}, float, {distance_t}"  # argument before distance_t is hard-coded to float
+                spec = f"vrabitq_descriptor_spec<{params}>"
+                content = f"""template struct {spec};"""
+                specs.append(spec)  # spec string is recorded in addition to being written to its own file
+                with open(path, "w") as f:
+                    f.write(template.format(includes=includes, content=content))
+                    cmake_list.append(f"  src/neighbors/detail/cagra/{path}")  # appended after the file is written
+
 # CAGRA (Binary Hamming distance)
 for (mxdim, team) in mxdim_team:
     metric = 'BitwiseHamming'
@@ -137,6 +160,7 @@
 
 #include "compute_distance_standard.hpp"
 #include "compute_distance_vpq.hpp"
+#include "compute_distance_vrabitq.hpp"
 '''
     newline = "\n"
     contents = f'''
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index cbd8c71b08..7545c25a2b 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -257,7 +257,13 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
     for (std::uint32_t e = 0; e < nelem; e++) {
       const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen;
       if (k >= n_subspace) break;
-      device::ldg_cg(pq_codes[e], reinterpret_cast<const PQ_CODEBOOK_LOAD_T*>(dataset_ptr + 4 + k));
+
+      if constexpr (std::is_same_v<PQ_CODEBOOK_LOAD_T, uint32_t>) {
+        device::ldg_cg(pq_codes[e],
+                       reinterpret_cast<const PQ_CODEBOOK_LOAD_T*>(dataset_ptr + 4 + k));
+      } else {  // load type is not uint32_t: plain dereference instead of device::ldg_cg
+        pq_codes[e] = *reinterpret_cast<const PQ_CODEBOOK_LOAD_T*>(dataset_ptr + 4 + k);
+      }
     }
     //
     if constexpr (PQ_LEN % 2 == 0) {
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu
new file mode 100644
index 0000000000..5c458a281a
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    1024,
+                                    8,
+                                    8,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu
similarity index 97%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu
index 52fb9140fe..d5579a2be0 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu
@@ -29,7 +29,7 @@ namespace cuvs::neighbors::cagra::detail {
 
 using namespace cuvs::distance;
 template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
+                                    4,
                                     128,
                                     8,
                                     8,
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu
similarity index 97%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu
index 41cea6b5b1..ae33cdc65a 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu
@@ -29,7 +29,7 @@ namespace cuvs::neighbors::cagra::detail {
 
 using namespace cuvs::distance;
 template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
+                                    8,
                                     256,
                                     8,
                                     8,
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu
similarity index 97%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu
index b0f650f45a..dcd1d6a074 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu
@@ -29,7 +29,7 @@ namespace cuvs::neighbors::cagra::detail {
 
 using namespace cuvs::distance;
 template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
+                                    16,
                                     512,
                                     8,
                                     8,
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu
new file mode 100644
index 0000000000..740ad40f21
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    2,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu
new file mode 100644
index 0000000000..6a01a5c0d5
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    4,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu
new file mode 100644
index 0000000000..a9766124f8
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    1024,
+                                    8,
+                                    8,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu
similarity index 97%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu
index 82d5d738ab..c5d6c72a6f 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu
@@ -29,7 +29,7 @@ namespace cuvs::neighbors::cagra::detail {
 
 using namespace cuvs::distance;
 template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
+                                    4,
                                     128,
                                     8,
                                     8,
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu
similarity index 97%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu
index a8b1472cff..2114941e3e 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu
@@ -29,7 +29,7 @@ namespace cuvs::neighbors::cagra::detail {
 
 using namespace cuvs::distance;
 template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
+                                    8,
                                     256,
                                     8,
                                     8,
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu
similarity index 97%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu
index dc56563595..0b78982890 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu
@@ -29,7 +29,7 @@ namespace cuvs::neighbors::cagra::detail {
 
 using namespace cuvs::distance;
 template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
+                                    16,
                                     512,
                                     8,
                                     8,
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu
new file mode 100644
index 0000000000..be699ca98a
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    2,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu
new file mode 100644
index 0000000000..36592482e1
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    4,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu
new file mode 100644
index 0000000000..e2d68ae772
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    1024,
+                                    8,
+                                    8,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu
similarity index 97%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu
index 4cb61e18ad..65cdfb0998 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu
@@ -29,7 +29,7 @@ namespace cuvs::neighbors::cagra::detail {
 
 using namespace cuvs::distance;
 template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
+                                    4,
                                     128,
                                     8,
                                     8,
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu
similarity index 97%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu
index 5463f42ab5..bf24d343fc 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu
@@ -29,7 +29,7 @@ namespace cuvs::neighbors::cagra::detail {
 
 using namespace cuvs::distance;
 template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
+                                    8,
                                     256,
                                     8,
                                     8,
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu
similarity index 97%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu
index 64d436115b..6cfa5ede30 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu
@@ -29,7 +29,7 @@ namespace cuvs::neighbors::cagra::detail {
 
 using namespace cuvs::distance;
 template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
+                                    16,
                                     512,
                                     8,
                                     8,
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu
new file mode 100644
index 0000000000..aedfb0ef44
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    2,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu
new file mode 100644
index 0000000000..42a56de4a5
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    4,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu
new file mode 100644
index 0000000000..6217a0047c
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    1024,
+                                    8,
+                                    8,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu
similarity index 97%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu
index eb9bc63041..6d06771052 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu
@@ -29,7 +29,7 @@ namespace cuvs::neighbors::cagra::detail {
 
 using namespace cuvs::distance;
 template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
+                                    4,
                                     128,
                                     8,
                                     8,
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu
similarity index 97%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu
index 3825658388..4da8b5c0d7 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu
@@ -29,7 +29,7 @@ namespace cuvs::neighbors::cagra::detail {
 
 using namespace cuvs::distance;
 template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
+                                    8,
                                     256,
                                     8,
                                     8,
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu
similarity index 97%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu
index f4fc937e25..c1f63841b4 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu
@@ -29,7 +29,7 @@ namespace cuvs::neighbors::cagra::detail {
 
 using namespace cuvs::distance;
 template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
+                                    16,
                                     512,
                                     8,
                                     8,
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu
new file mode 100644
index 0000000000..3d458c0c94
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    2,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu
new file mode 100644
index 0000000000..6f59b47bbe
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    4,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu
new file mode 100644
index 0000000000..10ddfc0163
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vrabitq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                                        8,
+                                        128,
+                                        float,
+                                        uint32_t,
+                                        uint32_t,
+                                        uint64_t,
+                                        float,
+                                        float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu
new file mode 100644
index 0000000000..e057457a6a
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vrabitq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                                        16,
+                                        256,
+                                        float,
+                                        uint32_t,
+                                        uint32_t,
+                                        uint64_t,
+                                        float,
+                                        float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu
new file mode 100644
index 0000000000..c30bd76785
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vrabitq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                                        32,
+                                        512,
+                                        float,
+                                        uint32_t,
+                                        uint32_t,
+                                        uint64_t,
+                                        float,
+                                        float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu
new file mode 100644
index 0000000000..472dd9821f
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vrabitq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vrabitq_descriptor_spec<DistanceType::L2Expanded,
+                                        4,
+                                        64,
+                                        float,
+                                        uint32_t,
+                                        uint32_t,
+                                        uint64_t,
+                                        float,
+                                        float>;
+
+}  // namespace cuvs::neighbors::cagra::detail

From e0f629cb82e157d8c0fbdfd13cf57bac6c4528ef Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Tue, 21 Oct 2025 17:11:20 +0900
Subject: [PATCH 006/119] Add fp_8bit4

---
 cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
index 1b098ac5c1..2806e88646 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -111,4 +111,10 @@ struct fp_8bit {
   }
 };
 
+template <uint32_t ExpBits, bool Signed>
+struct fp_8bit4 {
+  fp_8bit<ExpBits, Signed> x, y, z, w;
+  HDI fp_8bit4() : x(0), y(0), z(0), w(0) {}
+};
+
 }  // namespace cuvs::neighbors::ivf_pq::detail

From 0da1aa2bcf2357fb0e6f4470552bdeaaa22a69c3 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Tue, 21 Oct 2025 17:40:03 +0900
Subject: [PATCH 007/119] Fix compilation error

---
 cpp/CMakeLists.txt                            |  28 ++---
 .../detail/cagra/compute_distance-ext.cuh     | 109 ------------------
 .../detail/cagra/compute_distance.cu          |  36 ------
 .../cagra/compute_distance_00_generate.py     |  19 ---
 ...d_CosineExpanded_float_uint32_dim128_t8.cu |   2 +-
 ..._CosineExpanded_float_uint32_dim256_t16.cu |   2 +-
 ..._CosineExpanded_float_uint32_dim512_t32.cu |   2 +-
 ...rd_CosineExpanded_half_uint32_dim128_t8.cu |   2 +-
 ...d_CosineExpanded_half_uint32_dim256_t16.cu |   2 +-
 ...d_CosineExpanded_half_uint32_dim512_t32.cu |   2 +-
 ...rd_CosineExpanded_int8_uint32_dim128_t8.cu |   2 +-
 ...d_CosineExpanded_int8_uint32_dim256_t16.cu |   2 +-
 ...d_CosineExpanded_int8_uint32_dim512_t32.cu |   2 +-
 ...d_CosineExpanded_uint8_uint32_dim128_t8.cu |   2 +-
 ..._CosineExpanded_uint8_uint32_dim256_t16.cu |   2 +-
 ..._CosineExpanded_uint8_uint32_dim512_t32.cu |   2 +-
 16 files changed, 24 insertions(+), 192 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f4831ef1c7..6b4397307c 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -224,12 +224,21 @@ if(NOT BUILD_CPU_ONLY)
     src/neighbors/detail/cagra/compute_distance_standard_BitwiseHamming_u8_uint32_dim128_t8.cu
     src/neighbors/detail/cagra/compute_distance_standard_BitwiseHamming_u8_uint32_dim256_t16.cu
     src/neighbors/detail/cagra/compute_distance_standard_BitwiseHamming_u8_uint32_dim512_t32.cu
-    src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu
-    src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu
-    src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu
     src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim128_t8.cu
     src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim256_t16.cu
     src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim512_t32.cu
+    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim128_t8.cu
+    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim256_t16.cu
+    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim512_t32.cu
+    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim128_t8.cu
+    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim256_t16.cu
+    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim512_t32.cu
+    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim128_t8.cu
+    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim256_t16.cu
+    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim512_t32.cu
+    src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu
+    src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu
+    src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu
     src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu
     src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu
     src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu
@@ -245,15 +254,6 @@ if(NOT BUILD_CPU_ONLY)
     src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu
     src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu
     src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu
-    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim128_t8.cu
-    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim256_t16.cu
-    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim512_t32.cu
-    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim128_t8.cu
-    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim256_t16.cu
-    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim512_t32.cu
-    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim128_t8.cu
-    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim256_t16.cu
-    src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim512_t32.cu
     src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu
     src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu
     src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu
@@ -308,10 +308,6 @@ if(NOT BUILD_CPU_ONLY)
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu
     src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu
-    src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu
-    src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu
-    src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu
     src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu
     src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu
     src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
index 093bb92730..ce97558f67 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
@@ -27,7 +27,6 @@
 
 #include "compute_distance_standard.hpp"
 #include "compute_distance_vpq.hpp"
-#include "compute_distance_vrabitq.hpp"
 
 namespace cuvs::neighbors::cagra::detail {
 
@@ -681,42 +680,6 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            uint8_t,
                                            uint32_t,
                                            float>;
-extern template struct vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                                               4,
-                                               64,
-                                               float,
-                                               uint32_t,
-                                               uint32_t,
-                                               uint64_t,
-                                               float,
-                                               float>;
-extern template struct vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                                               8,
-                                               128,
-                                               float,
-                                               uint32_t,
-                                               uint32_t,
-                                               uint64_t,
-                                               float,
-                                               float>;
-extern template struct vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                                               16,
-                                               256,
-                                               float,
-                                               uint32_t,
-                                               uint32_t,
-                                               uint64_t,
-                                               float,
-                                               float>;
-extern template struct vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                                               32,
-                                               512,
-                                               float,
-                                               uint32_t,
-                                               uint32_t,
-                                               uint64_t,
-                                               float,
-                                               float>;
 extern template struct standard_descriptor_spec<DistanceType::BitwiseHamming,
                                                 8,
                                                 128,
@@ -821,42 +784,6 @@ extern template struct instance_selector<
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, uint8_t, uint32_t, float>,
-  vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                          4,
-                          64,
-                          float,
-                          uint32_t,
-                          uint32_t,
-                          uint64_t,
-                          float,
-                          float>,
-  vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                          8,
-                          128,
-                          float,
-                          uint32_t,
-                          uint32_t,
-                          uint64_t,
-                          float,
-                          float>,
-  vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                          16,
-                          256,
-                          float,
-                          uint32_t,
-                          uint32_t,
-                          uint64_t,
-                          float,
-                          float>,
-  vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                          32,
-                          512,
-                          float,
-                          uint32_t,
-                          uint32_t,
-                          uint64_t,
-                          float,
-                          float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 32, 512, uint8_t, uint32_t, float>>;
@@ -946,42 +873,6 @@ using descriptor_instances = instance_selector<
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, uint8_t, uint32_t, float>,
-  vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                          4,
-                          64,
-                          float,
-                          uint32_t,
-                          uint32_t,
-                          uint64_t,
-                          float,
-                          float>,
-  vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                          8,
-                          128,
-                          float,
-                          uint32_t,
-                          uint32_t,
-                          uint64_t,
-                          float,
-                          float>,
-  vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                          16,
-                          256,
-                          float,
-                          uint32_t,
-                          uint32_t,
-                          uint64_t,
-                          float,
-                          float>,
-  vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                          32,
-                          512,
-                          float,
-                          uint32_t,
-                          uint32_t,
-                          uint64_t,
-                          float,
-                          float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 32, 512, uint8_t, uint32_t, float>>;
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu
index ab39670bc8..13fdb7b832 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu
@@ -114,42 +114,6 @@ template struct instance_selector<
   vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, uint8_t, uint32_t, float>,
   vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, uint8_t, uint32_t, float>,
-  vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                          4,
-                          64,
-                          float,
-                          uint32_t,
-                          uint32_t,
-                          uint64_t,
-                          float,
-                          float>,
-  vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                          8,
-                          128,
-                          float,
-                          uint32_t,
-                          uint32_t,
-                          uint64_t,
-                          float,
-                          float>,
-  vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                          16,
-                          256,
-                          float,
-                          uint32_t,
-                          uint32_t,
-                          uint64_t,
-                          float,
-                          float>,
-  vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                          32,
-                          512,
-                          float,
-                          uint32_t,
-                          uint32_t,
-                          uint64_t,
-                          float,
-                          float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 32, 512, uint8_t, uint32_t, float>>;
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
index 440c04cffa..095b4bb36b 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
@@ -119,24 +119,6 @@
                             f.write(template.format(includes=includes, content=content))
                             cmake_list.append(f"  src/neighbors/detail/cagra/{path}")
 
-# CAGRA-RaBitQ
-for (mxdim, team) in vrq_mxdim_team:
-    for vq_code_t in ['uint32_t']:
-        for rabitq_code_t in ['uint64_t']:
-            for metric in ['L2Expanded']:
-                data_t = 'float'
-                idx_t = 'uint32_t'
-                distance_t = 'float'
-                path = f"compute_distance_vrabitq_{metric}_dim{mxdim}_t{team}_{vq_code_t}_{rabitq_code_t}.cu"
-                includes = '#include "compute_distance_vrabitq-impl.cuh"'
-                params = f"{metric_prefix}{metric}, {team}, {mxdim}, {data_t}, {idx_t}, {vq_code_t}, {rabitq_code_t}, float, {distance_t}"
-                spec = f"vrabitq_descriptor_spec<{params}>"
-                content = f"""template struct {spec};"""
-                specs.append(spec)
-                with open(path, "w") as f:
-                    f.write(template.format(includes=includes, content=content))
-                    cmake_list.append(f"  src/neighbors/detail/cagra/{path}")
-
 # CAGRA (Binary Hamming distance)
 for (mxdim, team) in mxdim_team:
     metric = 'BitwiseHamming'
@@ -161,7 +143,6 @@
 
 #include "compute_distance_standard.hpp"
 #include "compute_distance_vpq.hpp"
-#include "compute_distance_vrabitq.hpp"
 '''
     newline = "\n"
     contents = f'''
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim128_t8.cu
index c34298040c..c5d3579d25 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim128_t8.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim256_t16.cu
index a8b4c726c2..c1f000700d 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim256_t16.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim512_t32.cu
index 6c8a090093..ea51d0af5c 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim512_t32.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim128_t8.cu
index cfc13a2e5d..c8bc8d46e3 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim128_t8.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim256_t16.cu
index fc6e5084b2..faa3e5d765 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim256_t16.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim512_t32.cu
index bad82cbb4e..1a1eb89630 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim512_t32.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim128_t8.cu
index 4babd36f31..104059b9ed 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim128_t8.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim256_t16.cu
index 4593798241..4e057ed800 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim256_t16.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim512_t32.cu
index 987d7e8f26..5b76a3c17b 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim512_t32.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim128_t8.cu
index 02f86c8d74..82d7c39886 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim128_t8.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim256_t16.cu
index 8c01303faa..2a5349bcf5 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim256_t16.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim512_t32.cu
index d39fe01d7c..7951da6233 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim512_t32.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 62ba0adbacb6f4e7eb62da3709b86a4dd61880aa Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Tue, 21 Oct 2025 17:40:36 +0900
Subject: [PATCH 008/119] Add as_u32

---
 cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
index 2806e88646..4395c46408 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
@@ -115,6 +115,9 @@ template <uint32_t ExpBits, bool Signed>
 struct fp_8bit4 {
   fp_8bit<ExpBits, Signed> x, y, z, w;
   HDI fp_8bit4() : x(0), y(0), z(0), w(0) {}
+
+  HDI uint32_t& as_u32() { return *reinterpret_cast<uint32_t*>(this); }
+  HDI uint32_t as_u32() const { return *reinterpret_cast<const uint32_t*>(this); }
 };
 
 }  // namespace cuvs::neighbors::ivf_pq::detail

From 7f9f614355256a88561f4a4da38538d8671b0e7b Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Tue, 21 Oct 2025 23:36:45 +0900
Subject: [PATCH 009/119] Parameterize VPQ codebook handling by packed PQ value type

---
 .../cagra/compute_distance_vpq-impl.cuh       | 52 ++++++++++++-------
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 7545c25a2b..abd9146144 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -26,6 +26,11 @@
 
 namespace cuvs::neighbors::cagra::detail {
 
+using pq_val_t                              = half;
+using pq_val_pack_t                         = half2;
+using pq_val_pack_uint_t                    = uint32_t;
+constexpr uint32_t pq_val_pack_num_elements = 2;
+
 template <cuvs::distance::DistanceType Metric,
           uint32_t TeamSize,
           uint32_t DatasetBlockDim,
@@ -96,7 +101,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
   }
 
   static constexpr std::uint32_t kSMemCodeBookSizeInBytes =
-    (1 << PQ_BITS) * PQ_LEN * utils::size_of<CODE_BOOK_T>();
+    (1 << PQ_BITS) * PQ_LEN * utils::size_of<pq_val_pack_uint_t>() / pq_val_pack_num_elements;
 
   _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(setup_workspace_type* setup_workspace_impl,
                                                  compute_distance_type* compute_distance_impl,
@@ -178,19 +183,22 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
 
     // Copy PQ table
     for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) {
-      half2 buf2;
-      buf2.x = r->pq_code_book_ptr()[i];
-      buf2.y = r->pq_code_book_ptr()[i + 1];
-
       // Change the order of PQ code book array to reduce the
       // frequency of bank conflicts.
-      constexpr auto num_elements_per_bank  = 4 / utils::size_of<CODE_BOOK_T>();
+      constexpr auto num_elements_per_bank =
+        pq_val_pack_num_elements /
+        (utils::size_of<pq_val_pack_uint_t>() / utils::size_of<uint32_t>());
       constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank;
       const auto j                          = i / num_elements_per_bank;
       const auto smem_index =
         (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
 
-      device::sts(codebook_buf + smem_index * sizeof(half2), buf2);
+      if constexpr (std::is_same_v<pq_val_t, half>) {
+        half2 buf2;
+        buf2.x = r->pq_code_book_ptr()[i];
+        buf2.y = r->pq_code_book_ptr()[i + 1];
+        device::sts(codebook_buf + smem_index * sizeof(half2), buf2);
+      }
     }
   }
 
@@ -286,22 +294,28 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
         for (std::uint32_t v = 0; v < vlen; v++) {
           if (PQ_LEN * (v + k) >= dim) break;
 #pragma unroll
-          for (std::uint32_t m = 0; m < PQ_LEN / 2; m++) {
+          for (std::uint32_t m = 0; m < PQ_LEN / pq_val_pack_num_elements; m++) {
             constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN);
-            const std::uint32_t d1     = m + (PQ_LEN / 2) * v;
-            const std::uint32_t d =
-              d1 * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
-            half2 q2, c2;
-            // Loading query vector from smem
-            device::lds(q2, query_ptr + sizeof(half2) * d);
+            const std::uint32_t d1     = m + (PQ_LEN / pq_val_pack_num_elements) * v;
+            const std::uint32_t d      = d1 * kQueryBlock +
+                                    elem_offset * (PQ_LEN / pq_val_pack_num_elements) +
+                                    e * TeamSize + laneId;
+            half2 q2;
+            pq_val_pack_t c2;
             // Loading PQ code book from smem
             device::lds(c2,
                         pq_codebook_ptr +
-                          sizeof(CODE_BOOK_T) * ((1 << PQ_BITS) * 2 * m + (2 * (pq_code & 0xff))));
-            // L2 distance
-            auto dist = q2 - c2 - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
-            dist      = dist * dist;
-            norm += static_cast<DISTANCE_T>(dist.x + dist.y);
+                          sizeof(pq_val_pack_uint_t) * ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
+
+            if constexpr (std::is_same_v<pq_val_t, half>) {
+              // Loading query vector from smem
+              device::lds(q2, query_ptr + sizeof(half2) * d);
+              // L2 distance
+              auto dist = q2 - c2 - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
+              dist      = dist * dist;
+              norm += static_cast<DISTANCE_T>(dist.x + dist.y);
+            } else {
+            }
           }
           pq_code >>= 8;
         }

From 058abbb6e56a016efe21d32132a55227605cdd8a Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 22 Oct 2025 01:38:39 +0900
Subject: [PATCH 010/119] Fix fp_8bit4 constructor

---
 cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
index 4395c46408..359b638a41 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
@@ -114,7 +114,7 @@ struct fp_8bit {
 template <uint32_t ExpBits, bool Signed>
 struct fp_8bit4 {
   fp_8bit<ExpBits, Signed> x, y, z, w;
-  HDI fp_8bit4() : x(0), y(0), z(0), w(0) {}
+  HDI fp_8bit4() : x(0.f), y(0.f), z(0.f), w(0.f) {}
 
   HDI uint32_t& as_u32() { return *reinterpret_cast<uint32_t*>(this); }
   HDI uint32_t as_u32() const { return *reinterpret_cast<const uint32_t*>(this); }

From e64f8c67af9c8c975fa611a48d986810eca61cfd Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 22 Oct 2025 01:39:09 +0900
Subject: [PATCH 011/119] Add sts for u32

---
 cpp/src/neighbors/detail/cagra/device_common.hpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp
index 882928add0..ed583d6f5a 100644
--- a/cpp/src/neighbors/detail/cagra/device_common.hpp
+++ b/cpp/src/neighbors/detail/cagra/device_common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -310,6 +310,11 @@ RAFT_DEVICE_INLINE_FUNCTION void lds(uint4& x, const uint4* addr)
   lds(x, uint32_t(__cvta_generic_to_shared(addr)));
 }
 
+RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const uint32_t& x)
+{
+  asm volatile("st.shared.u32 [%0], %1;" : : "r"(addr), "r"(reinterpret_cast<const uint32_t&>(x)));
+}
+
 RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const half2& x)
 {
   asm volatile("st.shared.v2.u16 [%0], {%1, %2};"

From 77492dd7439374fc08841538a0347ed7c4beb245 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 22 Oct 2025 01:40:08 +0900
Subject: [PATCH 012/119] Add 8-bit float PQ codebook path to VPQ distance computation

---
 .../cagra/compute_distance_vpq-impl.cuh       | 143 ++++++++++++------
 1 file changed, 93 insertions(+), 50 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index abd9146144..f5e66830ae 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include "../../ivf_pq/ivf_pq_fp_8bit.cuh"
 #include "compute_distance_vpq.hpp"
 
 #include <cuvs/distance/distance.hpp>
@@ -26,10 +27,10 @@
 
 namespace cuvs::neighbors::cagra::detail {
 
-using pq_val_t                              = half;
-using pq_val_pack_t                         = half2;
+using pq_val_t                              = ivf_pq::detail::fp_8bit<5, true>;
+using pq_val_pack_t                         = ivf_pq::detail::fp_8bit4<5, true>;
 using pq_val_pack_uint_t                    = uint32_t;
-constexpr uint32_t pq_val_pack_num_elements = 2;
+constexpr uint32_t pq_val_pack_num_elements = 4;
 
 template <cuvs::distance::DistanceType Metric,
           uint32_t TeamSize,
@@ -188,16 +189,26 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
       constexpr auto num_elements_per_bank =
         pq_val_pack_num_elements /
         (utils::size_of<pq_val_pack_uint_t>() / utils::size_of<uint32_t>());
-      constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank;
-      const auto j                          = i / num_elements_per_bank;
-      const auto smem_index =
-        (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
-
-      if constexpr (std::is_same_v<pq_val_t, half>) {
-        half2 buf2;
-        buf2.x = r->pq_code_book_ptr()[i];
-        buf2.y = r->pq_code_book_ptr()[i + 1];
-        device::sts(codebook_buf + smem_index * sizeof(half2), buf2);
+
+      if constexpr (PQ_LEN > num_elements_per_bank) {  // safety
+        constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank;
+        const auto j                          = i / num_elements_per_bank;
+        const auto smem_index =
+          (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
+
+        if constexpr (std::is_same_v<pq_val_t, half>) {
+          pq_val_pack_t buf2;
+          buf2.x = r->pq_code_book_ptr()[i];
+          buf2.y = r->pq_code_book_ptr()[i + 1];
+          device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2);
+        } else {
+          pq_val_pack_t buf4;
+          buf4.x = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
+          buf4.y = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
+          buf4.z = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
+          buf4.w = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
+          device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf4.as_u32());
+        }
       }
     }
   }
@@ -275,49 +286,81 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
     }
     //
     if constexpr (PQ_LEN % 2 == 0) {
-      // **** Use half2 for distance computation ****
+      if constexpr (PQ_LEN >= pq_val_pack_num_elements) {  // safety
+        // **** Use half2 for distance computation ****
 #pragma unroll
-      for (std::uint32_t e = 0; e < nelem; e++) {
-        const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen;
-        if (k >= n_subspace) break;
-        // Loading VQ code-book
-        half2 vq_vals[PQ_LEN][vlen / 2];
+        for (std::uint32_t e = 0; e < nelem; e++) {
+          const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen;
+          if (k >= n_subspace) break;
+          // Loading VQ code-book
+          half2 vq_vals[PQ_LEN][vlen / 2];
 #pragma unroll
-        for (std::uint32_t m = 0; m < PQ_LEN; m++) {
-          const uint32_t d = (vlen * m) + (PQ_LEN * k);
-          if (d >= dim) break;
-          device::ldg_ca(vq_vals[m], vq_code_book_ptr + d);
-        }
-        // Compute distance
-        PQ_CODEBOOK_LOAD_T pq_code = pq_codes[e];
+          for (std::uint32_t m = 0; m < PQ_LEN; m++) {
+            const uint32_t d = (vlen * m) + (PQ_LEN * k);
+            if (d >= dim) break;
+            device::ldg_ca(vq_vals[m], vq_code_book_ptr + d);
+          }
+          // Compute distance
+          PQ_CODEBOOK_LOAD_T pq_code = pq_codes[e];
 #pragma unroll
-        for (std::uint32_t v = 0; v < vlen; v++) {
-          if (PQ_LEN * (v + k) >= dim) break;
+          for (std::uint32_t v = 0; v < vlen; v++) {
+            if (PQ_LEN * (v + k) >= dim) break;
 #pragma unroll
-          for (std::uint32_t m = 0; m < PQ_LEN / pq_val_pack_num_elements; m++) {
-            constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN);
-            const std::uint32_t d1     = m + (PQ_LEN / pq_val_pack_num_elements) * v;
-            const std::uint32_t d      = d1 * kQueryBlock +
-                                    elem_offset * (PQ_LEN / pq_val_pack_num_elements) +
-                                    e * TeamSize + laneId;
-            half2 q2;
-            pq_val_pack_t c2;
-            // Loading PQ code book from smem
-            device::lds(c2,
-                        pq_codebook_ptr +
-                          sizeof(pq_val_pack_uint_t) * ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
-
-            if constexpr (std::is_same_v<pq_val_t, half>) {
-              // Loading query vector from smem
-              device::lds(q2, query_ptr + sizeof(half2) * d);
-              // L2 distance
-              auto dist = q2 - c2 - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
-              dist      = dist * dist;
-              norm += static_cast<DISTANCE_T>(dist.x + dist.y);
-            } else {
+            for (std::uint32_t m = 0; m < PQ_LEN / pq_val_pack_num_elements; m++) {
+              constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN);
+              std::uint32_t d1 =
+                m * (pq_val_pack_num_elements / 2) + (PQ_LEN / pq_val_pack_num_elements) * v;
+              std::uint32_t d = d1 * kQueryBlock +
+                                elem_offset * (PQ_LEN / pq_val_pack_num_elements) + e * TeamSize +
+                                laneId;
+              half2 q2;
+              // if constexpr (false) {
+              if constexpr (std::is_same_v<pq_val_t, half>) {
+                half2 c2;
+                // Loading PQ code book from smem
+                device::lds(c2,
+                            pq_codebook_ptr + sizeof(pq_val_pack_uint_t) *
+                                                ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
+
+                // Loading query vector from smem
+                device::lds(q2, query_ptr + sizeof(half2) * d);
+                half2 c2_ = c2;
+                // L2 distance
+                auto dist = q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
+                dist      = dist * dist;
+                norm += static_cast<DISTANCE_T>(dist.x + dist.y);
+              } else {
+                pq_val_pack_t c2;
+                // Loading PQ code book from smem
+                device::lds(c2.as_u32(),
+                            pq_codebook_ptr + sizeof(pq_val_pack_uint_t) *
+                                                ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
+
+                half2 c2_;
+
+                // Loading query vector from smem
+                device::lds(q2, query_ptr + sizeof(half2) * d);
+                c2_.x = static_cast<half>(c2.x);
+                c2_.y = static_cast<half>(c2.y);
+                // L2 distance
+                auto dist = q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
+                dist      = dist * dist;
+                norm += static_cast<DISTANCE_T>(dist.x + dist.y);
+
+                d1 += 1;
+                d += kQueryBlock;
+
+                device::lds(q2, query_ptr + sizeof(half2) * d);
+                c2_.x = static_cast<half>(c2.z);
+                c2_.y = static_cast<half>(c2.w);
+                // L2 distance
+                dist = q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
+                dist = dist * dist;
+                norm += static_cast<DISTANCE_T>(dist.x + dist.y);
+              }
             }
+            pq_code >>= 8;
           }
-          pq_code >>= 8;
         }
       }
     } else {

From 4638eb28c716be8922accc1cb14ae18f0db8fab0 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 22 Oct 2025 11:00:39 +0900
Subject: [PATCH 013/119] Fix VPQ codebook copy stride and half2 indexing for packed PQ values

---
 .../detail/cagra/compute_distance_vpq-impl.cuh | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index f5e66830ae..088af6c01b 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -27,10 +27,17 @@
 
 namespace cuvs::neighbors::cagra::detail {
 
+#if 1
 using pq_val_t                              = ivf_pq::detail::fp_8bit<5, true>;
 using pq_val_pack_t                         = ivf_pq::detail::fp_8bit4<5, true>;
 using pq_val_pack_uint_t                    = uint32_t;
 constexpr uint32_t pq_val_pack_num_elements = 4;
+#else
+using pq_val_t                              = half;
+using pq_val_pack_t                         = half2;
+using pq_val_pack_uint_t                    = uint32_t;
+constexpr uint32_t pq_val_pack_num_elements = 2;
+#endif
 
 template <cuvs::distance::DistanceType Metric,
           uint32_t TeamSize,
@@ -183,7 +190,8 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
     __syncthreads();
 
     // Copy PQ table
-    for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) {
+    for (unsigned i = threadIdx.x * pq_val_pack_num_elements; i < (1 << PQ_BITS) * PQ_LEN;
+         i += blockDim.x * pq_val_pack_num_elements) {
       // Change the order of PQ code book array to reduce the
       // frequency of bank conflicts.
       constexpr auto num_elements_per_bank =
@@ -308,11 +316,9 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
 #pragma unroll
             for (std::uint32_t m = 0; m < PQ_LEN / pq_val_pack_num_elements; m++) {
               constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN);
-              std::uint32_t d1 =
-                m * (pq_val_pack_num_elements / 2) + (PQ_LEN / pq_val_pack_num_elements) * v;
-              std::uint32_t d = d1 * kQueryBlock +
-                                elem_offset * (PQ_LEN / pq_val_pack_num_elements) + e * TeamSize +
-                                laneId;
+              std::uint32_t d1           = m * (pq_val_pack_num_elements / 2) + (PQ_LEN / 2) * v;
+              std::uint32_t d =
+                d1 * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
               half2 q2;
               // if constexpr (false) {
               if constexpr (std::is_same_v<pq_val_t, half>) {

From a64f2642e941458704497e07e11e32ead919e33a Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 22 Oct 2025 13:01:16 +0900
Subject: [PATCH 014/119] Add native f8 support

---
 .../detail/cagra/compute_distance_vpq-impl.cuh   |  4 ++--
 cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh      | 16 ++++++++++++++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 088af6c01b..21078e73da 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -28,8 +28,8 @@
 namespace cuvs::neighbors::cagra::detail {
 
 #if 1
-using pq_val_t                              = ivf_pq::detail::fp_8bit<5, true>;
-using pq_val_pack_t                         = ivf_pq::detail::fp_8bit4<5, true>;
+using pq_val_pack_t                         = ivf_pq::detail::fp_8bit4<5, true, false>;
+using pq_val_t                              = typename pq_val_pack_t::unit_t;
 using pq_val_pack_uint_t                    = uint32_t;
 constexpr uint32_t pq_val_pack_num_elements = 4;
 #else
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
index 359b638a41..c4e1c08c01 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
@@ -32,6 +32,7 @@
 
 #include <cub/cub.cuh>
 #include <cuda_fp16.h>
+#include <cuda_fp8.h>
 
 namespace cuvs::neighbors::ivf_pq::detail {
 
@@ -111,9 +112,20 @@ struct fp_8bit {
   }
 };
 
-template <uint32_t ExpBits, bool Signed>
+template <uint32_t ExpBits, bool Signed, bool SW_Emulation = true>
 struct fp_8bit4 {
-  fp_8bit<ExpBits, Signed> x, y, z, w;
+  using unit_t = fp_8bit<ExpBits, Signed>;
+  unit_t x, y, z, w;
+  HDI fp_8bit4() : x(0.f), y(0.f), z(0.f), w(0.f) {}
+
+  HDI uint32_t& as_u32() { return *reinterpret_cast<uint32_t*>(this); }
+  HDI uint32_t as_u32() const { return *reinterpret_cast<const uint32_t*>(this); }
+};
+
+template <>
+struct fp_8bit4<5, true, false> {
+  using unit_t = __nv_fp8_e5m2;
+  unit_t x, y, z, w;
   HDI fp_8bit4() : x(0.f), y(0.f), z(0.f), w(0.f) {}
 
   HDI uint32_t& as_u32() { return *reinterpret_cast<uint32_t*>(this); }

From 60ba5e997397c010e7e6d3fc6cc25081a75f3c9b Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 22 Oct 2025 15:47:27 +0900
Subject: [PATCH 015/119] Fix VPQ init

---
 cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 21078e73da..afb9d0bbcc 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -198,7 +198,7 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
         pq_val_pack_num_elements /
         (utils::size_of<pq_val_pack_uint_t>() / utils::size_of<uint32_t>());
 
-      if constexpr (PQ_LEN > num_elements_per_bank) {  // safety
+      if constexpr (PQ_LEN >= num_elements_per_bank) {  // safety
         constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank;
         const auto j                          = i / num_elements_per_bank;
         const auto smem_index =

From f37a13160838fb9c26e0029fa9b2f759a187cd37 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 27 Aug 2025 01:14:46 +0900
Subject: [PATCH 016/119] Update clock measure

---
 .../neighbors/detail/cagra/device_common.hpp  |   11 +
 .../cagra/search_multi_cta_kernel-inl.cuh     | 1156 +++++++++--------
 .../cagra/search_single_cta_kernel-inl.cuh    |   30 +-
 3 files changed, 610 insertions(+), 587 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp
index ed583d6f5a..9715a0473f 100644
--- a/cpp/src/neighbors/detail/cagra/device_common.hpp
+++ b/cpp/src/neighbors/detail/cagra/device_common.hpp
@@ -15,6 +15,8 @@
  */
 #pragma once
 
+// #define _CLK_BREAKDOWN
+
 #include "hashmap.hpp"
 #include "utils.hpp"
 
@@ -186,6 +188,9 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes(
   const IndexT* __restrict__ parent_indices,
   const IndexT* __restrict__ internal_topk_list,
   const uint32_t search_width,
+#ifdef _CLK_BREAKDOWN
+  std::uint64_t& clk_compute_actual_distance,
+#endif
   int* __restrict__ result_position = nullptr,
   const int max_result_position     = 0)
 {
@@ -238,11 +243,17 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes(
     // > const auto child_dist = dataset_desc.compute_distance(child_id, child_id != invalid_index);
     // Instead, we manually inline this function for performance reasons.
     // This allows us to move the fetching of the arguments from shared memory out of the loop.
+#ifdef _CLK_BREAKDOWN
+    const auto start_clock = clock64();
+#endif
     const DistanceT child_dist = device::team_sum(
       (child_id != invalid_index) ? compute_distance(args, child_id)
                                   : (lead_lane ? raft::upper_bound<DistanceT>() : 0),
       team_size_bits);
     __syncwarp();
+#ifdef _CLK_BREAKDOWN
+    clk_compute_actual_distance += clock64() - start_clock;
+#endif
 
     // Store the distance
     if (valid_i && lead_lane) { result_child_distances_ptr[j] = child_dist; }
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
index ea738b137b..7860a32eba 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
@@ -1,575 +1,581 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include "search_multi_cta_kernel.cuh"
-
-#include "bitonic.hpp"
-#include "compute_distance-ext.cuh"
-#include "device_common.hpp"
-#include "hashmap.hpp"
-#include "search_plan.cuh"
-#include "topk_for_cagra/topk.h"  // TODO replace with raft topk if possible
-#include "utils.hpp"
-
-#include <raft/core/device_mdspan.hpp>
-#include <raft/core/logger.hpp>
-#include <raft/core/resource/cuda_stream.hpp>
-#include <raft/core/resource/device_properties.hpp>
-#include <raft/core/resources.hpp>
-
-#include <cuvs/distance/distance.hpp>
-
-#include <cuvs/neighbors/common.hpp>
-
-// TODO: This shouldn't be invoking anything from spatial/knn
-#include "../ann_utils.cuh"
-
-#include <raft/util/cuda_rt_essentials.hpp>
-#include <raft/util/cudart_utils.hpp>  // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp
-
-#include <algorithm>
-#include <cassert>
-#include <cstdio>
-#include <iostream>
-#include <memory>
-#include <numeric>
-#include <vector>
-
-namespace cuvs::neighbors::cagra::detail {
-namespace multi_cta_search {
-
-// #define _CLK_BREAKDOWN
-
-template <class INDEX_T, class DISTANCE_T>
-RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parent(
-  INDEX_T* const next_parent_indices,
-  INDEX_T* const itopk_indices,       // [itopk_size * 2]
-  DISTANCE_T* const itopk_distances,  // [itopk_size * 2]
-  INDEX_T* const hash_ptr,
-  const uint32_t hash_bitlen)
-{
-  constexpr uint32_t itopk_size      = 32;
-  constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
-  constexpr INDEX_T invalid_index    = ~static_cast<INDEX_T>(0);
-
-  const unsigned warp_id = threadIdx.x / 32;
-  if (warp_id > 0) { return; }
-  if (threadIdx.x == 0) { next_parent_indices[0] = invalid_index; }
-  __syncwarp();
-
-  int j = -1;
-  for (unsigned i = threadIdx.x; i < itopk_size * 2; i += 32) {
-    INDEX_T index    = itopk_indices[i];
-    int is_invalid   = 0;
-    int is_candidate = 0;
-    if (index == invalid_index) {
-      is_invalid = 1;
-    } else if (index & index_msb_1_mask) {
-    } else {
-      is_candidate = 1;
-    }
-
-    const auto ballot_mask  = __ballot_sync(0xffffffff, is_candidate);
-    const auto candidate_id = __popc(ballot_mask & ((1 << threadIdx.x) - 1));
-    for (int k = 0; k < __popc(ballot_mask); k++) {
-      int flag_done = 0;
-      if (is_candidate && candidate_id == k) {
-        is_candidate = 0;
-        if (hashmap::insert<INDEX_T, 1>(hash_ptr, hash_bitlen, index)) {
-          // Use this candidate as next parent
-          index |= index_msb_1_mask;  // set most significant bit as used node
-          if (i < itopk_size) {
-            next_parent_indices[0] = i;
-            itopk_indices[i]       = index;
-          } else {
-            next_parent_indices[0] = j;
-            // Move the next parent node from i-th position to j-th position
-            itopk_indices[j]   = index;
-            itopk_distances[j] = itopk_distances[i];
-            itopk_indices[i]   = invalid_index;
-            itopk_distances[i] = utils::get_max_value<DISTANCE_T>();
-          }
-          flag_done = 1;
-        } else {
-          // Deactivate the node since it has been used by other CTA.
-          itopk_indices[i]   = invalid_index;
-          itopk_distances[i] = utils::get_max_value<DISTANCE_T>();
-          is_invalid         = 1;
-        }
-      }
-      if (__any_sync(0xffffffff, (flag_done > 0))) { return; }
-    }
-    if (i < itopk_size) {
-      j = 31 - __clz(__ballot_sync(0xffffffff, is_invalid));
-      if (j < 0) { return; }
-    }
-  }
-}
-
-template <unsigned MAX_ELEMENTS, class INDEX_T>
-RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort(float* distances,  // [num_elements]
-                                                      INDEX_T* indices,  // [num_elements]
-                                                      const uint32_t num_elements)
-{
-  const unsigned warp_id = threadIdx.x / 32;
-  if (warp_id > 0) { return; }
-  const unsigned lane_id = threadIdx.x % 32;
-  constexpr unsigned N   = (MAX_ELEMENTS + 31) / 32;
-  float key[N];
-  INDEX_T val[N];
-  for (unsigned i = 0; i < N; i++) {
-    unsigned j = lane_id + (32 * i);
-    if (j < num_elements) {
-      key[i] = distances[j];
-      val[i] = indices[j];
-    } else {
-      key[i] = utils::get_max_value<float>();
-      val[i] = ~static_cast<INDEX_T>(0);
-    }
-  }
-  /* Warp Sort */
-  bitonic::warp_sort<float, INDEX_T, N>(key, val);
-  /* Store sorted results */
-  for (unsigned i = 0; i < N; i++) {
-    unsigned j = (N * lane_id) + i;
-    if (j < num_elements) {
-      distances[j] = key[i];
-      indices[j]   = val[i];
-    }
-  }
-}
-
-//
-// multiple CTAs per single query
-//
-template <std::uint32_t MAX_ELEMENTS, class DATASET_DESCRIPTOR_T, class SAMPLE_FILTER_T>
-RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const
-    result_indices_ptr,  // [num_queries, num_cta_per_query, itopk_size]
-  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const
-    result_distances_ptr,  // [num_queries, num_cta_per_query, itopk_size]
-  const DATASET_DESCRIPTOR_T* dataset_desc,
-  const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
-  const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph,   // [dataset_size, graph_degree]
-  const uint32_t graph_degree,
-  const unsigned num_distilation,
-  const uint64_t rand_xor_mask,
-  const typename DATASET_DESCRIPTOR_T::INDEX_T* seed_ptr,  // [num_queries, num_seeds]
-  const uint32_t num_seeds,
-  const uint32_t visited_hash_bitlen,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const
-    traversed_hashmap_ptr,  // [num_queries, 1 << traversed_hash_bitlen]
-  const uint32_t traversed_hash_bitlen,
-  const uint32_t itopk_size,
-  const uint32_t min_iteration,
-  const uint32_t max_iteration,
-  uint32_t* const num_executed_iterations, /* stats */
-  SAMPLE_FILTER_T sample_filter)
-{
-  using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
-  using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
-  using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
-
-  const auto num_queries       = gridDim.y;
-  const auto query_id          = blockIdx.y;
-  const auto num_cta_per_query = gridDim.x;
-  const auto cta_id            = blockIdx.x;  // local CTA ID
-
-#ifdef _CLK_BREAKDOWN
-  uint64_t clk_init                 = 0;
-  uint64_t clk_compute_1st_distance = 0;
-  uint64_t clk_topk                 = 0;
-  uint64_t clk_pickup_parents       = 0;
-  uint64_t clk_compute_distance     = 0;
-  uint64_t clk_start;
-#define _CLK_START() clk_start = clock64()
-#define _CLK_REC(V)  V += clock64() - clk_start;
-#else
-#define _CLK_START()
-#define _CLK_REC(V)
-#endif
-  _CLK_START();
-
-  extern __shared__ uint8_t smem[];
-
-  // Layout of result_buffer
-  // +----------------+---------+---------------------------+
-  // | internal_top_k | padding | neighbors of parent nodes |
-  // | <itopk_size>   | upto 32 | <graph_degree>            |
-  // +----------------+---------+---------------------------+
-  // |<---        result_buffer_size_32                 --->|
-  const auto result_buffer_size    = itopk_size + graph_degree;
-  const auto result_buffer_size_32 = raft::round_up_safe<uint32_t>(result_buffer_size, 32);
-  assert(result_buffer_size_32 <= MAX_ELEMENTS);
-
-  // Set smem working buffer for the distance calculation
-  dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id);
-
-  auto* __restrict__ result_indices_buffer =
-    reinterpret_cast<INDEX_T*>(smem + dataset_desc->smem_ws_size_in_bytes());
-  auto* __restrict__ result_distances_buffer =
-    reinterpret_cast<DISTANCE_T*>(result_indices_buffer + result_buffer_size_32);
-  auto* __restrict__ local_visited_hashmap_ptr =
-    reinterpret_cast<INDEX_T*>(result_distances_buffer + result_buffer_size_32);
-  auto* __restrict__ parent_indices_buffer =
-    reinterpret_cast<INDEX_T*>(local_visited_hashmap_ptr + hashmap::get_size(visited_hash_bitlen));
-  auto* __restrict__ result_position = reinterpret_cast<int*>(parent_indices_buffer + 1);
-
-  INDEX_T* const local_traversed_hashmap_ptr =
-    traversed_hashmap_ptr + (hashmap::get_size(traversed_hash_bitlen) * query_id);
-
-  constexpr INDEX_T invalid_index    = ~static_cast<INDEX_T>(0);
-  constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
-
-  for (unsigned i = threadIdx.x; i < result_buffer_size_32; i += blockDim.x) {
-    result_indices_buffer[i]   = invalid_index;
-    result_distances_buffer[i] = utils::get_max_value<DISTANCE_T>();
-  }
-  hashmap::init<INDEX_T>(local_visited_hashmap_ptr, visited_hash_bitlen);
-  __syncthreads();
-  _CLK_REC(clk_init);
-
-  // compute distance to randomly selecting nodes
-  _CLK_START();
-  const INDEX_T* const local_seed_ptr = seed_ptr ? seed_ptr + (num_seeds * query_id) : nullptr;
-  uint32_t block_id                   = cta_id + (num_cta_per_query * query_id);
-  uint32_t num_blocks                 = num_cta_per_query * num_queries;
-
-  device::compute_distance_to_random_nodes(result_indices_buffer,
-                                           result_distances_buffer,
-                                           *dataset_desc,
-                                           graph_degree,
-                                           num_distilation,
-                                           rand_xor_mask,
-                                           local_seed_ptr,
-                                           num_seeds,
-                                           local_visited_hashmap_ptr,
-                                           visited_hash_bitlen,
-                                           local_traversed_hashmap_ptr,
-                                           traversed_hash_bitlen,
-                                           block_id,
-                                           num_blocks);
-  __syncthreads();
-  _CLK_REC(clk_compute_1st_distance);
-
-  uint32_t iter = 0;
-  while (1) {
-    _CLK_START();
-    if (threadIdx.x < 32) {
-      // [1st warp] Topk with bitonic sort
-      topk_by_bitonic_sort<MAX_ELEMENTS, INDEX_T>(
-        result_distances_buffer, result_indices_buffer, result_buffer_size_32);
-    }
-    __syncthreads();
-    _CLK_REC(clk_topk);
-
-    if (iter + 1 >= max_iteration) { break; }
-
-    _CLK_START();
-    if (threadIdx.x < 32) {
-      // [1st warp] Pick up a next parent
-      pickup_next_parent<INDEX_T, DISTANCE_T>(parent_indices_buffer,
-                                              result_indices_buffer,
-                                              result_distances_buffer,
-                                              local_traversed_hashmap_ptr,
-                                              traversed_hash_bitlen);
-    } else {
-      // [Other warps] Reset visited hashmap
-      hashmap::init<INDEX_T>(local_visited_hashmap_ptr, visited_hash_bitlen, 32);
-    }
-    __syncthreads();
-    _CLK_REC(clk_pickup_parents);
-
-    if ((parent_indices_buffer[0] == invalid_index) && (iter >= min_iteration)) { break; }
-
-    _CLK_START();
-    for (unsigned i = threadIdx.x; i < result_buffer_size_32; i += blockDim.x) {
-      INDEX_T index = result_indices_buffer[i];
-      if (index == invalid_index) { continue; }
-      if ((i >= itopk_size) && (index & index_msb_1_mask)) {
-        // Remove nodes kicked out of the itopk list from the traversed hash table.
-        hashmap::remove<INDEX_T>(
-          local_traversed_hashmap_ptr, traversed_hash_bitlen, index & ~index_msb_1_mask);
-        result_indices_buffer[i]   = invalid_index;
-        result_distances_buffer[i] = utils::get_max_value<DISTANCE_T>();
-      } else {
-        // Restore visited hashmap by putting nodes on result buffer in it.
-        index &= ~index_msb_1_mask;
-        hashmap::insert(local_visited_hashmap_ptr, visited_hash_bitlen, index);
-      }
-    }
-    // Initialize buffer for compute_distance_to_child_nodes.
-    if (threadIdx.x == blockDim.x - 1) { result_position[0] = result_buffer_size_32; }
-    __syncthreads();
-
-    // Compute the norms between child nodes and query node
-    device::compute_distance_to_child_nodes<INDEX_T, DISTANCE_T, DATASET_DESCRIPTOR_T, 0>(
-      result_indices_buffer,
-      result_distances_buffer,
-      *dataset_desc,
-      knn_graph,
-      graph_degree,
-      local_visited_hashmap_ptr,
-      visited_hash_bitlen,
-      local_traversed_hashmap_ptr,
-      traversed_hash_bitlen,
-      parent_indices_buffer,
-      result_indices_buffer,
-      1,
-      result_position,
-      result_buffer_size_32);
-    // __syncthreads();
-
-    // Check the state of the nodes in the result buffer which were not updated
-    // by the compute_distance_to_child_nodes above, and if it cannot be used as
-    // a parent node, it is deactivated.
-    for (uint32_t i = threadIdx.x; i < result_position[0]; i += blockDim.x) {
-      INDEX_T index = result_indices_buffer[i];
-      if (index == invalid_index || index & index_msb_1_mask) { continue; }
-      if (hashmap::search<INDEX_T, 1>(local_traversed_hashmap_ptr, traversed_hash_bitlen, index)) {
-        result_indices_buffer[i]   = invalid_index;
-        result_distances_buffer[i] = utils::get_max_value<DISTANCE_T>();
-      }
-    }
-    __syncthreads();
-    _CLK_REC(clk_compute_distance);
-
-    // Filtering
-    if constexpr (!std::is_same<SAMPLE_FILTER_T,
-                                cuvs::neighbors::filtering::none_sample_filter>::value) {
-      for (unsigned p = threadIdx.x; p < 1; p += blockDim.x) {
-        if (parent_indices_buffer[p] != invalid_index) {
-          const auto parent_id =
-            result_indices_buffer[parent_indices_buffer[p]] & ~index_msb_1_mask;
-          if (!sample_filter(query_id, parent_id)) {
-            // If the parent must not be in the resulting top-k list, remove from the parent list
-            result_distances_buffer[parent_indices_buffer[p]] = utils::get_max_value<DISTANCE_T>();
-            result_indices_buffer[parent_indices_buffer[p]]   = invalid_index;
-          }
-        }
-      }
-      __syncthreads();
-    }
-
-    iter++;
-  }
-
-  // Filtering
-  if constexpr (!std::is_same<SAMPLE_FILTER_T,
-                              cuvs::neighbors::filtering::none_sample_filter>::value) {
-    for (uint32_t i = threadIdx.x; i < result_buffer_size_32; i += blockDim.x) {
-      INDEX_T index = result_indices_buffer[i];
-      if (index == invalid_index) { continue; }
-      index &= ~index_msb_1_mask;
-      if (!sample_filter(query_id, index)) {
-        result_indices_buffer[i]   = invalid_index;
-        result_distances_buffer[i] = utils::get_max_value<DISTANCE_T>();
-      }
-    }
-    __syncthreads();
-  }
-
-  // Output search results (1st warp only).
-  if (threadIdx.x < 32) {
-    uint32_t offset = 0;
-    for (uint32_t i = threadIdx.x; i < result_buffer_size_32; i += 32) {
-      INDEX_T index = result_indices_buffer[i];
-      bool is_valid = false;
-      if (index != invalid_index) {
-        if (index & index_msb_1_mask) {
-          is_valid = true;
-          index &= ~index_msb_1_mask;
-        } else if ((offset < itopk_size) &&
-                   hashmap::insert<INDEX_T, 1>(
-                     local_traversed_hashmap_ptr, traversed_hash_bitlen, index)) {
-          // If a node that is not used as a parent can be inserted into
-          // the traversed hash table, it is considered a valid result.
-          is_valid = true;
-        }
-      }
-      const auto mask = __ballot_sync(0xffffffff, is_valid);
-      if (is_valid) {
-        const auto j = offset + __popc(mask & ((1 << threadIdx.x) - 1));
-        if (j < itopk_size) {
-          uint32_t k            = j + (itopk_size * (cta_id + (num_cta_per_query * query_id)));
-          result_indices_ptr[k] = index & ~index_msb_1_mask;
-          if (result_distances_ptr != nullptr) {
-            result_distances_ptr[k] = result_distances_buffer[i];
-          }
-        } else {
-          // If it is valid and registered in the traversed hash table but is
-          // not output as a result, it is removed from the hash table.
-          hashmap::remove<INDEX_T>(local_traversed_hashmap_ptr, traversed_hash_bitlen, index);
-        }
-      }
-      offset += __popc(mask);
-    }
-    // If the number of outputs is insufficient, fill in with invalid results.
-    for (uint32_t i = offset + threadIdx.x; i < itopk_size; i += 32) {
-      uint32_t k            = i + (itopk_size * (cta_id + (num_cta_per_query * query_id)));
-      result_indices_ptr[k] = invalid_index;
-      if (result_distances_ptr != nullptr) {
-        result_distances_ptr[k] = utils::get_max_value<DISTANCE_T>();
-      }
-    }
-  }
-
-  if (threadIdx.x == 0 && cta_id == 0 && num_executed_iterations != nullptr) {
-    num_executed_iterations[query_id] = iter + 1;
-  }
-
-#ifdef _CLK_BREAKDOWN
-  if ((threadIdx.x == 0 || threadIdx.x == blockDim.x - 1) && (blockIdx.x == 0) &&
-      ((query_id * 3) % gridDim.y < 3)) {
-    printf(
-      "%s:%d "
-      "query, %d, thread, %d"
-      ", init, %lu"
-      ", 1st_distance, %lu"
-      ", topk, %lu"
-      ", pickup_parents, %lu"
-      ", distance, %lu"
-      "\n",
-      __FILE__,
-      __LINE__,
-      query_id,
-      threadIdx.x,
-      clk_init,
-      clk_compute_1st_distance,
-      clk_topk,
-      clk_pickup_parents,
-      clk_compute_distance);
-  }
-#endif
-}
-
-template <class T>
-RAFT_KERNEL set_value_batch_kernel(T* const dev_ptr,
-                                   const std::size_t ld,
-                                   const T val,
-                                   const std::size_t count,
-                                   const std::size_t batch_size)
-{
-  const auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-  if (tid >= count * batch_size) { return; }
-  const auto batch_id              = tid / count;
-  const auto elem_id               = tid % count;
-  dev_ptr[elem_id + ld * batch_id] = val;
-}
-
-template <class T>
-void set_value_batch(T* const dev_ptr,
-                     const std::size_t ld,
-                     const T val,
-                     const std::size_t count,
-                     const std::size_t batch_size,
-                     cudaStream_t cuda_stream)
-{
-  constexpr std::uint32_t block_size = 256;
-  const auto grid_size               = (count * batch_size + block_size - 1) / block_size;
-  set_value_batch_kernel<T>
-    <<<grid_size, block_size, 0, cuda_stream>>>(dev_ptr, ld, val, count, batch_size);
-}
-
-template <typename DATASET_DESCRIPTOR_T, typename SAMPLE_FILTER_T>
-struct search_kernel_config {
-  // Search kernel function type. Note that the actual values for the template value
-  // parameters do not matter, because they are not part of the function signature. The
-  // second to fourth value parameters will be selected by the choose_* functions below.
-  using kernel_t = decltype(&search_kernel<128, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>);
-
-  static auto choose_buffer_size(unsigned result_buffer_size, unsigned block_size) -> kernel_t
-  {
-    if (result_buffer_size <= 64) {
-      return search_kernel<64, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
-    } else if (result_buffer_size <= 128) {
-      return search_kernel<128, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
-    } else if (result_buffer_size <= 256) {
-      return search_kernel<256, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
-    }
-    THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256);
-  }
-};
-
-template <typename DataT, typename IndexT, typename DistanceT, typename SampleFilterT>
-void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
-                    raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,
-                    IndexT* topk_indices_ptr,       // [num_queries, topk]
-                    DistanceT* topk_distances_ptr,  // [num_queries, topk]
-                    const DataT* queries_ptr,       // [num_queries, dataset_dim]
-                    uint32_t num_queries,
-                    const IndexT* dev_seed_ptr,         // [num_queries, num_seeds]
-                    uint32_t* num_executed_iterations,  // [num_queries,]
-                    const search_params& ps,
-                    uint32_t topk,
-                    // multi_cta_search (params struct)
-                    uint32_t block_size,  //
-                    uint32_t result_buffer_size,
-                    uint32_t smem_size,
-                    uint32_t visited_hash_bitlen,
-                    int64_t traversed_hash_bitlen,
-                    IndexT* traversed_hashmap_ptr,
-                    uint32_t num_cta_per_query,
-                    uint32_t num_seeds,
-                    SampleFilterT sample_filter,
-                    cudaStream_t stream)
-{
-  auto kernel =
-    search_kernel_config<dataset_descriptor_base_t<DataT, IndexT, DistanceT>,
-                         SampleFilterT>::choose_buffer_size(result_buffer_size, block_size);
-
-  RAFT_CUDA_TRY(
-    cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
-  // Initialize hash table
-  const uint32_t traversed_hash_size = hashmap::get_size(traversed_hash_bitlen);
-  set_value_batch(traversed_hashmap_ptr,
-                  traversed_hash_size,
-                  ~static_cast<IndexT>(0),
-                  traversed_hash_size,
-                  num_queries,
-                  stream);
-
-  dim3 block_dims(block_size, 1, 1);
-  dim3 grid_dims(num_cta_per_query, num_queries, 1);
-  RAFT_LOG_DEBUG("Launching kernel with %u threads, (%u, %u) blocks %u smem",
-                 block_size,
-                 num_cta_per_query,
-                 num_queries,
-                 smem_size);
-
-  kernel<<<grid_dims, block_dims, smem_size, stream>>>(topk_indices_ptr,
-                                                       topk_distances_ptr,
-                                                       dataset_desc.dev_ptr(stream),
-                                                       queries_ptr,
-                                                       graph.data_handle(),
-                                                       graph.extent(1),
-                                                       ps.num_random_samplings,
-                                                       ps.rand_xor_mask,
-                                                       dev_seed_ptr,
-                                                       num_seeds,
-                                                       visited_hash_bitlen,
-                                                       traversed_hashmap_ptr,
-                                                       traversed_hash_bitlen,
-                                                       ps.itopk_size,
-                                                       ps.min_iterations,
-                                                       ps.max_iterations,
-                                                       num_executed_iterations,
-                                                       sample_filter);
-}
-
-}  // namespace multi_cta_search
-}  // namespace cuvs::neighbors::cagra::detail
+/*
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "search_multi_cta_kernel.cuh"
+
+#include "bitonic.hpp"
+#include "compute_distance-ext.cuh"
+#include "device_common.hpp"
+#include "hashmap.hpp"
+#include "search_plan.cuh"
+#include "topk_for_cagra/topk.h"  // TODO replace with raft topk if possible
+#include "utils.hpp"
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/device_properties.hpp>
+#include <raft/core/resources.hpp>
+
+#include <cuvs/distance/distance.hpp>
+
+#include <cuvs/neighbors/common.hpp>
+
+// TODO: This shouldn't be invoking anything from spatial/knn
+#include "../ann_utils.cuh"
+
+#include <raft/util/cuda_rt_essentials.hpp>
+#include <raft/util/cudart_utils.hpp>  // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <iostream>
+#include <memory>
+#include <numeric>
+#include <vector>
+
+namespace cuvs::neighbors::cagra::detail {
+namespace multi_cta_search {
+
+// #define _CLK_BREAKDOWN  // NOTE: device_common.hpp (included above) also checks this macro; enable it there so both agree.
+
+template <class INDEX_T, class DISTANCE_T>
+RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parent(  // first warp selects one next parent
+  INDEX_T* const next_parent_indices,  // out: only [0] is written (chosen slot, or invalid_index)
+  INDEX_T* const itopk_indices,       // [itopk_size * 2]
+  DISTANCE_T* const itopk_distances,  // [itopk_size * 2]
+  INDEX_T* const hash_ptr,  // hash table handed to hashmap::insert
+  const uint32_t hash_bitlen)
+{
+  constexpr uint32_t itopk_size      = 32;  // fixed locally: the loop below scans 2 * 32 entries
+  constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
+  constexpr INDEX_T invalid_index    = ~static_cast<INDEX_T>(0);
+
+  const unsigned warp_id = threadIdx.x / 32;
+  if (warp_id > 0) { return; }  // only threads 0..31 continue
+  if (threadIdx.x == 0) { next_parent_indices[0] = invalid_index; }  // default: no parent found
+  __syncwarp();
+
+  int j = -1;  // destination slot used when the parent is found at i >= itopk_size
+  for (unsigned i = threadIdx.x; i < itopk_size * 2; i += 32) {
+    INDEX_T index    = itopk_indices[i];
+    int is_invalid   = 0;
+    int is_candidate = 0;
+    if (index == invalid_index) {
+      is_invalid = 1;
+    } else if (index & index_msb_1_mask) {  // MSB already set (used node): neither flag is raised
+    } else {
+      is_candidate = 1;
+    }
+
+    const auto ballot_mask  = __ballot_sync(0xffffffff, is_candidate);
+    const auto candidate_id = __popc(ballot_mask & ((1 << threadIdx.x) - 1));  // # candidate lanes below this lane
+    for (int k = 0; k < __popc(ballot_mask); k++) {  // one candidate lane acts per round, in lane order
+      int flag_done = 0;
+      if (is_candidate && candidate_id == k) {
+        is_candidate = 0;
+        if (hashmap::insert<INDEX_T, 1>(hash_ptr, hash_bitlen, index)) {
+          // Use this candidate as next parent
+          index |= index_msb_1_mask;  // set most significant bit as used node
+          if (i < itopk_size) {
+            next_parent_indices[0] = i;
+            itopk_indices[i]       = index;
+          } else {
+            next_parent_indices[0] = j;
+            // Move the next parent node from i-th position to j-th position
+            itopk_indices[j]   = index;
+            itopk_distances[j] = itopk_distances[i];
+            itopk_indices[i]   = invalid_index;
+            itopk_distances[i] = utils::get_max_value<DISTANCE_T>();
+          }
+          flag_done = 1;
+        } else {
+          // Deactivate the node since it has been used by other CTA.
+          itopk_indices[i]   = invalid_index;
+          itopk_distances[i] = utils::get_max_value<DISTANCE_T>();
+          is_invalid         = 1;
+        }
+      }
+      if (__any_sync(0xffffffff, (flag_done > 0))) { return; }  // all lanes leave once any lane picked a parent
+    }
+    if (i < itopk_size) {
+      j = 31 - __clz(__ballot_sync(0xffffffff, is_invalid));  // highest lane whose entry is invalid; -1 if none
+      if (j < 0) { return; }  // no invalid slot among the first itopk_size entries to move a parent into
+    }
+  }
+}
+
+template <unsigned MAX_ELEMENTS, class INDEX_T>
+RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort(float* distances,  // [num_elements]
+                                                      INDEX_T* indices,  // [num_elements]
+                                                      const uint32_t num_elements)
+{
+  const unsigned warp_id = threadIdx.x / 32;
+  if (warp_id > 0) { return; }  // only threads 0..31 take part in the sort
+  const unsigned lane_id = threadIdx.x % 32;
+  constexpr unsigned N   = (MAX_ELEMENTS + 31) / 32;  // key/value pairs held by each lane
+  float key[N];
+  INDEX_T val[N];
+  for (unsigned i = 0; i < N; i++) {
+    unsigned j = lane_id + (32 * i);  // load position: lane_id, lane_id + 32, ...
+    if (j < num_elements) {
+      key[i] = distances[j];
+      val[i] = indices[j];
+    } else {  // beyond num_elements: fill with max distance and all-ones index
+      key[i] = utils::get_max_value<float>();
+      val[i] = ~static_cast<INDEX_T>(0);
+    }
+  }
+  /* Warp Sort */
+  bitonic::warp_sort<float, INDEX_T, N>(key, val);
+  /* Store sorted results */
+  for (unsigned i = 0; i < N; i++) {
+    unsigned j = (N * lane_id) + i;  // store position: each lane writes N consecutive slots
+    if (j < num_elements) {
+      distances[j] = key[i];
+      indices[j]   = val[i];
+    }
+  }
+}
+
+// Multi-CTA search kernel: gridDim.x CTAs cooperate on each query (blockIdx.y), sharing one
+// traversed hash table per query; each CTA writes its own itopk_size results to
+// result_indices_ptr / result_distances_ptr.
+template <std::uint32_t MAX_ELEMENTS, class DATASET_DESCRIPTOR_T, class SAMPLE_FILTER_T>
+RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
+  typename DATASET_DESCRIPTOR_T::INDEX_T* const
+    result_indices_ptr,  // [num_queries, num_cta_per_query, itopk_size]
+  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const
+    result_distances_ptr,  // [num_queries, num_cta_per_query, itopk_size]
+  const DATASET_DESCRIPTOR_T* dataset_desc,
+  const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
+  const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph,   // [dataset_size, graph_degree]
+  const uint32_t graph_degree,
+  const unsigned num_distilation,
+  const uint64_t rand_xor_mask,
+  const typename DATASET_DESCRIPTOR_T::INDEX_T* seed_ptr,  // [num_queries, num_seeds]
+  const uint32_t num_seeds,
+  const uint32_t visited_hash_bitlen,
+  typename DATASET_DESCRIPTOR_T::INDEX_T* const
+    traversed_hashmap_ptr,  // [num_queries, 1 << traversed_hash_bitlen]
+  const uint32_t traversed_hash_bitlen,
+  const uint32_t itopk_size,
+  const uint32_t min_iteration,
+  const uint32_t max_iteration,
+  uint32_t* const num_executed_iterations, /* stats */
+  SAMPLE_FILTER_T sample_filter)
+{
+  using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
+  using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
+  using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
+
+  const auto num_queries       = gridDim.y;
+  const auto query_id          = blockIdx.y;
+  const auto num_cta_per_query = gridDim.x;
+  const auto cta_id            = blockIdx.x;  // local CTA ID
+
+#ifdef _CLK_BREAKDOWN
+  uint64_t clk_init                    = 0;
+  uint64_t clk_compute_1st_distance    = 0;
+  uint64_t clk_topk                    = 0;
+  uint64_t clk_pickup_parents          = 0;
+  uint64_t clk_compute_distance        = 0;
+  uint64_t clk_compute_actual_distance = 0;
+  uint64_t clk_start;
+#define _CLK_START() clk_start = clock64()
+#define _CLK_REC(V)  V += clock64() - clk_start;
+#else
+#define _CLK_START()
+#define _CLK_REC(V)
+#endif
+  _CLK_START();
+
+  extern __shared__ uint8_t smem[];
+
+  // Layout of result_buffer
+  // +----------------+---------+---------------------------+
+  // | internal_top_k | padding | neighbors of parent nodes |
+  // | <itopk_size>   | 0 to 31 | <graph_degree>            |
+  // +----------------+---------+---------------------------+
+  // |<---        result_buffer_size_32                 --->|
+  const auto result_buffer_size    = itopk_size + graph_degree;
+  const auto result_buffer_size_32 = raft::round_up_safe<uint32_t>(result_buffer_size, 32);
+  assert(result_buffer_size_32 <= MAX_ELEMENTS);
+
+  // Set smem working buffer for the distance calculation
+  dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id);
+
+  auto* __restrict__ result_indices_buffer =
+    reinterpret_cast<INDEX_T*>(smem + dataset_desc->smem_ws_size_in_bytes());
+  auto* __restrict__ result_distances_buffer =
+    reinterpret_cast<DISTANCE_T*>(result_indices_buffer + result_buffer_size_32);
+  auto* __restrict__ local_visited_hashmap_ptr =
+    reinterpret_cast<INDEX_T*>(result_distances_buffer + result_buffer_size_32);
+  auto* __restrict__ parent_indices_buffer =
+    reinterpret_cast<INDEX_T*>(local_visited_hashmap_ptr + hashmap::get_size(visited_hash_bitlen));
+  auto* __restrict__ result_position = reinterpret_cast<int*>(parent_indices_buffer + 1);
+
+  INDEX_T* const local_traversed_hashmap_ptr =
+    traversed_hashmap_ptr + (hashmap::get_size(traversed_hash_bitlen) * query_id);
+
+  constexpr INDEX_T invalid_index    = ~static_cast<INDEX_T>(0);
+  constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
+
+  for (unsigned i = threadIdx.x; i < result_buffer_size_32; i += blockDim.x) {
+    result_indices_buffer[i]   = invalid_index;
+    result_distances_buffer[i] = utils::get_max_value<DISTANCE_T>();
+  }
+  hashmap::init<INDEX_T>(local_visited_hashmap_ptr, visited_hash_bitlen);
+  __syncthreads();
+  _CLK_REC(clk_init);
+
+  // Compute distances to the seed / randomly selected initial nodes
+  _CLK_START();
+  const INDEX_T* const local_seed_ptr = seed_ptr ? seed_ptr + (num_seeds * query_id) : nullptr;
+  uint32_t block_id                   = cta_id + (num_cta_per_query * query_id);
+  uint32_t num_blocks                 = num_cta_per_query * num_queries;
+
+  device::compute_distance_to_random_nodes(result_indices_buffer,
+                                           result_distances_buffer,
+                                           *dataset_desc,
+                                           graph_degree,
+                                           num_distilation,
+                                           rand_xor_mask,
+                                           local_seed_ptr,
+                                           num_seeds,
+                                           local_visited_hashmap_ptr,
+                                           visited_hash_bitlen,
+                                           local_traversed_hashmap_ptr,
+                                           traversed_hash_bitlen,
+                                           block_id,
+                                           num_blocks);
+  __syncthreads();
+  _CLK_REC(clk_compute_1st_distance);
+
+  uint32_t iter = 0;
+  while (1) {
+    _CLK_START();
+    if (threadIdx.x < 32) {
+      // [1st warp] Topk with bitonic sort
+      topk_by_bitonic_sort<MAX_ELEMENTS, INDEX_T>(
+        result_distances_buffer, result_indices_buffer, result_buffer_size_32);
+    }
+    __syncthreads();
+    _CLK_REC(clk_topk);
+
+    if (iter + 1 >= max_iteration) { break; }
+
+    _CLK_START();
+    if (threadIdx.x < 32) {
+      // [1st warp] Pick up a next parent
+      pickup_next_parent<INDEX_T, DISTANCE_T>(parent_indices_buffer,
+                                              result_indices_buffer,
+                                              result_distances_buffer,
+                                              local_traversed_hashmap_ptr,
+                                              traversed_hash_bitlen);
+    } else {
+      // [Other warps] Reset visited hashmap
+      hashmap::init<INDEX_T>(local_visited_hashmap_ptr, visited_hash_bitlen, 32);
+    }
+    __syncthreads();
+    _CLK_REC(clk_pickup_parents);
+
+    if ((parent_indices_buffer[0] == invalid_index) && (iter >= min_iteration)) { break; }
+
+    _CLK_START();
+    for (unsigned i = threadIdx.x; i < result_buffer_size_32; i += blockDim.x) {
+      INDEX_T index = result_indices_buffer[i];
+      if (index == invalid_index) { continue; }
+      if ((i >= itopk_size) && (index & index_msb_1_mask)) {
+        // Remove nodes kicked out of the itopk list from the traversed hash table.
+        hashmap::remove<INDEX_T>(
+          local_traversed_hashmap_ptr, traversed_hash_bitlen, index & ~index_msb_1_mask);
+        result_indices_buffer[i]   = invalid_index;
+        result_distances_buffer[i] = utils::get_max_value<DISTANCE_T>();
+      } else {
+        // Restore visited hashmap by putting nodes on result buffer in it.
+        index &= ~index_msb_1_mask;
+        hashmap::insert(local_visited_hashmap_ptr, visited_hash_bitlen, index);
+      }
+    }
+    // Initialize buffer for compute_distance_to_child_nodes.
+    if (threadIdx.x == blockDim.x - 1) { result_position[0] = result_buffer_size_32; }
+    __syncthreads();
+
+    // Compute the norms between child nodes and query node
+    device::compute_distance_to_child_nodes<INDEX_T, DISTANCE_T, DATASET_DESCRIPTOR_T, 0>(
+      result_indices_buffer,
+      result_distances_buffer,
+      *dataset_desc,
+      knn_graph,
+      graph_degree,
+      local_visited_hashmap_ptr,
+      visited_hash_bitlen,
+      local_traversed_hashmap_ptr,
+      traversed_hash_bitlen,
+      parent_indices_buffer,
+      result_indices_buffer,
+      1,
+#ifdef _CLK_BREAKDOWN
+      clk_compute_actual_distance,
+#endif
+      result_position,
+      result_buffer_size_32);
+    // __syncthreads();
+
+    // Check the state of the nodes in the result buffer which were not updated
+    // by the compute_distance_to_child_nodes above, and if it cannot be used as
+    // a parent node, it is deactivated.
+    for (uint32_t i = threadIdx.x; i < result_position[0]; i += blockDim.x) {
+      INDEX_T index = result_indices_buffer[i];
+      if (index == invalid_index || index & index_msb_1_mask) { continue; }
+      if (hashmap::search<INDEX_T, 1>(local_traversed_hashmap_ptr, traversed_hash_bitlen, index)) {
+        result_indices_buffer[i]   = invalid_index;
+        result_distances_buffer[i] = utils::get_max_value<DISTANCE_T>();
+      }
+    }
+    __syncthreads();
+    _CLK_REC(clk_compute_distance);
+
+    // Filtering
+    if constexpr (!std::is_same<SAMPLE_FILTER_T,
+                                cuvs::neighbors::filtering::none_sample_filter>::value) {
+      for (unsigned p = threadIdx.x; p < 1; p += blockDim.x) {
+        if (parent_indices_buffer[p] != invalid_index) {
+          const auto parent_id =
+            result_indices_buffer[parent_indices_buffer[p]] & ~index_msb_1_mask;
+          if (!sample_filter(query_id, parent_id)) {
+            // If the parent must not be in the resulting top-k list, remove from the parent list
+            result_distances_buffer[parent_indices_buffer[p]] = utils::get_max_value<DISTANCE_T>();
+            result_indices_buffer[parent_indices_buffer[p]]   = invalid_index;
+          }
+        }
+      }
+      __syncthreads();
+    }
+
+    iter++;
+  }
+
+  // Filtering
+  if constexpr (!std::is_same<SAMPLE_FILTER_T,
+                              cuvs::neighbors::filtering::none_sample_filter>::value) {
+    for (uint32_t i = threadIdx.x; i < result_buffer_size_32; i += blockDim.x) {
+      INDEX_T index = result_indices_buffer[i];
+      if (index == invalid_index) { continue; }
+      index &= ~index_msb_1_mask;
+      if (!sample_filter(query_id, index)) {
+        result_indices_buffer[i]   = invalid_index;
+        result_distances_buffer[i] = utils::get_max_value<DISTANCE_T>();
+      }
+    }
+    __syncthreads();
+  }
+
+  // Output search results (1st warp only).
+  if (threadIdx.x < 32) {
+    uint32_t offset = 0;
+    for (uint32_t i = threadIdx.x; i < result_buffer_size_32; i += 32) {
+      INDEX_T index = result_indices_buffer[i];
+      bool is_valid = false;
+      if (index != invalid_index) {
+        if (index & index_msb_1_mask) {
+          is_valid = true;
+          index &= ~index_msb_1_mask;
+        } else if ((offset < itopk_size) &&
+                   hashmap::insert<INDEX_T, 1>(
+                     local_traversed_hashmap_ptr, traversed_hash_bitlen, index)) {
+          // If a node that is not used as a parent can be inserted into
+          // the traversed hash table, it is considered a valid result.
+          is_valid = true;
+        }
+      }
+      const auto mask = __ballot_sync(0xffffffff, is_valid);
+      if (is_valid) {  // slot = offset + #valid lower lanes (1u: no signed overflow at lane 31)
+        const auto j = offset + __popc(mask & ((1u << threadIdx.x) - 1));
+        if (j < itopk_size) {
+          uint32_t k            = j + (itopk_size * (cta_id + (num_cta_per_query * query_id)));
+          result_indices_ptr[k] = index & ~index_msb_1_mask;
+          if (result_distances_ptr != nullptr) {
+            result_distances_ptr[k] = result_distances_buffer[i];
+          }
+        } else {
+          // If it is valid and registered in the traversed hash table but is
+          // not output as a result, it is removed from the hash table.
+          hashmap::remove<INDEX_T>(local_traversed_hashmap_ptr, traversed_hash_bitlen, index);
+        }
+      }
+      offset += __popc(mask);
+    }
+    // If the number of outputs is insufficient, fill in with invalid results.
+    for (uint32_t i = offset + threadIdx.x; i < itopk_size; i += 32) {
+      uint32_t k            = i + (itopk_size * (cta_id + (num_cta_per_query * query_id)));
+      result_indices_ptr[k] = invalid_index;
+      if (result_distances_ptr != nullptr) {
+        result_distances_ptr[k] = utils::get_max_value<DISTANCE_T>();
+      }
+    }
+  }
+
+  if (threadIdx.x == 0 && cta_id == 0 && num_executed_iterations != nullptr) {
+    num_executed_iterations[query_id] = iter + 1;
+  }
+
+#ifdef _CLK_BREAKDOWN
+  if ((threadIdx.x == 0 || threadIdx.x == blockDim.x - 1) && (blockIdx.x == 0) &&
+      ((query_id * 3) % gridDim.y < 3)) {
+    printf(
+      "%s:%d "
+      "query, %d, thread, %d"
+      ", init, %lu"
+      ", 1st_distance, %lu"
+      ", topk, %lu"
+      ", pickup_parents, %lu"
+      ", distance, %lu"
+      ", hash, %lu"
+      "\n",
+      __FILE__,
+      __LINE__,
+      query_id,
+      threadIdx.x,
+      clk_init,
+      clk_compute_1st_distance,
+      clk_topk,
+      clk_pickup_parents,
+      clk_compute_actual_distance,
+      clk_compute_distance - clk_compute_actual_distance);
+  }
+#endif
+}
+
+template <class T>
+RAFT_KERNEL set_value_batch_kernel(T* const dev_ptr,
+                                   const std::size_t ld,
+                                   const T val,
+                                   const std::size_t count,
+                                   const std::size_t batch_size)
+{
+  // Sets dev_ptr[e + ld * b] = val for e < count, b < batch_size (one thread per element).
+  // The global thread index is 64-bit: a 32-bit one wraps once count * batch_size >= 2^32.
+  const std::size_t tid = threadIdx.x + static_cast<std::size_t>(blockIdx.x) * blockDim.x;
+  if (tid >= count * batch_size) { return; }
+  dev_ptr[(tid % count) + ld * (tid / count)] = val;
+}
+
+template <class T>
+void set_value_batch(T* const dev_ptr,
+                     const std::size_t ld,
+                     const T val,
+                     const std::size_t count,
+                     const std::size_t batch_size,
+                     cudaStream_t cuda_stream)
+{
+  constexpr std::uint32_t block = 256;
+  const auto grid               = (count * batch_size + block - 1) / block;
+  if (grid == 0) { return; }  // nothing to fill; a zero-sized grid is an invalid launch
+  set_value_batch_kernel<T><<<grid, block, 0, cuda_stream>>>(dev_ptr, ld, val, count, batch_size);
+}
+
+template <typename DATASET_DESCRIPTOR_T, typename SAMPLE_FILTER_T>
+struct search_kernel_config {
+  // Search kernel function type. Note that the actual value of the MAX_ELEMENTS template
+  // parameter does not matter here, because it is not part of the function signature. The
+  // instantiation that is actually launched is selected by choose_buffer_size below.
+  using kernel_t = decltype(&search_kernel<128, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>);
+  // Picks the smallest MAX_ELEMENTS of 64/128/256 covering result_buffer_size (else THROW).
+  static auto choose_buffer_size(unsigned result_buffer_size, unsigned block_size) -> kernel_t
+  {
+    if (result_buffer_size <= 64) {
+      return search_kernel<64, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
+    } else if (result_buffer_size <= 128) {
+      return search_kernel<128, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
+    } else if (result_buffer_size <= 256) {
+      return search_kernel<256, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
+    }
+    THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256);
+  }
+};
+
+template <typename DataT, typename IndexT, typename DistanceT, typename SampleFilterT>
+void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
+                    raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,
+                    IndexT* topk_indices_ptr,       // [num_queries, num_cta_per_query, itopk_size]
+                    DistanceT* topk_distances_ptr,  // [num_queries, num_cta_per_query, itopk_size]
+                    const DataT* queries_ptr,       // [num_queries, dataset_dim]
+                    uint32_t num_queries,
+                    const IndexT* dev_seed_ptr,         // [num_queries, num_seeds]
+                    uint32_t* num_executed_iterations,  // [num_queries,]
+                    const search_params& ps,
+                    uint32_t topk,  // not referenced in this function
+                    // multi_cta_search (params struct)
+                    uint32_t block_size,  // threads per CTA
+                    uint32_t result_buffer_size,
+                    uint32_t smem_size,
+                    uint32_t visited_hash_bitlen,
+                    int64_t traversed_hash_bitlen,
+                    IndexT* traversed_hashmap_ptr,
+                    uint32_t num_cta_per_query,
+                    uint32_t num_seeds,
+                    SampleFilterT sample_filter,
+                    cudaStream_t stream)
+{
+  auto kernel =  // search_kernel instantiation whose MAX_ELEMENTS covers result_buffer_size
+    search_kernel_config<dataset_descriptor_base_t<DataT, IndexT, DistanceT>,
+                         SampleFilterT>::choose_buffer_size(result_buffer_size, block_size);
+  // Allow the kernel to use smem_size bytes of dynamic shared memory.
+  RAFT_CUDA_TRY(
+    cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+  // Initialize hash table: fill one table of traversed_hash_size entries per query with ~0
+  const uint32_t traversed_hash_size = hashmap::get_size(traversed_hash_bitlen);
+  set_value_batch(traversed_hashmap_ptr,
+                  traversed_hash_size,
+                  ~static_cast<IndexT>(0),
+                  traversed_hash_size,
+                  num_queries,
+                  stream);
+
+  dim3 block_dims(block_size, 1, 1);
+  dim3 grid_dims(num_cta_per_query, num_queries, 1);  // x: CTAs per query, y: queries
+  RAFT_LOG_DEBUG("Launching kernel with %u threads, (%u, %u) blocks %u smem",
+                 block_size,
+                 num_cta_per_query,
+                 num_queries,
+                 smem_size);
+
+  kernel<<<grid_dims, block_dims, smem_size, stream>>>(topk_indices_ptr,
+                                                       topk_distances_ptr,
+                                                       dataset_desc.dev_ptr(stream),
+                                                       queries_ptr,
+                                                       graph.data_handle(),
+                                                       graph.extent(1),
+                                                       ps.num_random_samplings,
+                                                       ps.rand_xor_mask,
+                                                       dev_seed_ptr,
+                                                       num_seeds,
+                                                       visited_hash_bitlen,
+                                                       traversed_hashmap_ptr,
+                                                       traversed_hash_bitlen,
+                                                       ps.itopk_size,
+                                                       ps.min_iterations,
+                                                       ps.max_iterations,
+                                                       num_executed_iterations,
+                                                       sample_filter);
+}
+
+}  // namespace multi_cta_search
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
index 21c88fd607..93a1048bb3 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -66,8 +66,6 @@
 namespace cuvs::neighbors::cagra::detail {
 namespace single_cta_search {
 
-// #define _CLK_BREAKDOWN
-
 template <unsigned TOPK_BY_BITONIC_SORT, class INDEX_T>
 RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parents(std::uint32_t* const terminate_flag,
                                                      INDEX_T* const next_parent_indices,
@@ -581,13 +579,14 @@ __device__ void search_core(
   using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
 
 #ifdef _CLK_BREAKDOWN
-  std::uint64_t clk_init                 = 0;
-  std::uint64_t clk_compute_1st_distance = 0;
-  std::uint64_t clk_topk                 = 0;
-  std::uint64_t clk_reset_hash           = 0;
-  std::uint64_t clk_pickup_parents       = 0;
-  std::uint64_t clk_restore_hash         = 0;
-  std::uint64_t clk_compute_distance     = 0;
+  std::uint64_t clk_init                    = 0;
+  std::uint64_t clk_compute_1st_distance    = 0;
+  std::uint64_t clk_topk                    = 0;
+  std::uint64_t clk_reset_hash              = 0;
+  std::uint64_t clk_pickup_parents          = 0;
+  std::uint64_t clk_restore_hash            = 0;
+  std::uint64_t clk_compute_distance        = 0;
+  std::uint64_t clk_compute_actual_distance = 0;
   std::uint64_t clk_start;
 #define _CLK_START() clk_start = clock64()
 #define _CLK_REC(V)  V += clock64() - clk_start;
@@ -788,7 +787,12 @@ __device__ void search_core(
                                             0,
                                             parent_list_buffer,
                                             result_indices_buffer,
-                                            search_width);
+                                            search_width
+#ifdef _CLK_BREAKDOWN
+                                            ,
+                                            clk_compute_actual_distance
+#endif
+    );
     __syncthreads();
     _CLK_REC(clk_compute_distance);
 
@@ -945,6 +949,7 @@ __device__ void search_core(
       ", pickup_parents, %lu"
       ", restore_hash, %lu"
       ", distance, %lu"
+      ", hash, %lu"
       "\n",
       __FILE__,
       __LINE__,
@@ -956,7 +961,8 @@ __device__ void search_core(
       clk_reset_hash,
       clk_pickup_parents,
       clk_restore_hash,
-      clk_compute_distance);
+      clk_compute_actual_distance,
+      clk_compute_distance - clk_compute_actual_distance);
   }
 #endif
 }

From 3b3c20ba5c6cd3371b4152e32708f3d766849364 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Fri, 24 Oct 2025 08:19:07 +0900
Subject: [PATCH 017/119] Add fp8x8

---
 .../cagra/compute_distance_vpq-impl.cuh       | 86 ++++++++++++-------
 .../neighbors/detail/cagra/device_common.hpp  |  5 ++
 cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh   | 20 +++++
 3 files changed, 81 insertions(+), 30 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index afb9d0bbcc..5f8e9850a6 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -28,10 +28,10 @@
 namespace cuvs::neighbors::cagra::detail {
 
 #if 1
-using pq_val_pack_t                         = ivf_pq::detail::fp_8bit4<5, true, false>;
+using pq_val_pack_t                         = ivf_pq::detail::fp_8bit8<5, true, false>;
 using pq_val_t                              = typename pq_val_pack_t::unit_t;
-using pq_val_pack_uint_t                    = uint32_t;
-constexpr uint32_t pq_val_pack_num_elements = 4;
+using pq_val_pack_uint_t                    = uint64_t;
+constexpr uint32_t pq_val_pack_num_elements = 8;
 #else
 using pq_val_t                              = half;
 using pq_val_pack_t                         = half2;
@@ -205,17 +205,21 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
           (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
 
         if constexpr (std::is_same_v<pq_val_t, half>) {
-          pq_val_pack_t buf2;
-          buf2.x = r->pq_code_book_ptr()[i];
-          buf2.y = r->pq_code_book_ptr()[i + 1];
-          device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2);
+          // pq_val_pack_t buf2;
+          // buf2.x = r->pq_code_book_ptr()[i];
+          // buf2.y = r->pq_code_book_ptr()[i + 1];
+          // device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2);
         } else {
-          pq_val_pack_t buf4;
-          buf4.x = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
-          buf4.y = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
-          buf4.z = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
-          buf4.w = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
-          device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf4.as_u32());
+          pq_val_pack_t buf8;
+          buf8.x0 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
+          buf8.x1 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
+          buf8.x2 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
+          buf8.x3 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
+          buf8.x4 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 4]));
+          buf8.x5 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 5]));
+          buf8.x6 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 6]));
+          buf8.x7 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 7]));
+          device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf8.as_u64());
         }
       }
     }
@@ -322,43 +326,65 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
               half2 q2;
               // if constexpr (false) {
               if constexpr (std::is_same_v<pq_val_t, half>) {
-                half2 c2;
+                // half2 c2;
+                //// Loading PQ code book from smem
+                // device::lds(c2,
+                //             pq_codebook_ptr + sizeof(pq_val_pack_uint_t) *
+                //                                 ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
+
+                //// Loading query vector from smem
+                // device::lds(q2, query_ptr + sizeof(half2) * d);
+                // half2 c2_ = c2;
+                //// L2 distance
+                // auto dist = q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen /
+                // 2]>(vq_vals)[d1]; dist      = dist * dist; norm += static_cast<DISTANCE_T>(dist.x
+                // + dist.y);
+              } else {
+                pq_val_pack_t c_vec;
                 // Loading PQ code book from smem
-                device::lds(c2,
+                device::lds(c_vec.as_u64(),
                             pq_codebook_ptr + sizeof(pq_val_pack_uint_t) *
                                                 ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
 
+                half2 c2_;
+
                 // Loading query vector from smem
                 device::lds(q2, query_ptr + sizeof(half2) * d);
-                half2 c2_ = c2;
+                c2_.x = static_cast<half>(c_vec.x0);
+                c2_.y = static_cast<half>(c_vec.x1);
                 // L2 distance
                 auto dist = q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
                 dist      = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
-              } else {
-                pq_val_pack_t c2;
-                // Loading PQ code book from smem
-                device::lds(c2.as_u32(),
-                            pq_codebook_ptr + sizeof(pq_val_pack_uint_t) *
-                                                ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
 
-                half2 c2_;
+                d1 += 1;
+                d += kQueryBlock;
 
-                // Loading query vector from smem
                 device::lds(q2, query_ptr + sizeof(half2) * d);
-                c2_.x = static_cast<half>(c2.x);
-                c2_.y = static_cast<half>(c2.y);
+                c2_.x = static_cast<half>(c_vec.x2);
+                c2_.y = static_cast<half>(c_vec.x3);
                 // L2 distance
-                auto dist = q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
-                dist      = dist * dist;
+                dist = q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
+                dist = dist * dist;
+                norm += static_cast<DISTANCE_T>(dist.x + dist.y);
+
+                d1 += 1;
+                d += kQueryBlock;
+
+                device::lds(q2, query_ptr + sizeof(half2) * d);
+                c2_.x = static_cast<half>(c_vec.x4);
+                c2_.y = static_cast<half>(c_vec.x5);
+                // L2 distance
+                dist = q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
+                dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
 
                 d1 += 1;
                 d += kQueryBlock;
 
                 device::lds(q2, query_ptr + sizeof(half2) * d);
-                c2_.x = static_cast<half>(c2.z);
-                c2_.y = static_cast<half>(c2.w);
+                c2_.x = static_cast<half>(c_vec.x6);
+                c2_.y = static_cast<half>(c_vec.x7);
                 // L2 distance
                 dist = q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
                 dist = dist * dist;
diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp
index 9715a0473f..552164fc56 100644
--- a/cpp/src/neighbors/detail/cagra/device_common.hpp
+++ b/cpp/src/neighbors/detail/cagra/device_common.hpp
@@ -304,6 +304,11 @@ RAFT_DEVICE_INLINE_FUNCTION void lds(uint32_t& x, uint32_t addr)
   asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "r"(addr));
 }
 
+RAFT_DEVICE_INLINE_FUNCTION void lds(uint64_t& x, uint32_t addr)  // 64-bit shared-memory load
+{  // NOTE(review): ld.shared.u64 presumably needs `addr` to be 8-byte aligned -- confirm callers
+  asm volatile("ld.shared.u64 {%0}, [%1];" : "=l"(x) : "r"(addr));
+}
+
 RAFT_DEVICE_INLINE_FUNCTION void lds(uint32_t& x, const uint32_t* addr)
 {
   lds(x, uint32_t(__cvta_generic_to_shared(addr)));
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
index c4e1c08c01..73660f4fef 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
@@ -132,4 +132,24 @@ struct fp_8bit4<5, true, false> {
   HDI uint32_t as_u32() const { return *reinterpret_cast<const uint32_t*>(this); }
 };
 
+template <uint32_t ExpBits, bool Signed, bool SW_Emulation = true>
+struct alignas(8) fp_8bit8 {  // eight unit_t values viewable as one uint64_t
+  using unit_t = fp_8bit<ExpBits, Signed>;
+  unit_t x0, x1, x2, x3, x4, x5, x6, x7;
+  HDI fp_8bit8() : x0(0.f), x1(0.f), x2(0.f), x3(0.f), x4(0.f), x5(0.f), x6(0.f), x7(0.f) {}
+  // alignas(8) above keeps the uint64_t reinterpretations below naturally aligned.
+  HDI uint64_t& as_u64() { return *reinterpret_cast<uint64_t*>(this); }
+  HDI uint64_t as_u64() const { return *reinterpret_cast<const uint64_t*>(this); }
+};
+
+template <>
+struct alignas(8) fp_8bit8<5, true, false> {  // variant whose elements are __nv_fp8_e5m2
+  using unit_t = __nv_fp8_e5m2;
+  unit_t x0, x1, x2, x3, x4, x5, x6, x7;
+  HDI fp_8bit8() : x0(0.f), x1(0.f), x2(0.f), x3(0.f), x4(0.f), x5(0.f), x6(0.f), x7(0.f) {}
+  // alignas(8) above keeps the uint64_t reinterpretations below naturally aligned.
+  HDI uint64_t& as_u64() { return *reinterpret_cast<uint64_t*>(this); }
+  HDI uint64_t as_u64() const { return *reinterpret_cast<const uint64_t*>(this); }
+};
+
 }  // namespace cuvs::neighbors::ivf_pq::detail

From bc572e03fb65dd60e77fd495e9e5ca5cfbba2586 Mon Sep 17 00:00:00 2001
From: Hiroyuki Ootomo <hootomo@nvidia.com>
Date: Fri, 24 Oct 2025 00:06:54 -0700
Subject: [PATCH 018/119] Fix a bug

---
 .../cagra/compute_distance_vpq-impl.cuh       | 48 +++++++++++--------
 .../neighbors/detail/cagra/device_common.hpp  |  5 ++
 2 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 5f8e9850a6..3e2db41234 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -219,7 +219,7 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
           buf8.x5 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 5]));
           buf8.x6 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 6]));
           buf8.x7 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 7]));
-          device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf8.as_u64());
+          device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf8.as_u64());
         }
       }
     }
@@ -319,10 +319,13 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
             if (PQ_LEN * (v + k) >= dim) break;
 #pragma unroll
             for (std::uint32_t m = 0; m < PQ_LEN / pq_val_pack_num_elements; m++) {
-              constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN);
-              std::uint32_t d1           = m * (pq_val_pack_num_elements / 2) + (PQ_LEN / 2) * v;
-              std::uint32_t d =
-                d1 * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
+              constexpr uint32_t vq_val_pack_num_elements = 2;
+              constexpr auto kQueryBlock                  = DatasetBlockDim / (vlen * PQ_LEN);
+              std::uint32_t vq_half2_index =
+                m * (pq_val_pack_num_elements / vq_val_pack_num_elements) +
+                (PQ_LEN / vq_val_pack_num_elements) * v;
+              std::uint32_t query_val_index =
+                vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
               half2 q2;
               // if constexpr (false) {
               if constexpr (std::is_same_v<pq_val_t, half>) {
@@ -345,48 +348,51 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
                 device::lds(c_vec.as_u64(),
                             pq_codebook_ptr + sizeof(pq_val_pack_uint_t) *
                                                 ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
-
                 half2 c2_;
 
                 // Loading query vector from smem
-                device::lds(q2, query_ptr + sizeof(half2) * d);
+                device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
                 c2_.x = static_cast<half>(c_vec.x0);
                 c2_.y = static_cast<half>(c_vec.x1);
                 // L2 distance
-                auto dist = q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
-                dist      = dist * dist;
+                auto dist =
+                  q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
+                dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
 
-                d1 += 1;
-                d += kQueryBlock;
+                vq_half2_index += 1;
+                query_val_index += kQueryBlock;
 
-                device::lds(q2, query_ptr + sizeof(half2) * d);
+                device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
                 c2_.x = static_cast<half>(c_vec.x2);
                 c2_.y = static_cast<half>(c_vec.x3);
                 // L2 distance
-                dist = q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
+                dist =
+                  q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
 
-                d1 += 1;
-                d += kQueryBlock;
+                vq_half2_index += 1;
+                query_val_index += kQueryBlock;
 
-                device::lds(q2, query_ptr + sizeof(half2) * d);
+                device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
                 c2_.x = static_cast<half>(c_vec.x4);
                 c2_.y = static_cast<half>(c_vec.x5);
                 // L2 distance
-                dist = q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
+                dist =
+                  q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
 
-                d1 += 1;
-                d += kQueryBlock;
+                vq_half2_index += 1;
+                query_val_index += kQueryBlock;
 
-                device::lds(q2, query_ptr + sizeof(half2) * d);
+                device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
                 c2_.x = static_cast<half>(c_vec.x6);
                 c2_.y = static_cast<half>(c_vec.x7);
                 // L2 distance
-                dist = q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
+                dist =
+                  q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
               }
diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp
index 552164fc56..83d4bdf161 100644
--- a/cpp/src/neighbors/detail/cagra/device_common.hpp
+++ b/cpp/src/neighbors/detail/cagra/device_common.hpp
@@ -331,6 +331,11 @@ RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const uint32_t& x)
   asm volatile("st.shared.u32 [%0], %1;" : : "r"(addr), "r"(reinterpret_cast<const uint32_t&>(x)));
 }
 
+RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const uint64_t& x)
+{
+  asm volatile("st.shared.u64 [%0], %1;" : : "r"(addr), "l"(reinterpret_cast<const uint64_t&>(x)));
+}
+
 RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const half2& x)
 {
   asm volatile("st.shared.v2.u16 [%0], {%1, %2};"

From ec275d47891ff83501bed1e074b36bfc44bada27 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Sat, 25 Oct 2025 08:29:51 +0900
Subject: [PATCH 019/119] Update 2, 4, 8 configs

---
 .../cagra/compute_distance_vpq-impl.cuh       | 138 +++++++++++++-----
 cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh   |  32 ++--
 2 files changed, 120 insertions(+), 50 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 3e2db41234..06d5fcd46b 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -27,17 +27,29 @@
 
 namespace cuvs::neighbors::cagra::detail {
 
-#if 1
-using pq_val_pack_t                         = ivf_pq::detail::fp_8bit8<5, true, false>;
-using pq_val_t                              = typename pq_val_pack_t::unit_t;
-using pq_val_pack_uint_t                    = uint64_t;
-constexpr uint32_t pq_val_pack_num_elements = 8;
-#else
-using pq_val_t                              = half;
-using pq_val_pack_t                         = half2;
-using pq_val_pack_uint_t                    = uint32_t;
-constexpr uint32_t pq_val_pack_num_elements = 2;
-#endif
+template <uint32_t PQ_LEN>
+struct pq_val_type_t {};
+template <>
+struct pq_val_type_t<2> {
+  using pq_val_pack_t                                = half2;
+  using pq_val_t                                     = half;
+  using pq_val_pack_uint_t                           = uint32_t;
+  static constexpr uint32_t pq_val_pack_num_elements = 2;
+};
+template <>
+struct pq_val_type_t<4> {
+  using pq_val_pack_t                                = ivf_pq::detail::fp_8bit8<5, true, false>;
+  using pq_val_t                                     = typename pq_val_pack_t::unit_t;
+  using pq_val_pack_uint_t                           = typename pq_val_pack_t::uint_t;
+  static constexpr uint32_t pq_val_pack_num_elements = pq_val_pack_t::num_elements;
+};
+template <>
+struct pq_val_type_t<8> {
+  using pq_val_pack_t                                = ivf_pq::detail::fp_8bit8<5, true, false>;
+  using pq_val_t                                     = typename pq_val_pack_t::unit_t;
+  using pq_val_pack_uint_t                           = typename pq_val_pack_t::uint_t;
+  static constexpr uint32_t pq_val_pack_num_elements = pq_val_pack_t::num_elements;
+};
 
 template <cuvs::distance::DistanceType Metric,
           uint32_t TeamSize,
@@ -109,7 +121,8 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
   }
 
   static constexpr std::uint32_t kSMemCodeBookSizeInBytes =
-    (1 << PQ_BITS) * PQ_LEN * utils::size_of<pq_val_pack_uint_t>() / pq_val_pack_num_elements;
+    (1 << PQ_BITS) * PQ_LEN * utils::size_of<typename pq_val_type_t<PQ_LEN>::pq_val_pack_uint_t>() /
+    pq_val_type_t<PQ_LEN>::pq_val_pack_num_elements;
 
   _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(setup_workspace_type* setup_workspace_impl,
                                                  compute_distance_type* compute_distance_impl,
@@ -169,6 +182,10 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
   constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim;
   constexpr auto PQ_BITS          = DescriptorT::kPqBits;
   constexpr auto PQ_LEN           = DescriptorT::kPqLen;
+  using pq_val_config             = pq_val_type_t<PQ_LEN>;
+  using pq_val_t                  = typename pq_val_config::pq_val_t;
+  using pq_val_pack_uint_t        = typename pq_val_config::pq_val_pack_uint_t;
+  using pq_val_pack_t             = typename pq_val_config::pq_val_pack_t;
 
   auto* r = reinterpret_cast<DescriptorT*>(smem_ptr);
 
@@ -190,13 +207,14 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
     __syncthreads();
 
     // Copy PQ table
-    for (unsigned i = threadIdx.x * pq_val_pack_num_elements; i < (1 << PQ_BITS) * PQ_LEN;
-         i += blockDim.x * pq_val_pack_num_elements) {
+    for (unsigned i = threadIdx.x * pq_val_config::pq_val_pack_num_elements;
+         i < (1 << PQ_BITS) * PQ_LEN;
+         i += blockDim.x * pq_val_config::pq_val_pack_num_elements) {
       // Change the order of PQ code book array to reduce the
       // frequency of bank conflicts.
       constexpr auto num_elements_per_bank =
-        pq_val_pack_num_elements /
-        (utils::size_of<pq_val_pack_uint_t>() / utils::size_of<uint32_t>());
+        pq_val_config::pq_val_pack_num_elements /
+        (utils::size_of<pq_val_pack_uint_t>() / utils::size_of<uint32_t>());
 
       if constexpr (PQ_LEN >= num_elements_per_bank) {  // safety
         constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank;
@@ -204,12 +222,21 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
         const auto smem_index =
           (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
 
-        if constexpr (std::is_same_v<pq_val_t, half>) {
-          // pq_val_pack_t buf2;
-          // buf2.x = r->pq_code_book_ptr()[i];
-          // buf2.y = r->pq_code_book_ptr()[i + 1];
-          // device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2);
+        if constexpr (PQ_LEN == 2) {
+          half2 buf2;
+          buf2.x = r->pq_code_book_ptr()[i];
+          buf2.y = r->pq_code_book_ptr()[i + 1];
+          device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2);
+        } else if constexpr (PQ_LEN == 4) {
+          using pq_val_pack_t = ivf_pq::detail::fp_8bit4<5, true, false>;
+          pq_val_pack_t buf4;
+          buf4.x = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
+          buf4.y = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
+          buf4.z = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
+          buf4.w = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
+          device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf4.as_uint());
         } else {
+          using pq_val_pack_t = ivf_pq::detail::fp_8bit8<5, true, false>;
           pq_val_pack_t buf8;
           buf8.x0 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
           buf8.x1 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
@@ -219,7 +246,7 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
           buf8.x5 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 5]));
           buf8.x6 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 6]));
           buf8.x7 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 7]));
-          device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf8.as_u64());
+          device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf8.as_uint());
         }
       }
     }
@@ -267,6 +294,12 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
   constexpr auto PQ_LEN          = DescriptorT::kPqLen;
   using PQ_CODEBOOK_LOAD_T       = uint32_t;
 
+  using pq_val_config                         = pq_val_type_t<PQ_LEN>;
+  using pq_val_t                              = typename pq_val_config::pq_val_t;
+  using pq_val_pack_uint_t                    = typename pq_val_config::pq_val_pack_uint_t;
+  using pq_val_pack_t                         = typename pq_val_config::pq_val_pack_t;
+  constexpr uint32_t pq_val_pack_num_elements = pq_val_config::pq_val_pack_num_elements;
+
   const uint32_t query_ptr = pq_codebook_ptr + DescriptorT::kSMemCodeBookSizeInBytes;
   static_assert(PQ_BITS == 8, "Only pq_bits == 8 is supported at the moment.");
   constexpr uint32_t vlen = utils::size_of<PQ_CODEBOOK_LOAD_T>() / utils::size_of<uint8_t>();
@@ -328,24 +361,53 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
                 vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
               half2 q2;
               // if constexpr (false) {
-              if constexpr (std::is_same_v<pq_val_t, half>) {
-                // half2 c2;
-                //// Loading PQ code book from smem
-                // device::lds(c2,
-                //             pq_codebook_ptr + sizeof(pq_val_pack_uint_t) *
-                //                                 ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
-
-                //// Loading query vector from smem
-                // device::lds(q2, query_ptr + sizeof(half2) * d);
-                // half2 c2_ = c2;
-                //// L2 distance
-                // auto dist = q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen /
-                // 2]>(vq_vals)[d1]; dist      = dist * dist; norm += static_cast<DISTANCE_T>(dist.x
-                // + dist.y);
-              } else {
+              if constexpr (PQ_LEN == 2) {
+                pq_val_pack_t c2;
+                // Loading PQ code book from smem
+                device::lds(c2,
+                            pq_codebook_ptr + sizeof(pq_val_pack_uint_t) *
+                                                ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
+
+                // Loading query vector from smem
+                device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
+                // L2 distance
+                auto dist =
+                  q2 - c2 - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
+                dist = dist * dist;
+                norm += static_cast<DISTANCE_T>(dist.x + dist.y);
+              } else if constexpr (PQ_LEN == 4) {
+                pq_val_pack_t c_vec;
+                // Loading PQ code book from smem
+                device::lds(c_vec.as_uint(),
+                            pq_codebook_ptr + sizeof(pq_val_pack_uint_t) *
+                                                ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
+                half2 c2_;
+
+                // Loading query vector from smem
+                device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
+                c2_.x = static_cast<half>(c_vec.x);
+                c2_.y = static_cast<half>(c_vec.y);
+                // L2 distance
+                auto dist =
+                  q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
+                dist = dist * dist;
+                norm += static_cast<DISTANCE_T>(dist.x + dist.y);
+
+                vq_half2_index += 1;
+                query_val_index += kQueryBlock;
+
+                device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
+                c2_.x = static_cast<half>(c_vec.z);
+                c2_.y = static_cast<half>(c_vec.w);
+                // L2 distance
+                dist =
+                  q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
+                dist = dist * dist;
+                norm += static_cast<DISTANCE_T>(dist.x + dist.y);
+              } else if constexpr (PQ_LEN == 8) {
                 pq_val_pack_t c_vec;
                 // Loading PQ code book from smem
-                device::lds(c_vec.as_u64(),
+                device::lds(c_vec.as_uint(),
                             pq_codebook_ptr + sizeof(pq_val_pack_uint_t) *
                                                 ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
                 half2 c2_;
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
index 73660f4fef..10d817dfd3 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
@@ -114,42 +114,50 @@ struct fp_8bit {
 
 template <uint32_t ExpBits, bool Signed, bool SW_Emulation = true>
 struct fp_8bit4 {
-  using unit_t = fp_8bit<ExpBits, Signed>;
+  using unit_t                           = fp_8bit<ExpBits, Signed>;
+  using uint_t                           = uint32_t;
+  static constexpr uint32_t num_elements = 4;
   unit_t x, y, z, w;
   HDI fp_8bit4() : x(0.f), y(0.f), z(0.f), w(0.f) {}
 
-  HDI uint32_t& as_u32() { return *reinterpret_cast<uint32_t*>(this); }
-  HDI uint32_t as_u32() const { return *reinterpret_cast<const uint32_t*>(this); }
+  HDI uint_t& as_uint() { return *reinterpret_cast<uint_t*>(this); }
+  HDI uint_t as_uint() const { return *reinterpret_cast<const uint_t*>(this); }
 };
 
 template <>
 struct fp_8bit4<5, true, false> {
-  using unit_t = __nv_fp8_e5m2;
+  using unit_t                           = __nv_fp8_e5m2;
+  using uint_t                           = uint32_t;
+  static constexpr uint32_t num_elements = 4;
   unit_t x, y, z, w;
   HDI fp_8bit4() : x(0.f), y(0.f), z(0.f), w(0.f) {}
 
-  HDI uint32_t& as_u32() { return *reinterpret_cast<uint32_t*>(this); }
-  HDI uint32_t as_u32() const { return *reinterpret_cast<const uint32_t*>(this); }
+  HDI uint_t& as_uint() { return *reinterpret_cast<uint_t*>(this); }
+  HDI uint_t as_uint() const { return *reinterpret_cast<const uint_t*>(this); }
 };
 
 template <uint32_t ExpBits, bool Signed, bool SW_Emulation = true>
 struct fp_8bit8 {
-  using unit_t = fp_8bit<ExpBits, Signed>;
+  using unit_t                           = fp_8bit<ExpBits, Signed>;
+  using uint_t                           = uint64_t;
+  static constexpr uint32_t num_elements = 8;
   unit_t x0, x1, x2, x3, x4, x5, x6, x7;
   HDI fp_8bit8() : x0(0.f), x1(0.f), x2(0.f), x3(0.f), x4(0.f), x5(0.f), x6(0.f), x7(0.f) {}
 
-  HDI uint64_t& as_u64() { return *reinterpret_cast<uint64_t*>(this); }
-  HDI uint64_t as_u64() const { return *reinterpret_cast<const uint64_t*>(this); }
+  HDI uint_t& as_uint() { return *reinterpret_cast<uint_t*>(this); }
+  HDI uint_t as_uint() const { return *reinterpret_cast<const uint_t*>(this); }
 };
 
 template <>
 struct fp_8bit8<5, true, false> {
-  using unit_t = __nv_fp8_e5m2;
+  using unit_t                           = __nv_fp8_e5m2;
+  using uint_t                           = uint64_t;
+  static constexpr uint32_t num_elements = 8;
   unit_t x0, x1, x2, x3, x4, x5, x6, x7;
   HDI fp_8bit8() : x0(0.f), x1(0.f), x2(0.f), x3(0.f), x4(0.f), x5(0.f), x6(0.f), x7(0.f) {}
 
-  HDI uint64_t& as_u64() { return *reinterpret_cast<uint64_t*>(this); }
-  HDI uint64_t as_u64() const { return *reinterpret_cast<const uint64_t*>(this); }
+  HDI uint_t& as_uint() { return *reinterpret_cast<uint_t*>(this); }
+  HDI uint_t as_uint() const { return *reinterpret_cast<const uint_t*>(this); }
 };
 
 }  // namespace cuvs::neighbors::ivf_pq::detail

From a85d8a3af212d6092ecf6664617476985297e8ac Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Sun, 26 Oct 2025 10:11:24 +0900
Subject: [PATCH 020/119] Fix a bug

---
 .../neighbors/detail/cagra/compute_distance_vpq-impl.cuh    | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 06d5fcd46b..259c348aa2 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -38,7 +38,7 @@ struct pq_val_type_t<2> {
 };
 template <>
 struct pq_val_type_t<4> {
-  using pq_val_pack_t                                = ivf_pq::detail::fp_8bit8<5, true, false>;
+  using pq_val_pack_t                                = ivf_pq::detail::fp_8bit4<5, true, false>;
   using pq_val_t                                     = typename pq_val_pack_t::unit_t;
   using pq_val_pack_uint_t                           = typename pq_val_pack_t::uint_t;
   static constexpr uint32_t pq_val_pack_num_elements = pq_val_pack_t::num_elements;
@@ -228,15 +228,13 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
           buf2.y = r->pq_code_book_ptr()[i + 1];
           device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2);
         } else if constexpr (PQ_LEN == 4) {
-          using pq_val_pack_t = ivf_pq::detail::fp_8bit4<5, true, false>;
           pq_val_pack_t buf4;
           buf4.x = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
           buf4.y = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
           buf4.z = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
           buf4.w = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
           device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf4.as_uint());
-        } else {
-          using pq_val_pack_t = ivf_pq::detail::fp_8bit8<5, true, false>;
+        } else if constexpr (PQ_LEN == 8) {
           pq_val_pack_t buf8;
           buf8.x0 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
           buf8.x1 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));

From d1f628c74e149006b5e3e60a0838117aebce4006 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Mon, 27 Oct 2025 22:52:28 +0900
Subject: [PATCH 021/119] Add F8 query support

---
 .../cagra/compute_distance_vpq-impl.cuh       | 157 ++++++++++++------
 1 file changed, 102 insertions(+), 55 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 259c348aa2..47fbcc0155 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -156,7 +156,9 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
       3. Queries (smem_query_buffer_length elems)
     */
     return sizeof(cagra_q_dataset_descriptor_t) + kSMemCodeBookSizeInBytes +
-           raft::round_up_safe<uint32_t>(dim, DatasetBlockDim) * sizeof(QUERY_T);
+           raft::round_up_safe<uint32_t>(dim, DatasetBlockDim) *
+             utils::size_of<typename pq_val_type_t<PQ_LEN>::pq_val_pack_uint_t>() /
+             pq_val_type_t<PQ_LEN>::pq_val_pack_num_elements;
   }
 };
 
@@ -176,16 +178,17 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
                                                    const typename DescriptorT::DATA_T* queries_ptr,
                                                    uint32_t query_id) -> const DescriptorT*
 {
-  using QUERY_T                   = typename DescriptorT::QUERY_T;
-  using CODE_BOOK_T               = typename DescriptorT::CODE_BOOK_T;
-  using word_type                 = uint32_t;
-  constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim;
-  constexpr auto PQ_BITS          = DescriptorT::kPqBits;
-  constexpr auto PQ_LEN           = DescriptorT::kPqLen;
-  using pq_val_config             = pq_val_type_t<PQ_LEN>;
-  using pq_val_t                  = typename pq_val_config::pq_val_t;
-  using pq_val_pack_uint_t        = typename pq_val_config::pq_val_pack_uint_t;
-  using pq_val_pack_t             = typename pq_val_config::pq_val_pack_t;
+  using QUERY_T                           = typename DescriptorT::QUERY_T;
+  using CODE_BOOK_T                       = typename DescriptorT::CODE_BOOK_T;
+  using word_type                         = uint32_t;
+  constexpr auto kDatasetBlockDim         = DescriptorT::kDatasetBlockDim;
+  constexpr auto PQ_BITS                  = DescriptorT::kPqBits;
+  constexpr auto PQ_LEN                   = DescriptorT::kPqLen;
+  using pq_val_config                     = pq_val_type_t<PQ_LEN>;
+  using pq_val_t                          = typename pq_val_config::pq_val_t;
+  using pq_val_pack_uint_t                = typename pq_val_config::pq_val_pack_uint_t;
+  using pq_val_pack_t                     = typename pq_val_config::pq_val_pack_t;
+  constexpr auto pq_val_pack_num_elements = pq_val_config::pq_val_pack_num_elements;
 
   auto* r = reinterpret_cast<DescriptorT*>(smem_ptr);
 
@@ -257,18 +260,59 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
   auto smem_query_ptr =
     reinterpret_cast<QUERY_T*>(reinterpret_cast<uint8_t*>(smem_ptr) + sizeof(DescriptorT) +
                                DescriptorT::kSMemCodeBookSizeInBytes);
-  for (unsigned i = threadIdx.x * 2; i < dim; i += blockDim.x * 2) {
-    half2 buf2{0, 0};
-    if (i < dim) { buf2.x = mapping(queries_ptr[i]); }
-    if (i + 1 < dim) { buf2.y = mapping(queries_ptr[i + 1]); }
-    if constexpr ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) {
+  for (unsigned i = threadIdx.x * pq_val_config::pq_val_pack_num_elements; i < dim;
+       i += blockDim.x * pq_val_config::pq_val_pack_num_elements) {
+    pq_val_pack_t buf;
+    if constexpr (PQ_LEN == 2) {
+      // half2 is not zero-initialized: a lane past `dim` is set to 0 (i < dim holds in the loop).
+      buf.x = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i])));
+      buf.y = static_cast<pq_val_t>(
+        (i + 1 < dim) ? static_cast<float>(mapping(queries_ptr[i + 1])) : 0.f);
+    } else if constexpr (PQ_LEN == 4) {
+      if (i < dim) { buf.x = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i]))); }
+      if (i + 1 < dim) {
+        buf.y = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 1])));
+      }
+      if (i + 2 < dim) {
+        buf.z = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 2])));
+      }
+      if (i + 3 < dim) {
+        buf.w = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 3])));
+      }
+    } else if constexpr (PQ_LEN == 8) {
+      if (i < dim) { buf.x0 = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i]))); }
+      if (i + 1 < dim) {
+        buf.x1 = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 1])));
+      }
+      if (i + 2 < dim) {
+        buf.x2 = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 2])));
+      }
+      if (i + 3 < dim) {
+        buf.x3 = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 3])));
+      }
+      if (i + 4 < dim) {
+        buf.x4 = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 4])));
+      }
+      if (i + 5 < dim) {
+        buf.x5 = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 5])));
+      }
+      if (i + 6 < dim) {
+        buf.x6 = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 6])));
+      }
+      if (i + 7 < dim) {
+        buf.x7 = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 7])));
+      }
+    }
+
+    if constexpr ((PQ_BITS == 8) && (PQ_LEN % pq_val_pack_num_elements == 0)) {
       // Transpose the queries buffer to avoid bank conflicts in compute_distance.
       constexpr uint32_t vlen = 4;  // **** DO NOT CHANGE ****
-      constexpr auto kStride  = vlen * PQ_LEN / 2;
-      reinterpret_cast<half2*>(smem_query_ptr)[transpose<kDatasetBlockDim / 2, kStride>(i / 2)] =
-        buf2;
+      constexpr auto kStride  = vlen * PQ_LEN / pq_val_pack_num_elements;
+      reinterpret_cast<pq_val_pack_t*>(
+        smem_query_ptr)[transpose<kDatasetBlockDim / pq_val_pack_num_elements, kStride>(
+        i / pq_val_pack_num_elements)] = buf;
     } else {
-      (reinterpret_cast<half2*>(smem_query_ptr + i))[0] = buf2;
+      reinterpret_cast<pq_val_pack_t*>(smem_query_ptr)[i / pq_val_pack_num_elements] = buf;
     }
   }
 
@@ -355,104 +399,107 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
               std::uint32_t vq_half2_index =
                 m * (pq_val_pack_num_elements / vq_val_pack_num_elements) +
                 (PQ_LEN / vq_val_pack_num_elements) * v;
-              std::uint32_t query_val_index =
-                vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
-              half2 q2;
+              std::uint32_t query_val_index = vq_half2_index * kQueryBlock +
+                                              elem_offset * (PQ_LEN / pq_val_pack_num_elements) +
+                                              e * TeamSize + laneId;  // Index in pack_t
               // if constexpr (false) {
               if constexpr (PQ_LEN == 2) {
-                pq_val_pack_t c2;
+                pq_val_pack_t c2, q2;
                 // Loading PQ code book from smem
                 device::lds(c2,
                             pq_codebook_ptr + sizeof(pq_val_pack_uint_t) *
                                                 ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
 
                 // Loading query vector from smem
-                device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
+                device::lds(q2, query_ptr + sizeof(pq_val_pack_t) * query_val_index);
                 // L2 distance
                 auto dist =
                   q2 - c2 - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
               } else if constexpr (PQ_LEN == 4) {
-                pq_val_pack_t c_vec;
+                pq_val_pack_t c_vec, q_vec;
                 // Loading PQ code book from smem
                 device::lds(c_vec.as_uint(),
                             pq_codebook_ptr + sizeof(pq_val_pack_uint_t) *
                                                 ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
-                half2 c2_;
+                device::lds(q_vec.as_uint(),
+                            query_ptr + sizeof(pq_val_pack_uint_t) * query_val_index);
+
+                half2 c2_, q2_;
 
                 // Loading query vector from smem
-                device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
                 c2_.x = static_cast<half>(c_vec.x);
                 c2_.y = static_cast<half>(c_vec.y);
+                q2_.x = static_cast<half>(q_vec.x);
+                q2_.y = static_cast<half>(q_vec.y);
                 // L2 distance
-                auto dist =
-                  q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
+                auto dist = q2_ - c2_ -
+                            reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
 
                 vq_half2_index += 1;
-                query_val_index += kQueryBlock;
-
-                device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
                 c2_.x = static_cast<half>(c_vec.z);
                 c2_.y = static_cast<half>(c_vec.w);
+                q2_.x = static_cast<half>(q_vec.z);
+                q2_.y = static_cast<half>(q_vec.w);
                 // L2 distance
-                dist =
-                  q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
+                dist = q2_ - c2_ -
+                       reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
               } else if constexpr (PQ_LEN == 8) {
-                pq_val_pack_t c_vec;
+                pq_val_pack_t c_vec, q_vec;
                 // Loading PQ code book from smem
                 device::lds(c_vec.as_uint(),
                             pq_codebook_ptr + sizeof(pq_val_pack_uint_t) *
                                                 ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
-                half2 c2_;
+                device::lds(q_vec.as_uint(),
+                            query_ptr + sizeof(pq_val_pack_uint_t) * query_val_index);
+                half2 c2_, q2_;
 
                 // Loading query vector from smem
-                device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
                 c2_.x = static_cast<half>(c_vec.x0);
                 c2_.y = static_cast<half>(c_vec.x1);
+                q2_.x = static_cast<half>(q_vec.x0);
+                q2_.y = static_cast<half>(q_vec.x1);
                 // L2 distance
-                auto dist =
-                  q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
+                auto dist = q2_ - c2_ -
+                            reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
 
                 vq_half2_index += 1;
-                query_val_index += kQueryBlock;
-
-                device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
                 c2_.x = static_cast<half>(c_vec.x2);
                 c2_.y = static_cast<half>(c_vec.x3);
+                q2_.x = static_cast<half>(q_vec.x2);
+                q2_.y = static_cast<half>(q_vec.x3);
                 // L2 distance
-                dist =
-                  q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
+                dist = q2_ - c2_ -
+                       reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
 
                 vq_half2_index += 1;
-                query_val_index += kQueryBlock;
-
-                device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
                 c2_.x = static_cast<half>(c_vec.x4);
                 c2_.y = static_cast<half>(c_vec.x5);
+                q2_.x = static_cast<half>(q_vec.x4);
+                q2_.y = static_cast<half>(q_vec.x5);
                 // L2 distance
-                dist =
-                  q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
+                dist = q2_ - c2_ -
+                       reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
 
                 vq_half2_index += 1;
-                query_val_index += kQueryBlock;
-
-                device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
                 c2_.x = static_cast<half>(c_vec.x6);
                 c2_.y = static_cast<half>(c_vec.x7);
+                q2_.x = static_cast<half>(q_vec.x6);
+                q2_.y = static_cast<half>(q_vec.x7);
                 // L2 distance
-                dist =
-                  q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
+                dist = q2_ - c2_ -
+                       reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
               }

From e05dfb00d2faba492d6d84f82679155d4767a2c3 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Thu, 30 Oct 2025 22:55:18 +0900
Subject: [PATCH 022/119] Fix query vec id calc

---
 .../cagra/compute_distance_vpq-impl.cuh       | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 47fbcc0155..8c68b00e99 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -258,10 +258,10 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
 
   constexpr cuvs::spatial::knn::detail::utils::mapping<QUERY_T> mapping{};
   auto smem_query_ptr =
-    reinterpret_cast<QUERY_T*>(reinterpret_cast<uint8_t*>(smem_ptr) + sizeof(DescriptorT) +
-                               DescriptorT::kSMemCodeBookSizeInBytes);
-  for (unsigned i = threadIdx.x * pq_val_config::pq_val_pack_num_elements; i < dim;
-       i += blockDim.x * pq_val_config::pq_val_pack_num_elements) {
+    reinterpret_cast<pq_val_t*>(reinterpret_cast<uint8_t*>(smem_ptr) + sizeof(DescriptorT) +
+                                DescriptorT::kSMemCodeBookSizeInBytes);
+  for (unsigned i = threadIdx.x * pq_val_pack_num_elements; i < dim;
+       i += blockDim.x * pq_val_pack_num_elements) {
     pq_val_pack_t buf;
     if constexpr (PQ_LEN == 2) {
       if (i < dim) { static_cast<pq_val_t>(static_cast<float>(buf.x = mapping(queries_ptr[i]))); }
@@ -399,10 +399,14 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
               std::uint32_t vq_half2_index =
                 m * (pq_val_pack_num_elements / vq_val_pack_num_elements) +
                 (PQ_LEN / vq_val_pack_num_elements) * v;
-              std::uint32_t query_val_index = vq_half2_index * kQueryBlock +
-                                              elem_offset * (PQ_LEN / pq_val_pack_num_elements) +
-                                              e * TeamSize + laneId;  // Index in pack_t
-              // if constexpr (false) {
+              const uint32_t query_vec_element_id =
+                (elem_offset + e * vlen * TeamSize + v + laneId * vlen) * PQ_LEN;
+
+              constexpr auto kStride = vlen * PQ_LEN / pq_val_pack_num_elements;
+              const auto query_val_index =
+                transpose<DatasetBlockDim / pq_val_pack_num_elements, kStride>(
+                  query_vec_element_id / pq_val_pack_num_elements);
+
               if constexpr (PQ_LEN == 2) {
                 pq_val_pack_t c2, q2;
                 // Loading PQ code book from smem

From 5f2b78f57dfd3bbf5d23f26c3b67f20ea288e892 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Fri, 31 Oct 2025 01:18:31 +0900
Subject: [PATCH 023/119] Improve performance: use direct query index when PQ_LEN == 2

---
 .../detail/cagra/compute_distance_vpq-impl.cuh | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 8c68b00e99..dbe1e7c3c3 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -399,13 +399,19 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
               std::uint32_t vq_half2_index =
                 m * (pq_val_pack_num_elements / vq_val_pack_num_elements) +
                 (PQ_LEN / vq_val_pack_num_elements) * v;
-              const uint32_t query_vec_element_id =
-                (elem_offset + e * vlen * TeamSize + v + laneId * vlen) * PQ_LEN;
 
-              constexpr auto kStride = vlen * PQ_LEN / pq_val_pack_num_elements;
-              const auto query_val_index =
-                transpose<DatasetBlockDim / pq_val_pack_num_elements, kStride>(
-                  query_vec_element_id / pq_val_pack_num_elements);
+              uint32_t query_val_index;
+              if constexpr (PQ_LEN == 2) {
+                query_val_index =
+                  vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
+              } else {
+                const uint32_t query_vec_element_id =
+                  (elem_offset + e * vlen * TeamSize + v + laneId * vlen) * PQ_LEN /
+                  pq_val_pack_num_elements;
+                constexpr auto kStride = vlen * PQ_LEN / pq_val_pack_num_elements;
+                query_val_index = transpose<DatasetBlockDim / pq_val_pack_num_elements, kStride>(
+                  query_vec_element_id);
+              }
 
               if constexpr (PQ_LEN == 2) {
                 pq_val_pack_t c2, q2;

From 484f9e6279ec6a784ce6c3e4e8816d822634616b Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Fri, 31 Oct 2025 12:29:49 +0900
Subject: [PATCH 024/119] Improve performance: convert fp8 pairs to half2 in one step

---
 .../cagra/compute_distance_vpq-impl.cuh       | 42 ++++++-------
 cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh   | 62 ++++++++++---------
 2 files changed, 51 insertions(+), 53 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 259c348aa2..448f4c3252 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -229,21 +229,21 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
           device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2);
         } else if constexpr (PQ_LEN == 4) {
           pq_val_pack_t buf4;
-          buf4.x = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
-          buf4.y = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
-          buf4.z = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
-          buf4.w = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
+          buf4.data.x1[0] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
+          buf4.data.x1[1] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
+          buf4.data.x1[2] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
+          buf4.data.x1[3] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
           device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf4.as_uint());
         } else if constexpr (PQ_LEN == 8) {
           pq_val_pack_t buf8;
-          buf8.x0 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
-          buf8.x1 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
-          buf8.x2 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
-          buf8.x3 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
-          buf8.x4 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 4]));
-          buf8.x5 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 5]));
-          buf8.x6 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 6]));
-          buf8.x7 = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 7]));
+          buf8.data.x1[0] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
+          buf8.data.x1[1] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
+          buf8.data.x1[2] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
+          buf8.data.x1[3] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
+          buf8.data.x1[4] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 4]));
+          buf8.data.x1[5] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 5]));
+          buf8.data.x1[6] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 6]));
+          buf8.data.x1[7] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 7]));
           device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf8.as_uint());
         }
       }
@@ -383,8 +383,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
 
                 // Loading query vector from smem
                 device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
-                c2_.x = static_cast<half>(c_vec.x);
-                c2_.y = static_cast<half>(c_vec.y);
+                c2_ = c_vec.as_half2(0);
                 // L2 distance
                 auto dist =
                   q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
@@ -395,8 +394,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
                 query_val_index += kQueryBlock;
 
                 device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
-                c2_.x = static_cast<half>(c_vec.z);
-                c2_.y = static_cast<half>(c_vec.w);
+                c2_ = c_vec.as_half2(1);
                 // L2 distance
                 dist =
                   q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
@@ -412,8 +410,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
 
                 // Loading query vector from smem
                 device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
-                c2_.x = static_cast<half>(c_vec.x0);
-                c2_.y = static_cast<half>(c_vec.x1);
+                c2_ = c_vec.as_half2(0);
                 // L2 distance
                 auto dist =
                   q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
@@ -424,8 +421,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
                 query_val_index += kQueryBlock;
 
                 device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
-                c2_.x = static_cast<half>(c_vec.x2);
-                c2_.y = static_cast<half>(c_vec.x3);
+                c2_ = c_vec.as_half2(1);
                 // L2 distance
                 dist =
                   q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
@@ -436,8 +432,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
                 query_val_index += kQueryBlock;
 
                 device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
-                c2_.x = static_cast<half>(c_vec.x4);
-                c2_.y = static_cast<half>(c_vec.x5);
+                c2_ = c_vec.as_half2(2);
                 // L2 distance
                 dist =
                   q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
@@ -448,8 +443,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
                 query_val_index += kQueryBlock;
 
                 device::lds(q2, query_ptr + sizeof(half2) * query_val_index);
-                c2_.x = static_cast<half>(c_vec.x6);
-                c2_.y = static_cast<half>(c_vec.x7);
+                c2_ = c_vec.as_half2(3);
                 // L2 distance
                 dist =
                   q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
index 10d817dfd3..78c8db6fff 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
@@ -112,52 +112,56 @@ struct fp_8bit {
   }
 };
 
-template <uint32_t ExpBits, bool Signed, bool SW_Emulation = true>
-struct fp_8bit4 {
-  using unit_t                           = fp_8bit<ExpBits, Signed>;
-  using uint_t                           = uint32_t;
-  static constexpr uint32_t num_elements = 4;
-  unit_t x, y, z, w;
-  HDI fp_8bit4() : x(0.f), y(0.f), z(0.f), w(0.f) {}
-
-  HDI uint_t& as_uint() { return *reinterpret_cast<uint_t*>(this); }
-  HDI uint_t as_uint() const { return *reinterpret_cast<const uint_t*>(this); }
-};
+template <uint32_t ExpBits, bool Signed, bool SW_Emulation = false>
+struct fp_8bit4 {};
 
 template <>
 struct fp_8bit4<5, true, false> {
   using unit_t                           = __nv_fp8_e5m2;
+  using x2_t                             = __nv_fp8x2_storage_t;
   using uint_t                           = uint32_t;
   static constexpr uint32_t num_elements = 4;
-  unit_t x, y, z, w;
-  HDI fp_8bit4() : x(0.f), y(0.f), z(0.f), w(0.f) {}
 
-  HDI uint_t& as_uint() { return *reinterpret_cast<uint_t*>(this); }
-  HDI uint_t as_uint() const { return *reinterpret_cast<const uint_t*>(this); }
-};
+  union {
+    unit_t x1[4];
+    x2_t x2[2];
+    uint_t u;
+  } data;
 
-template <uint32_t ExpBits, bool Signed, bool SW_Emulation = true>
-struct fp_8bit8 {
-  using unit_t                           = fp_8bit<ExpBits, Signed>;
-  using uint_t                           = uint64_t;
-  static constexpr uint32_t num_elements = 8;
-  unit_t x0, x1, x2, x3, x4, x5, x6, x7;
-  HDI fp_8bit8() : x0(0.f), x1(0.f), x2(0.f), x3(0.f), x4(0.f), x5(0.f), x6(0.f), x7(0.f) {}
+  HDI fp_8bit4() { data.u = 0; }
 
-  HDI uint_t& as_uint() { return *reinterpret_cast<uint_t*>(this); }
-  HDI uint_t as_uint() const { return *reinterpret_cast<const uint_t*>(this); }
+  HDI uint_t& as_uint() { return data.u; }
+  HDI uint_t as_uint() const { return data.u; }
+  HDI half2 as_half2(const uint32_t i) const
+  {
+    return __nv_cvt_fp8x2_to_halfraw2(data.x2[i], __NV_E5M2);
+  }
 };
 
+template <uint32_t ExpBits, bool Signed, bool SW_Emulation = false>
+struct fp_8bit8 {};
+
 template <>
 struct fp_8bit8<5, true, false> {
   using unit_t                           = __nv_fp8_e5m2;
+  using x2_t                             = __nv_fp8x2_storage_t;
   using uint_t                           = uint64_t;
   static constexpr uint32_t num_elements = 8;
-  unit_t x0, x1, x2, x3, x4, x5, x6, x7;
-  HDI fp_8bit8() : x0(0.f), x1(0.f), x2(0.f), x3(0.f), x4(0.f), x5(0.f), x6(0.f), x7(0.f) {}
 
-  HDI uint_t& as_uint() { return *reinterpret_cast<uint_t*>(this); }
-  HDI uint_t as_uint() const { return *reinterpret_cast<const uint_t*>(this); }
+  union {
+    unit_t x1[8];
+    x2_t x2[4];
+    uint_t u;
+  } data;
+
+  HDI fp_8bit8() { data.u = 0; }
+
+  HDI uint_t& as_uint() { return data.u; }
+  HDI uint_t as_uint() const { return data.u; }
+  HDI half2 as_half2(const uint32_t i) const
+  {
+    return __nv_cvt_fp8x2_to_halfraw2(data.x2[i], __NV_E5M2);
+  }
 };
 
 }  // namespace cuvs::neighbors::ivf_pq::detail

From 6bcb0e6d83a073d084760543170722ab8970d29a Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Sun, 2 Nov 2025 13:05:02 +0900
Subject: [PATCH 025/119] Fix template switch

---
 .../detail/cagra/compute_distance_vpq-impl.cuh | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 2333f2e905..b7bcd9e5f2 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -214,19 +214,19 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
         const auto smem_index =
           (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
 
-        if constexpr (PQ_LEN == 2) {
+        if constexpr (pq_val_config::pq_val_pack_num_elements == 2) {
           half2 buf2;
           buf2.x = r->pq_code_book_ptr()[i];
           buf2.y = r->pq_code_book_ptr()[i + 1];
           device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2);
-        } else if constexpr (PQ_LEN == 4) {
+        } else if constexpr (pq_val_config::pq_val_pack_num_elements == 4) {
           pq_val_pack_t buf4;
           buf4.data.x1[0] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
           buf4.data.x1[1] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
           buf4.data.x1[2] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
           buf4.data.x1[3] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
           device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf4.as_uint());
-        } else if constexpr (PQ_LEN == 8) {
+        } else if constexpr (pq_val_config::pq_val_pack_num_elements == 8) {
           pq_val_pack_t buf8;
           buf8.data.x1[0] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
           buf8.data.x1[1] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
@@ -252,12 +252,12 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
   for (unsigned i = threadIdx.x * pq_val_pack_num_elements; i < dim;
        i += blockDim.x * pq_val_pack_num_elements) {
     pq_val_pack_t buf;
-    if constexpr (PQ_LEN == 2) {
+    if constexpr (pq_val_config::pq_val_pack_num_elements == 2) {
       if (i < dim) { static_cast<pq_val_t>(static_cast<float>(buf.x = mapping(queries_ptr[i]))); }
       if (i + 1 < dim) {
         static_cast<pq_val_t>(static_cast<float>(buf.y = mapping(queries_ptr[i + 1])));
       }
-    } else if constexpr (PQ_LEN == 4) {
+    } else if constexpr (pq_val_config::pq_val_pack_num_elements == 4) {
       if (i < dim) {
         buf.data.x1[0] = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i])));
       }
@@ -270,7 +270,7 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
       if (i + 3 < dim) {
         buf.data.x1[3] = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 3])));
       }
-    } else if constexpr (PQ_LEN == 8) {
+    } else if constexpr (pq_val_config::pq_val_pack_num_elements == 8) {
       if (i < dim) {
         buf.data.x1[0] = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i])));
       }
@@ -406,7 +406,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
                   query_vec_element_id);
               }
 
-              if constexpr (PQ_LEN == 2) {
+              if constexpr (pq_val_pack_num_elements == 2) {
                 pq_val_pack_t c2, q2;
                 // Loading PQ code book from smem
                 device::lds(c2,
@@ -420,7 +420,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
                   q2 - c2 - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
-              } else if constexpr (PQ_LEN == 4) {
+              } else if constexpr (pq_val_pack_num_elements == 4) {
                 pq_val_pack_t c_vec, q_vec;
                 // Loading PQ code book from smem
                 device::lds(c_vec.as_uint(),
@@ -448,7 +448,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
                        reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
-              } else if constexpr (PQ_LEN == 8) {
+              } else if constexpr (pq_val_pack_num_elements == 8) {
                 pq_val_pack_t c_vec, q_vec;
                 // Loading PQ code book from smem
                 device::lds(c_vec.as_uint(),

From 581eba1ea34ae8855c8bbf38ab03532201710f48 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Sun, 2 Nov 2025 15:38:53 +0900
Subject: [PATCH 026/119] Fix pq_val_config

---
 .../cagra/compute_distance_vpq-impl.cuh       | 37 +++++++++----------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 48b49073d1..e9e536bbfb 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -17,9 +17,7 @@
 namespace cuvs::neighbors::cagra::detail {
 
 template <uint32_t PQ_LEN>
-struct pq_val_type_t {};
-template <>
-struct pq_val_type_t<2> {
+struct pq_val_type_t {
   using pq_val_pack_t                                = half2;
   using pq_val_t                                     = half;
   using pq_val_pack_uint_t                           = uint32_t;
@@ -165,16 +163,17 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
                                                    const typename DescriptorT::DATA_T* queries_ptr,
                                                    uint32_t query_id) -> const DescriptorT*
 {
-  using QUERY_T                   = typename DescriptorT::QUERY_T;
-  using CODE_BOOK_T               = typename DescriptorT::CODE_BOOK_T;
-  using word_type                 = uint32_t;
-  constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim;
-  constexpr auto PQ_BITS          = DescriptorT::kPqBits;
-  constexpr auto PQ_LEN           = DescriptorT::kPqLen;
-  using pq_val_config             = pq_val_type_t<PQ_LEN>;
-  using pq_val_t                  = typename pq_val_config::pq_val_t;
-  using pq_val_pack_uint_t        = typename pq_val_config::pq_val_pack_uint_t;
-  using pq_val_pack_t             = typename pq_val_config::pq_val_pack_t;
+  using QUERY_T                           = typename DescriptorT::QUERY_T;
+  using CODE_BOOK_T                       = typename DescriptorT::CODE_BOOK_T;
+  using word_type                         = uint32_t;
+  constexpr auto kDatasetBlockDim         = DescriptorT::kDatasetBlockDim;
+  constexpr auto PQ_BITS                  = DescriptorT::kPqBits;
+  constexpr auto PQ_LEN                   = DescriptorT::kPqLen;
+  using pq_val_config                     = pq_val_type_t<PQ_LEN>;
+  using pq_val_t                          = typename pq_val_config::pq_val_t;
+  using pq_val_pack_uint_t                = typename pq_val_config::pq_val_pack_uint_t;
+  using pq_val_pack_t                     = typename pq_val_config::pq_val_pack_t;
+  constexpr auto pq_val_pack_num_elements = pq_val_config::pq_val_pack_num_elements;
 
   auto* r = reinterpret_cast<DescriptorT*>(smem_ptr);
 
@@ -211,19 +210,19 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
         const auto smem_index =
           (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
 
-        if constexpr (PQ_LEN == 2) {
+        if constexpr (pq_val_pack_num_elements == 2) {
           half2 buf2;
           buf2.x = r->pq_code_book_ptr()[i];
           buf2.y = r->pq_code_book_ptr()[i + 1];
           device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2);
-        } else if constexpr (PQ_LEN == 4) {
+        } else if constexpr (pq_val_pack_num_elements == 4) {
           pq_val_pack_t buf4;
           buf4.data.x1[0] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
           buf4.data.x1[1] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
           buf4.data.x1[2] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
           buf4.data.x1[3] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
           device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf4.as_uint());
-        } else if constexpr (PQ_LEN == 8) {
+        } else if constexpr (pq_val_pack_num_elements == 8) {
           pq_val_pack_t buf8;
           buf8.data.x1[0] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
           buf8.data.x1[1] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
@@ -348,7 +347,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
                 vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
               half2 q2;
               // if constexpr (false) {
-              if constexpr (PQ_LEN == 2) {
+              if constexpr (pq_val_pack_num_elements == 2) {
                 pq_val_pack_t c2;
                 // Loading PQ code book from smem
                 device::lds(c2,
@@ -362,7 +361,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
                   q2 - c2 - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
-              } else if constexpr (PQ_LEN == 4) {
+              } else if constexpr (pq_val_pack_num_elements == 4) {
                 pq_val_pack_t c_vec;
                 // Loading PQ code book from smem
                 device::lds(c_vec.as_uint(),
@@ -389,7 +388,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
                   q2 - c2_ - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
-              } else if constexpr (PQ_LEN == 8) {
+              } else if constexpr (pq_val_pack_num_elements == 8) {
                 pq_val_pack_t c_vec;
                 // Loading PQ code book from smem
                 device::lds(c_vec.as_uint(),

From a088cfd3a40df04cb53c1a516a0b8c2a0f043e51 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Mon, 3 Nov 2025 14:52:26 +0900
Subject: [PATCH 027/119] Improve smem index calculation

---
 cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index c63476f3c3..0a3cd9e4df 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -395,6 +395,10 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
               if constexpr (PQ_LEN == 2) {
                 query_val_index =
                   vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
+              } else if constexpr (PQ_LEN == pq_val_pack_num_elements) {
+                query_val_index = elem_offset +
+                                  v * (DatasetBlockDim / (pq_val_pack_num_elements * vlen)) +
+                                  e * TeamSize + laneId;
               } else {
                 const uint32_t query_vec_element_id =
                   (elem_offset + e * vlen * TeamSize + v + laneId * vlen) * PQ_LEN /

From 12809f3633bdc948facaa1c5aea7023c7e04bbb7 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Tue, 11 Nov 2025 12:27:40 +0900
Subject: [PATCH 028/119] Update fp8 pack dtype

---
 .../cagra/compute_distance_vpq-impl.cuh       | 234 +++++++++---------
 .../neighbors/detail/cagra/device_common.hpp  |  37 +++
 cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh   |  53 ----
 3 files changed, 155 insertions(+), 169 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 0a3cd9e4df..d626fbb422 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -5,7 +5,6 @@
 
 #pragma once
 
-#include "../../ivf_pq/ivf_pq_fp_8bit.cuh"
 #include "compute_distance_vpq.hpp"
 
 #include <cuvs/distance/distance.hpp>
@@ -17,25 +16,18 @@
 namespace cuvs::neighbors::cagra::detail {
 
 template <uint32_t PQ_LEN>
-struct pq_val_type_t {
-  using pq_val_pack_t                                = half2;
-  using pq_val_t                                     = half;
-  using pq_val_pack_uint_t                           = uint32_t;
-  static constexpr uint32_t pq_val_pack_num_elements = 2;
+struct smem_val_type_t {
+  using smem_val_pack_t                         = device::fp8xN<PQ_LEN, 5>;
+  using smem_val_t                              = typename smem_val_pack_t::unit_t;
+  using smem_val_pack_uint_t                    = typename smem_val_pack_t::uint_t;
+  static constexpr uint32_t num_packed_elements = smem_val_pack_t::num_elements;
 };
 template <>
-struct pq_val_type_t<4> {
-  using pq_val_pack_t                                = ivf_pq::detail::fp_8bit4<5, true, false>;
-  using pq_val_t                                     = typename pq_val_pack_t::unit_t;
-  using pq_val_pack_uint_t                           = typename pq_val_pack_t::uint_t;
-  static constexpr uint32_t pq_val_pack_num_elements = pq_val_pack_t::num_elements;
-};
-template <>
-struct pq_val_type_t<8> {
-  using pq_val_pack_t                                = ivf_pq::detail::fp_8bit8<5, true, false>;
-  using pq_val_t                                     = typename pq_val_pack_t::unit_t;
-  using pq_val_pack_uint_t                           = typename pq_val_pack_t::uint_t;
-  static constexpr uint32_t pq_val_pack_num_elements = pq_val_pack_t::num_elements;
+struct smem_val_type_t<2> {
+  using smem_val_pack_t                         = half2;
+  using smem_val_t                              = half;
+  using smem_val_pack_uint_t                    = uint32_t;
+  static constexpr uint32_t num_packed_elements = 2;
 };
 
 template <cuvs::distance::DistanceType Metric,
@@ -108,8 +100,9 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
   }
 
   static constexpr std::uint32_t kSMemCodeBookSizeInBytes =
-    (1 << PQ_BITS) * PQ_LEN * utils::size_of<typename pq_val_type_t<PQ_LEN>::pq_val_pack_uint_t>() /
-    pq_val_type_t<PQ_LEN>::pq_val_pack_num_elements;
+    (1 << PQ_BITS) * PQ_LEN *
+    utils::size_of<typename smem_val_type_t<PQ_LEN>::smem_val_pack_uint_t>() /
+    smem_val_type_t<PQ_LEN>::num_packed_elements;
 
   _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(setup_workspace_type* setup_workspace_impl,
                                                  compute_distance_type* compute_distance_impl,
@@ -144,8 +137,8 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
     */
     return sizeof(cagra_q_dataset_descriptor_t) + kSMemCodeBookSizeInBytes +
            raft::round_up_safe<uint32_t>(dim, DatasetBlockDim) *
-             utils::size_of<typename pq_val_type_t<PQ_LEN>::pq_val_pack_uint_t>() /
-             pq_val_type_t<PQ_LEN>::pq_val_pack_num_elements;
+             utils::size_of<typename smem_val_type_t<PQ_LEN>::smem_val_pack_uint_t>() /
+             smem_val_type_t<PQ_LEN>::num_packed_elements;
   }
 };
 
@@ -165,17 +158,17 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
                                                    const typename DescriptorT::DATA_T* queries_ptr,
                                                    uint32_t query_id) -> const DescriptorT*
 {
-  using QUERY_T                           = typename DescriptorT::QUERY_T;
-  using CODE_BOOK_T                       = typename DescriptorT::CODE_BOOK_T;
-  using word_type                         = uint32_t;
-  constexpr auto kDatasetBlockDim         = DescriptorT::kDatasetBlockDim;
-  constexpr auto PQ_BITS                  = DescriptorT::kPqBits;
-  constexpr auto PQ_LEN                   = DescriptorT::kPqLen;
-  using pq_val_config                     = pq_val_type_t<PQ_LEN>;
-  using pq_val_t                          = typename pq_val_config::pq_val_t;
-  using pq_val_pack_uint_t                = typename pq_val_config::pq_val_pack_uint_t;
-  using pq_val_pack_t                     = typename pq_val_config::pq_val_pack_t;
-  constexpr auto pq_val_pack_num_elements = pq_val_config::pq_val_pack_num_elements;
+  using QUERY_T                      = typename DescriptorT::QUERY_T;
+  using CODE_BOOK_T                  = typename DescriptorT::CODE_BOOK_T;
+  using word_type                    = uint32_t;
+  constexpr auto kDatasetBlockDim    = DescriptorT::kDatasetBlockDim;
+  constexpr auto PQ_BITS             = DescriptorT::kPqBits;
+  constexpr auto PQ_LEN              = DescriptorT::kPqLen;
+  using smem_val_config              = smem_val_type_t<PQ_LEN>;
+  using smem_val_t                   = typename smem_val_config::smem_val_t;
+  using smem_val_pack_uint_t         = typename smem_val_config::smem_val_pack_uint_t;
+  using smem_val_pack_t              = typename smem_val_config::smem_val_pack_t;
+  constexpr auto num_packed_elements = smem_val_config::num_packed_elements;
 
   auto* r = reinterpret_cast<DescriptorT*>(smem_ptr);
 
@@ -197,14 +190,14 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
     __syncthreads();
 
     // Copy PQ table
-    for (unsigned i = threadIdx.x * pq_val_config::pq_val_pack_num_elements;
+    for (unsigned i = threadIdx.x * smem_val_config::num_packed_elements;
          i < (1 << PQ_BITS) * PQ_LEN;
-         i += blockDim.x * pq_val_config::pq_val_pack_num_elements) {
+         i += blockDim.x * smem_val_config::num_packed_elements) {
       // Change the order of PQ code book array to reduce the
       // frequency of bank conflicts.
       constexpr auto num_elements_per_bank =
-        pq_val_config::pq_val_pack_num_elements /
-        (utils::size_of<pq_val_config::pq_val_pack_uint_t>() / utils::size_of<uint32_t>());
+        smem_val_config::num_packed_elements /
+        (utils::size_of<smem_val_config::smem_val_pack_uint_t>() / utils::size_of<uint32_t>());
 
       if constexpr (PQ_LEN >= num_elements_per_bank) {  // safety
         constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank;
@@ -212,29 +205,39 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
         const auto smem_index =
           (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
 
-        if constexpr (pq_val_pack_num_elements == 2) {
+        if constexpr (num_packed_elements == 2) {
           half2 buf2;
           buf2.x = r->pq_code_book_ptr()[i];
           buf2.y = r->pq_code_book_ptr()[i + 1];
-          device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2);
-        } else if constexpr (pq_val_pack_num_elements == 4) {
-          pq_val_pack_t buf4;
-          buf4.data.x1[0] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
-          buf4.data.x1[1] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
-          buf4.data.x1[2] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
-          buf4.data.x1[3] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
-          device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf4.as_uint());
-        } else if constexpr (pq_val_pack_num_elements == 8) {
-          pq_val_pack_t buf8;
-          buf8.data.x1[0] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
-          buf8.data.x1[1] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
-          buf8.data.x1[2] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
-          buf8.data.x1[3] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
-          buf8.data.x1[4] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 4]));
-          buf8.data.x1[5] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 5]));
-          buf8.data.x1[6] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 6]));
-          buf8.data.x1[7] = static_cast<pq_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 7]));
-          device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf8.as_uint());
+          device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_t), buf2);
+        } else if constexpr (num_packed_elements == 4) {
+          smem_val_pack_t buf4;
+          buf4.data.x1[0] = static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
+          buf4.data.x1[1] =
+            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
+          buf4.data.x1[2] =
+            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
+          buf4.data.x1[3] =
+            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
+          device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_uint_t), buf4.as_uint());
+        } else if constexpr (num_packed_elements == 8) {
+          smem_val_pack_t buf8;
+          buf8.data.x1[0] = static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
+          buf8.data.x1[1] =
+            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
+          buf8.data.x1[2] =
+            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
+          buf8.data.x1[3] =
+            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
+          buf8.data.x1[4] =
+            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 4]));
+          buf8.data.x1[5] =
+            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 5]));
+          buf8.data.x1[6] =
+            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 6]));
+          buf8.data.x1[7] =
+            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 7]));
+          device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_uint_t), buf8.as_uint());
         }
       }
     }
@@ -245,65 +248,65 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
 
   constexpr cuvs::spatial::knn::detail::utils::mapping<QUERY_T> mapping{};
   auto smem_query_ptr =
-    reinterpret_cast<pq_val_t*>(reinterpret_cast<uint8_t*>(smem_ptr) + sizeof(DescriptorT) +
-                                DescriptorT::kSMemCodeBookSizeInBytes);
-  for (unsigned i = threadIdx.x * pq_val_pack_num_elements; i < dim;
-       i += blockDim.x * pq_val_pack_num_elements) {
-    pq_val_pack_t buf;
-    if constexpr (pq_val_config::pq_val_pack_num_elements == 2) {
-      if (i < dim) { static_cast<pq_val_t>(static_cast<float>(buf.x = mapping(queries_ptr[i]))); }
+    reinterpret_cast<smem_val_t*>(reinterpret_cast<uint8_t*>(smem_ptr) + sizeof(DescriptorT) +
+                                  DescriptorT::kSMemCodeBookSizeInBytes);
+  for (unsigned i = threadIdx.x * num_packed_elements; i < dim;
+       i += blockDim.x * num_packed_elements) {
+    smem_val_pack_t buf;
+    if constexpr (smem_val_config::num_packed_elements == 2) {
+      if (i < dim) { static_cast<smem_val_t>(static_cast<float>(buf.x = mapping(queries_ptr[i]))); }
       if (i + 1 < dim) {
-        static_cast<pq_val_t>(static_cast<float>(buf.y = mapping(queries_ptr[i + 1])));
+        static_cast<smem_val_t>(static_cast<float>(buf.y = mapping(queries_ptr[i + 1])));
       }
-    } else if constexpr (pq_val_config::pq_val_pack_num_elements == 4) {
+    } else if constexpr (smem_val_config::num_packed_elements == 4) {
       if (i < dim) {
-        buf.data.x1[0] = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i])));
+        buf.data.x1[0] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i])));
       }
       if (i + 1 < dim) {
-        buf.data.x1[1] = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 1])));
+        buf.data.x1[1] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 1])));
       }
       if (i + 2 < dim) {
-        buf.data.x1[2] = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 2])));
+        buf.data.x1[2] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 2])));
       }
       if (i + 3 < dim) {
-        buf.data.x1[3] = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 3])));
+        buf.data.x1[3] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 3])));
       }
-    } else if constexpr (pq_val_config::pq_val_pack_num_elements == 8) {
+    } else if constexpr (smem_val_config::num_packed_elements == 8) {
       if (i < dim) {
-        buf.data.x1[0] = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i])));
+        buf.data.x1[0] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i])));
       }
       if (i + 1 < dim) {
-        buf.data.x1[1] = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 1])));
+        buf.data.x1[1] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 1])));
       }
       if (i + 2 < dim) {
-        buf.data.x1[2] = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 2])));
+        buf.data.x1[2] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 2])));
       }
       if (i + 3 < dim) {
-        buf.data.x1[3] = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 3])));
+        buf.data.x1[3] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 3])));
       }
       if (i + 4 < dim) {
-        buf.data.x1[4] = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 4])));
+        buf.data.x1[4] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 4])));
       }
       if (i + 5 < dim) {
-        buf.data.x1[5] = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 5])));
+        buf.data.x1[5] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 5])));
       }
       if (i + 6 < dim) {
-        buf.data.x1[6] = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 6])));
+        buf.data.x1[6] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 6])));
       }
       if (i + 7 < dim) {
-        buf.data.x1[7] = static_cast<pq_val_t>(static_cast<float>(mapping(queries_ptr[i + 7])));
+        buf.data.x1[7] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 7])));
       }
     }
 
-    if constexpr ((PQ_BITS == 8) && (PQ_LEN % pq_val_pack_num_elements == 0)) {
+    if constexpr ((PQ_BITS == 8) && (PQ_LEN % num_packed_elements == 0)) {
       // Transpose the queries buffer to avoid bank conflicts in compute_distance.
       constexpr uint32_t vlen = 4;  // **** DO NOT CHANGE ****
-      constexpr auto kStride  = vlen * PQ_LEN / pq_val_pack_num_elements;
-      reinterpret_cast<pq_val_pack_t*>(
-        smem_query_ptr)[transpose<kDatasetBlockDim / pq_val_pack_num_elements, kStride>(
-        i / pq_val_pack_num_elements)] = buf;
+      constexpr auto kStride  = vlen * PQ_LEN / num_packed_elements;
+      reinterpret_cast<smem_val_pack_t*>(
+        smem_query_ptr)[transpose<kDatasetBlockDim / num_packed_elements, kStride>(
+        i / num_packed_elements)] = buf;
     } else {
-      (reinterpret_cast<pq_val_pack_t*>(smem_query_ptr + i))[0] = buf;
+      (reinterpret_cast<smem_val_pack_t*>(smem_query_ptr + i))[0] = buf;
     }
   }
 
@@ -327,11 +330,11 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
   constexpr auto PQ_LEN          = DescriptorT::kPqLen;
   using PQ_CODEBOOK_LOAD_T       = uint32_t;
 
-  using pq_val_config                         = pq_val_type_t<PQ_LEN>;
-  using pq_val_t                              = typename pq_val_config::pq_val_t;
-  using pq_val_pack_uint_t                    = typename pq_val_config::pq_val_pack_uint_t;
-  using pq_val_pack_t                         = typename pq_val_config::pq_val_pack_t;
-  constexpr uint32_t pq_val_pack_num_elements = pq_val_config::pq_val_pack_num_elements;
+  using smem_val_config                  = smem_val_type_t<PQ_LEN>;
+  using smem_val_t                       = typename smem_val_config::smem_val_t;
+  using smem_val_pack_uint_t             = typename smem_val_config::smem_val_pack_uint_t;
+  using smem_val_pack_t                  = typename smem_val_config::smem_val_pack_t;
+  constexpr uint32_t num_packed_elements = smem_val_config::num_packed_elements;
 
   const uint32_t query_ptr = pq_codebook_ptr + DescriptorT::kSMemCodeBookSizeInBytes;
   static_assert(PQ_BITS == 8, "Only pq_bits == 8 is supported at the moment.");
@@ -364,7 +367,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
     }
     //
     if constexpr (PQ_LEN % 2 == 0) {
-      if constexpr (PQ_LEN >= pq_val_pack_num_elements) {  // safety
+      if constexpr (PQ_LEN >= num_packed_elements) {  // safety
         // **** Use half2 for distance computation ****
 #pragma unroll
         for (std::uint32_t e = 0; e < nelem; e++) {
@@ -384,52 +387,51 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
           for (std::uint32_t v = 0; v < vlen; v++) {
             if (PQ_LEN * (v + k) >= dim) break;
 #pragma unroll
-            for (std::uint32_t m = 0; m < PQ_LEN / pq_val_pack_num_elements; m++) {
+            for (std::uint32_t m = 0; m < PQ_LEN / num_packed_elements; m++) {
               constexpr uint32_t vq_val_pack_num_elements = 2;
               constexpr auto kQueryBlock                  = DatasetBlockDim / (vlen * PQ_LEN);
-              std::uint32_t vq_half2_index =
-                m * (pq_val_pack_num_elements / vq_val_pack_num_elements) +
-                (PQ_LEN / vq_val_pack_num_elements) * v;
+              std::uint32_t vq_half2_index = m * (num_packed_elements / vq_val_pack_num_elements) +
+                                             (PQ_LEN / vq_val_pack_num_elements) * v;
 
               uint32_t query_val_index;
               if constexpr (PQ_LEN == 2) {
                 query_val_index =
                   vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
-              } else if constexpr (PQ_LEN == pq_val_pack_num_elements) {
+              } else if constexpr (PQ_LEN == num_packed_elements) {
                 query_val_index = elem_offset +
-                                  v * (DatasetBlockDim / (pq_val_pack_num_elements * vlen)) +
+                                  v * (DatasetBlockDim / (num_packed_elements * vlen)) +
                                   e * TeamSize + laneId;
               } else {
                 const uint32_t query_vec_element_id =
                   (elem_offset + e * vlen * TeamSize + v + laneId * vlen) * PQ_LEN /
-                  pq_val_pack_num_elements;
-                constexpr auto kStride = vlen * PQ_LEN / pq_val_pack_num_elements;
-                query_val_index = transpose<DatasetBlockDim / pq_val_pack_num_elements, kStride>(
-                  query_vec_element_id);
+                  num_packed_elements;
+                constexpr auto kStride = vlen * PQ_LEN / num_packed_elements;
+                query_val_index =
+                  transpose<DatasetBlockDim / num_packed_elements, kStride>(query_vec_element_id);
               }
 
-              if constexpr (pq_val_pack_num_elements == 2) {
-                pq_val_pack_t c2, q2;
+              if constexpr (num_packed_elements == 2) {
+                smem_val_pack_t c2, q2;
                 // Loading PQ code book from smem
                 device::lds(c2,
-                            pq_codebook_ptr + sizeof(pq_val_pack_uint_t) *
+                            pq_codebook_ptr + sizeof(smem_val_pack_uint_t) *
                                                 ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
 
                 // Loading query vector from smem
-                device::lds(q2, query_ptr + sizeof(pq_val_pack_t) * query_val_index);
+                device::lds(q2, query_ptr + sizeof(smem_val_pack_t) * query_val_index);
                 // L2 distance
                 auto dist =
                   q2 - c2 - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
-              } else if constexpr (pq_val_pack_num_elements == 4) {
-                pq_val_pack_t c_vec, q_vec;
+              } else if constexpr (num_packed_elements == 4) {
+                smem_val_pack_t c_vec, q_vec;
                 // Loading PQ code book from smem
                 device::lds(c_vec.as_uint(),
-                            pq_codebook_ptr + sizeof(pq_val_pack_uint_t) *
+                            pq_codebook_ptr + sizeof(smem_val_pack_uint_t) *
                                                 ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
                 device::lds(q_vec.as_uint(),
-                            query_ptr + sizeof(pq_val_pack_uint_t) * query_val_index);
+                            query_ptr + sizeof(smem_val_pack_uint_t) * query_val_index);
 
                 half2 c2_, q2_;
 
@@ -450,14 +452,14 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
                        reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
-              } else if constexpr (pq_val_pack_num_elements == 8) {
-                pq_val_pack_t c_vec, q_vec;
+              } else if constexpr (num_packed_elements == 8) {
+                smem_val_pack_t c_vec, q_vec;
                 // Loading PQ code book from smem
                 device::lds(c_vec.as_uint(),
-                            pq_codebook_ptr + sizeof(pq_val_pack_uint_t) *
+                            pq_codebook_ptr + sizeof(smem_val_pack_uint_t) *
                                                 ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
                 device::lds(q_vec.as_uint(),
-                            query_ptr + sizeof(pq_val_pack_uint_t) * query_val_index);
+                            query_ptr + sizeof(smem_val_pack_uint_t) * query_val_index);
                 half2 c2_, q2_;
 
                 // Loading query vector from smem
@@ -521,8 +523,8 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
 #pragma unroll
         for (std::uint32_t v = 0; v < vlen; v++) {
           if (PQ_LEN * (v + k) >= dim) break;
-          CODE_BOOK_T pq_vals[PQ_LEN];
-          device::lds(pq_vals, pq_codebook_ptr + sizeof(CODE_BOOK_T) * PQ_LEN * (pq_code & 0xff));
+          CODE_BOOK_T smem_vals[PQ_LEN];
+          device::lds(smem_vals, pq_codebook_ptr + sizeof(CODE_BOOK_T) * PQ_LEN * (pq_code & 0xff));
 #pragma unroll
           for (std::uint32_t m = 0; m < PQ_LEN; m++) {
             const std::uint32_t d1 = m + (PQ_LEN * v);
@@ -530,7 +532,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
             // if (d >= dataset_dim) break;
             DISTANCE_T diff;
             device::lds(diff, query_ptr + sizeof(QUERY_T) * d);
-            diff -= static_cast<DISTANCE_T>(pq_vals[m]);
+            diff -= static_cast<DISTANCE_T>(smem_vals[m]);
             diff -=
               static_cast<DISTANCE_T>(reinterpret_cast<CODE_BOOK_T(&)[PQ_LEN * vlen]>(vq_vals)[d1]);
             norm += diff * diff;
diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp
index aa852803b0..aa0e3847b5 100644
--- a/cpp/src/neighbors/detail/cagra/device_common.hpp
+++ b/cpp/src/neighbors/detail/cagra/device_common.hpp
@@ -249,6 +249,43 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes(
   }
 }
 
+template <uint32_t Bit>
+struct uintN_t {};
+template <>
+struct uintN_t<32> {
+  using type = uint32_t;
+};
+template <>
+struct uintN_t<64> {
+  using type = uint64_t;
+};
+
+template <uint32_t NumPacked, uint32_t ExpBits>
+struct fp8xN {};
+
+template <uint32_t NumPacked>
+struct fp8xN<NumPacked, 5> {
+  using uint_t                           = typename uintN_t<8 * NumPacked>::type;
+  using unit_t                           = __nv_fp8_e5m2;
+  using x2_t                             = __nv_fp8x2_storage_t;
+  static constexpr uint32_t num_elements = NumPacked;
+
+  union {
+    unit_t x1[num_elements];
+    x2_t x2[num_elements / 2];
+    uint_t u;
+  } data;
+
+  HDI fp8xN() { data.u = 0; }
+
+  HDI uint_t& as_uint() { return data.u; }
+  HDI uint_t as_uint() const { return data.u; }
+  HDI half2 as_half2(const uint32_t i) const
+  {
+    return __nv_cvt_fp8x2_to_halfraw2(data.x2[i], __NV_E5M2);
+  }
+};
+
 RAFT_DEVICE_INLINE_FUNCTION void lds(float& x, uint32_t addr)
 {
   asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "r"(addr));
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
index 2ebf562488..61b9e595fc 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
@@ -100,57 +100,4 @@ struct fp_8bit {
     return r;
   }
 };
-
-template <uint32_t ExpBits, bool Signed, bool SW_Emulation = false>
-struct fp_8bit4 {};
-
-template <>
-struct fp_8bit4<5, true, false> {
-  using unit_t                           = __nv_fp8_e5m2;
-  using x2_t                             = __nv_fp8x2_storage_t;
-  using uint_t                           = uint32_t;
-  static constexpr uint32_t num_elements = 4;
-
-  union {
-    unit_t x1[4];
-    x2_t x2[2];
-    uint_t u;
-  } data;
-
-  HDI fp_8bit4() { data.u = 0; }
-
-  HDI uint_t& as_uint() { return data.u; }
-  HDI uint_t as_uint() const { return data.u; }
-  HDI half2 as_half2(const uint32_t i) const
-  {
-    return __nv_cvt_fp8x2_to_halfraw2(data.x2[i], __NV_E5M2);
-  }
-};
-
-template <uint32_t ExpBits, bool Signed, bool SW_Emulation = false>
-struct fp_8bit8 {};
-
-template <>
-struct fp_8bit8<5, true, false> {
-  using unit_t                           = __nv_fp8_e5m2;
-  using x2_t                             = __nv_fp8x2_storage_t;
-  using uint_t                           = uint64_t;
-  static constexpr uint32_t num_elements = 8;
-
-  union {
-    unit_t x1[8];
-    x2_t x2[4];
-    uint_t u;
-  } data;
-
-  HDI fp_8bit8() { data.u = 0; }
-
-  HDI uint_t& as_uint() { return data.u; }
-  HDI uint_t as_uint() const { return data.u; }
-  HDI half2 as_half2(const uint32_t i) const
-  {
-    return __nv_cvt_fp8x2_to_halfraw2(data.x2[i], __NV_E5M2);
-  }
-};
-
 }  // namespace cuvs::neighbors::ivf_pq::detail

From f91d0413becb4fffd37c1aea9b63f2a89eebb8c8 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Tue, 11 Nov 2025 12:41:57 +0900
Subject: [PATCH 029/119] Refactor duplicated per-pack-size fp8 branches

---
 .../cagra/compute_distance_vpq-impl.cuh       | 158 ++++--------------
 1 file changed, 29 insertions(+), 129 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index d626fbb422..b0b022c5d3 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -210,34 +210,13 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
           buf2.x = r->pq_code_book_ptr()[i];
           buf2.y = r->pq_code_book_ptr()[i + 1];
           device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_t), buf2);
-        } else if constexpr (num_packed_elements == 4) {
-          smem_val_pack_t buf4;
-          buf4.data.x1[0] = static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
-          buf4.data.x1[1] =
-            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
-          buf4.data.x1[2] =
-            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
-          buf4.data.x1[3] =
-            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
-          device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_uint_t), buf4.as_uint());
-        } else if constexpr (num_packed_elements == 8) {
-          smem_val_pack_t buf8;
-          buf8.data.x1[0] = static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i]));
-          buf8.data.x1[1] =
-            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 1]));
-          buf8.data.x1[2] =
-            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 2]));
-          buf8.data.x1[3] =
-            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 3]));
-          buf8.data.x1[4] =
-            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 4]));
-          buf8.data.x1[5] =
-            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 5]));
-          buf8.data.x1[6] =
-            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 6]));
-          buf8.data.x1[7] =
-            static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + 7]));
-          device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_uint_t), buf8.as_uint());
+        } else if constexpr (num_packed_elements == 4 || num_packed_elements == 8) {
+          smem_val_pack_t buf;
+#pragma unroll
+          for (uint32_t k = 0; k < num_packed_elements; k++) {
+            buf.data.x1[k] = static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[k]));
+          }
+          device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_uint_t), buf.as_uint());
         }
       }
     }
@@ -253,48 +232,17 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
   for (unsigned i = threadIdx.x * num_packed_elements; i < dim;
        i += blockDim.x * num_packed_elements) {
     smem_val_pack_t buf;
-    if constexpr (smem_val_config::num_packed_elements == 2) {
+    if constexpr (num_packed_elements == 2) {
       if (i < dim) { static_cast<smem_val_t>(static_cast<float>(buf.x = mapping(queries_ptr[i]))); }
       if (i + 1 < dim) {
         static_cast<smem_val_t>(static_cast<float>(buf.y = mapping(queries_ptr[i + 1])));
       }
-    } else if constexpr (smem_val_config::num_packed_elements == 4) {
-      if (i < dim) {
-        buf.data.x1[0] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i])));
-      }
-      if (i + 1 < dim) {
-        buf.data.x1[1] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 1])));
-      }
-      if (i + 2 < dim) {
-        buf.data.x1[2] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 2])));
-      }
-      if (i + 3 < dim) {
-        buf.data.x1[3] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 3])));
-      }
-    } else if constexpr (smem_val_config::num_packed_elements == 8) {
-      if (i < dim) {
-        buf.data.x1[0] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i])));
-      }
-      if (i + 1 < dim) {
-        buf.data.x1[1] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 1])));
-      }
-      if (i + 2 < dim) {
-        buf.data.x1[2] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 2])));
-      }
-      if (i + 3 < dim) {
-        buf.data.x1[3] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 3])));
-      }
-      if (i + 4 < dim) {
-        buf.data.x1[4] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 4])));
-      }
-      if (i + 5 < dim) {
-        buf.data.x1[5] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 5])));
-      }
-      if (i + 6 < dim) {
-        buf.data.x1[6] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 6])));
-      }
-      if (i + 7 < dim) {
-        buf.data.x1[7] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + 7])));
+    } else if constexpr (num_packed_elements == 4 || num_packed_elements == 8) {
+#pragma unroll
+      for (uint32_t k = 0; k < num_packed_elements; k++) {
+        if (i + k < dim) {
+          buf.data.x1[k] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + k])));
+        }
       }
     }
 
@@ -424,7 +372,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
                   q2 - c2 - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
                 dist = dist * dist;
                 norm += static_cast<DISTANCE_T>(dist.x + dist.y);
-              } else if constexpr (num_packed_elements == 4) {
+              } else if constexpr (num_packed_elements == 4 || num_packed_elements == 8) {
                 smem_val_pack_t c_vec, q_vec;
                 // Loading PQ code book from smem
                 device::lds(c_vec.as_uint(),
@@ -435,68 +383,20 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
 
                 half2 c2_, q2_;
 
-                // Loading query vector from smem
-                c2_ = c_vec.as_half2(0);
-                q2_ = q_vec.as_half2(0);
-                // L2 distance
-                auto dist = q2_ - c2_ -
-                            reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
-                dist = dist * dist;
-                norm += static_cast<DISTANCE_T>(dist.x + dist.y);
-
-                vq_half2_index += 1;
-                c2_ = c_vec.as_half2(1);
-                q2_ = q_vec.as_half2(1);
-                // L2 distance
-                dist = q2_ - c2_ -
-                       reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
-                dist = dist * dist;
-                norm += static_cast<DISTANCE_T>(dist.x + dist.y);
-              } else if constexpr (num_packed_elements == 8) {
-                smem_val_pack_t c_vec, q_vec;
-                // Loading PQ code book from smem
-                device::lds(c_vec.as_uint(),
-                            pq_codebook_ptr + sizeof(smem_val_pack_uint_t) *
-                                                ((1 << PQ_BITS) * m + ((pq_code & 0xff))));
-                device::lds(q_vec.as_uint(),
-                            query_ptr + sizeof(smem_val_pack_uint_t) * query_val_index);
-                half2 c2_, q2_;
-
-                // Loading query vector from smem
-                c2_ = c_vec.as_half2(0);
-                q2_ = q_vec.as_half2(0);
-                // L2 distance
-                auto dist = q2_ - c2_ -
-                            reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
-                dist = dist * dist;
-                norm += static_cast<DISTANCE_T>(dist.x + dist.y);
-
-                vq_half2_index += 1;
-                c2_ = c_vec.as_half2(1);
-                q2_ = q_vec.as_half2(1);
-                // L2 distance
-                dist = q2_ - c2_ -
-                       reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
-                dist = dist * dist;
-                norm += static_cast<DISTANCE_T>(dist.x + dist.y);
-
-                vq_half2_index += 1;
-                c2_ = c_vec.as_half2(2);
-                q2_ = q_vec.as_half2(2);
-                // L2 distance
-                dist = q2_ - c2_ -
-                       reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
-                dist = dist * dist;
-                norm += static_cast<DISTANCE_T>(dist.x + dist.y);
-
-                vq_half2_index += 1;
-                c2_ = c_vec.as_half2(3);
-                q2_ = q_vec.as_half2(3);
-                // L2 distance
-                dist = q2_ - c2_ -
-                       reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
-                dist = dist * dist;
-                norm += static_cast<DISTANCE_T>(dist.x + dist.y);
+#pragma unroll
+                for (uint32_t bi = 0; bi < num_packed_elements / 2; bi++) {
+                  // Loading query vector from smem
+                  c2_ = c_vec.as_half2(bi);
+                  q2_ = q_vec.as_half2(bi);
+                  // L2 distance
+                  auto dist =
+                    q2_ - c2_ -
+                    reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
+                  dist = dist * dist;
+                  norm += static_cast<DISTANCE_T>(dist.x + dist.y);
+
+                  vq_half2_index += 1;
+                }
               }
             }
             pq_code >>= 8;

From 7c8ecd461e44a29ee7bb734d076f101a622e22e9 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Tue, 11 Nov 2025 16:06:11 +0900
Subject: [PATCH 030/119] Fix PQ codebook index (i + k) in setup_workspace_vpq

---
 cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index b0b022c5d3..786a294df7 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -214,7 +214,8 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
           smem_val_pack_t buf;
 #pragma unroll
           for (uint32_t k = 0; k < num_packed_elements; k++) {
-            buf.data.x1[k] = static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[k]));
+            buf.data.x1[k] =
+              static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + k]));
           }
           device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_uint_t), buf.as_uint());
         }

From 7b461156e77e63af45d9d62053d643e863ef4475 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Tue, 11 Nov 2025 18:37:42 +0900
Subject: [PATCH 031/119] Add EnableFP8 flag

---
 .../cagra/compute_distance_vpq-impl.cuh       | 42 ++++++++++++-------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 786a294df7..d017372924 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -15,21 +15,27 @@
 
 namespace cuvs::neighbors::cagra::detail {
 
-template <uint32_t PQ_LEN>
-struct smem_val_type_t {
-  using smem_val_pack_t                         = device::fp8xN<PQ_LEN, 5>;
-  using smem_val_t                              = typename smem_val_pack_t::unit_t;
-  using smem_val_pack_uint_t                    = typename smem_val_pack_t::uint_t;
-  static constexpr uint32_t num_packed_elements = smem_val_pack_t::num_elements;
-};
-template <>
-struct smem_val_type_t<2> {
+template <uint32_t PQ_LEN, bool EnableFP8, class Enable = void>
+struct smem_val_type_t;
+
+template <uint32_t PQ_LEN, bool EnableFP8>
+struct smem_val_type_t<PQ_LEN, EnableFP8, std::enable_if_t<PQ_LEN == 2 || !EnableFP8>> {
   using smem_val_pack_t                         = half2;
   using smem_val_t                              = half;
   using smem_val_pack_uint_t                    = uint32_t;
   static constexpr uint32_t num_packed_elements = 2;
 };
 
+template <uint32_t PQ_LEN, bool EnableFP8>
+struct smem_val_type_t<PQ_LEN,
+                       EnableFP8,
+                       std::enable_if_t<(PQ_LEN == 4 || PQ_LEN == 8) && EnableFP8>> {
+  using smem_val_pack_t                         = device::fp8xN<PQ_LEN, 5>;
+  using smem_val_t                              = typename smem_val_pack_t::unit_t;
+  using smem_val_pack_uint_t                    = typename smem_val_pack_t::uint_t;
+  static constexpr uint32_t num_packed_elements = smem_val_pack_t::num_elements;
+};
+
 template <cuvs::distance::DistanceType Metric,
           uint32_t TeamSize,
           uint32_t DatasetBlockDim,
@@ -38,7 +44,8 @@ template <cuvs::distance::DistanceType Metric,
           typename CodebookT,
           typename DataT,
           typename IndexT,
-          typename DistanceT>
+          typename DistanceT,
+          bool EnableFP8 = true>
 struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, IndexT, DistanceT> {
   using base_type   = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
   using CODE_BOOK_T = CodebookT;
@@ -57,6 +64,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
   constexpr static inline auto kDatasetBlockDim = DatasetBlockDim;
   constexpr static inline auto kPqBits          = PQ_BITS;
   constexpr static inline auto kPqLen           = PQ_LEN;
+  constexpr static inline auto kEnableFP8       = EnableFP8;
 
   static_assert(std::is_same_v<CODE_BOOK_T, half>, "Only CODE_BOOK_T = `half` is supported now");
 
@@ -101,8 +109,8 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
 
   static constexpr std::uint32_t kSMemCodeBookSizeInBytes =
     (1 << PQ_BITS) * PQ_LEN *
-    utils::size_of<typename smem_val_type_t<PQ_LEN>::smem_val_pack_uint_t>() /
-    smem_val_type_t<PQ_LEN>::num_packed_elements;
+    utils::size_of<typename smem_val_type_t<PQ_LEN, EnableFP8>::smem_val_pack_uint_t>() /
+    smem_val_type_t<PQ_LEN, EnableFP8>::num_packed_elements;
 
   _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(setup_workspace_type* setup_workspace_impl,
                                                  compute_distance_type* compute_distance_impl,
@@ -137,8 +145,8 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
     */
     return sizeof(cagra_q_dataset_descriptor_t) + kSMemCodeBookSizeInBytes +
            raft::round_up_safe<uint32_t>(dim, DatasetBlockDim) *
-             utils::size_of<typename smem_val_type_t<PQ_LEN>::smem_val_pack_uint_t>() /
-             smem_val_type_t<PQ_LEN>::num_packed_elements;
+             utils::size_of<typename smem_val_type_t<PQ_LEN, EnableFP8>::smem_val_pack_uint_t>() /
+             smem_val_type_t<PQ_LEN, EnableFP8>::num_packed_elements;
   }
 };
 
@@ -164,7 +172,8 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
   constexpr auto kDatasetBlockDim    = DescriptorT::kDatasetBlockDim;
   constexpr auto PQ_BITS             = DescriptorT::kPqBits;
   constexpr auto PQ_LEN              = DescriptorT::kPqLen;
-  using smem_val_config              = smem_val_type_t<PQ_LEN>;
+  constexpr auto EnableFP8           = DescriptorT::kEnableFP8;
+  using smem_val_config              = smem_val_type_t<PQ_LEN, EnableFP8>;
   using smem_val_t                   = typename smem_val_config::smem_val_t;
   using smem_val_pack_uint_t         = typename smem_val_config::smem_val_pack_uint_t;
   using smem_val_pack_t              = typename smem_val_config::smem_val_pack_t;
@@ -277,9 +286,10 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
   constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim;
   constexpr auto PQ_BITS         = DescriptorT::kPqBits;
   constexpr auto PQ_LEN          = DescriptorT::kPqLen;
+  constexpr auto EnableFP8       = DescriptorT::kEnableFP8;
   using PQ_CODEBOOK_LOAD_T       = uint32_t;
 
-  using smem_val_config                  = smem_val_type_t<PQ_LEN>;
+  using smem_val_config                  = smem_val_type_t<PQ_LEN, EnableFP8>;
   using smem_val_t                       = typename smem_val_config::smem_val_t;
   using smem_val_pack_uint_t             = typename smem_val_config::smem_val_pack_uint_t;
   using smem_val_pack_t                  = typename smem_val_config::smem_val_pack_t;

From d49959c8a72747f393b3ee7afe4691ab7f573282 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Tue, 11 Nov 2025 19:17:51 +0900
Subject: [PATCH 032/119] Fix query_val_index branch to test num_packed_elements

---
 cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index d017372924..13566627fb 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -353,7 +353,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
                                              (PQ_LEN / vq_val_pack_num_elements) * v;
 
               uint32_t query_val_index;
-              if constexpr (PQ_LEN == 2) {
+              if constexpr (num_packed_elements == 2) {
                 query_val_index =
                   vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
               } else if constexpr (PQ_LEN == num_packed_elements) {

From ed906f7cac8f8c2ba060b8daf00137e6b5055c5d Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 12 Nov 2025 11:25:38 +0900
Subject: [PATCH 033/119] Fix a bug in compute_distance_00_generate.py

---
 cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
index 7beb86c5a9..da16bac177 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
@@ -8,7 +8,8 @@
  * SPDX-FileCopyrightText: Copyright (c) 2024-{datetime.datetime.today().year}, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
-
+"""
+template += """
 /*
  * NOTE: this file is generated by compute_distance_00_generate.py
  *

From c0e9ddd2273d413bcf5f5d5bd978c1b3b4ef18ac Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 12 Nov 2025 12:00:08 +0900
Subject: [PATCH 034/119] Update VPQ instances

---
 cpp/CMakeLists.txt                            |  144 +-
 .../detail/cagra/compute_distance-ext.cuh     | 1344 +++++++++++++++--
 .../detail/cagra/compute_distance.cu          |  360 ++++-
 .../cagra/compute_distance_00_generate.py     |   19 +-
 .../cagra/compute_distance_vpq-impl.cuh       |   20 +-
 .../detail/cagra/compute_distance_vpq.hpp     |    3 +-
 ...float_uint32_dim1024_t32_8pq_8subd_half.cu |   41 -
 ...t32_dim1024_t32_8pq_8subd_half_fp8false.cu |   31 +
 ...nt32_dim1024_t32_8pq_8subd_half_fp8true.cu |   31 +
 ...d_float_uint32_dim128_t4_8pq_8subd_half.cu |   41 -
 ...int32_dim128_t4_8pq_8subd_half_fp8false.cu |   31 +
 ...uint32_dim128_t4_8pq_8subd_half_fp8true.cu |   31 +
 ...int32_dim128_t8_8pq_2subd_half_fp8false.cu |   31 +
 ...uint32_dim128_t8_8pq_2subd_half_fp8true.cu |   31 +
 ...int32_dim128_t8_8pq_4subd_half_fp8false.cu |   31 +
 ...uint32_dim128_t8_8pq_4subd_half_fp8true.cu |   31 +
 ...nt32_dim256_t16_8pq_2subd_half_fp8false.cu |   31 +
 ...int32_dim256_t16_8pq_2subd_half_fp8true.cu |   31 +
 ...nt32_dim256_t16_8pq_4subd_half_fp8false.cu |   31 +
 ...int32_dim256_t16_8pq_4subd_half_fp8true.cu |   31 +
 ...d_float_uint32_dim256_t8_8pq_8subd_half.cu |   41 -
 ...int32_dim256_t8_8pq_8subd_half_fp8false.cu |   31 +
 ...uint32_dim256_t8_8pq_8subd_half_fp8true.cu |   31 +
 ..._float_uint32_dim512_t16_8pq_8subd_half.cu |   41 -
 ...nt32_dim512_t16_8pq_8subd_half_fp8false.cu |   31 +
 ...int32_dim512_t16_8pq_8subd_half_fp8true.cu |   31 +
 ...nt32_dim512_t32_8pq_2subd_half_fp8false.cu |   31 +
 ...int32_dim512_t32_8pq_2subd_half_fp8true.cu |   31 +
 ...nt32_dim512_t32_8pq_4subd_half_fp8false.cu |   31 +
 ...int32_dim512_t32_8pq_4subd_half_fp8true.cu |   31 +
 ...ed_float_uint32_dim64_t4_8pq_2subd_half.cu |   41 -
 ...uint32_dim64_t4_8pq_2subd_half_fp8false.cu |   31 +
 ..._uint32_dim64_t4_8pq_2subd_half_fp8true.cu |   31 +
 ...ed_float_uint32_dim64_t4_8pq_4subd_half.cu |   41 -
 ...uint32_dim64_t4_8pq_4subd_half_fp8false.cu |   31 +
 ..._uint32_dim64_t4_8pq_4subd_half_fp8true.cu |   31 +
 ..._half_uint32_dim1024_t32_8pq_8subd_half.cu |   41 -
 ...t32_dim1024_t32_8pq_8subd_half_fp8false.cu |   31 +
 ...nt32_dim1024_t32_8pq_8subd_half_fp8true.cu |   31 +
 ...ed_half_uint32_dim128_t4_8pq_8subd_half.cu |   41 -
 ...int32_dim128_t4_8pq_8subd_half_fp8false.cu |   31 +
 ...int32_dim128_t4_8pq_8subd_half_fp8true.cu} |   11 +-
 ...nt32_dim128_t8_8pq_2subd_half_fp8false.cu} |    5 +-
 ...int32_dim128_t8_8pq_2subd_half_fp8true.cu} |    7 +-
 ...int32_dim128_t8_8pq_4subd_half_fp8false.cu |   31 +
 ...int32_dim128_t8_8pq_4subd_half_fp8true.cu} |    5 +-
 ...t32_dim256_t16_8pq_2subd_half_fp8false.cu} |    5 +-
 ...nt32_dim256_t16_8pq_2subd_half_fp8true.cu} |    7 +-
 ...t32_dim256_t16_8pq_4subd_half_fp8false.cu} |    5 +-
 ...nt32_dim256_t16_8pq_4subd_half_fp8true.cu} |    7 +-
 ...ed_half_uint32_dim256_t8_8pq_8subd_half.cu |   41 -
 ...int32_dim256_t8_8pq_8subd_half_fp8false.cu |   31 +
 ...uint32_dim256_t8_8pq_8subd_half_fp8true.cu |   31 +
 ...d_half_uint32_dim512_t16_8pq_8subd_half.cu |   41 -
 ...nt32_dim512_t16_8pq_8subd_half_fp8false.cu |   31 +
 ...int32_dim512_t16_8pq_8subd_half_fp8true.cu |   31 +
 ...t32_dim512_t32_8pq_2subd_half_fp8false.cu} |    5 +-
 ...nt32_dim512_t32_8pq_2subd_half_fp8true.cu} |    7 +-
 ...t32_dim512_t32_8pq_4subd_half_fp8false.cu} |    5 +-
 ...nt32_dim512_t32_8pq_4subd_half_fp8true.cu} |    7 +-
 ...ded_half_uint32_dim64_t4_8pq_2subd_half.cu |   41 -
 ...uint32_dim64_t4_8pq_2subd_half_fp8false.cu |   31 +
 ..._uint32_dim64_t4_8pq_2subd_half_fp8true.cu |   31 +
 ...ded_half_uint32_dim64_t4_8pq_4subd_half.cu |   41 -
 ...uint32_dim64_t4_8pq_4subd_half_fp8false.cu |   31 +
 ..._uint32_dim64_t4_8pq_4subd_half_fp8true.cu |   31 +
 ..._int8_uint32_dim1024_t32_8pq_8subd_half.cu |   41 -
 ...t32_dim1024_t32_8pq_8subd_half_fp8false.cu |   31 +
 ...nt32_dim1024_t32_8pq_8subd_half_fp8true.cu |   31 +
 ...ed_int8_uint32_dim128_t4_8pq_8subd_half.cu |   41 -
 ...int32_dim128_t4_8pq_8subd_half_fp8false.cu |   31 +
 ...int32_dim128_t4_8pq_8subd_half_fp8true.cu} |    9 +-
 ...int32_dim128_t8_8pq_2subd_half_fp8false.cu |   31 +
 ...int32_dim128_t8_8pq_2subd_half_fp8true.cu} |    5 +-
 ...int32_dim128_t8_8pq_4subd_half_fp8false.cu |   31 +
 ...uint32_dim128_t8_8pq_4subd_half_fp8true.cu |   31 +
 ...nt32_dim256_t16_8pq_2subd_half_fp8false.cu |   31 +
 ...nt32_dim256_t16_8pq_2subd_half_fp8true.cu} |    5 +-
 ...nt32_dim256_t16_8pq_4subd_half_fp8false.cu |   31 +
 ...nt32_dim256_t16_8pq_4subd_half_fp8true.cu} |    5 +-
 ...ed_int8_uint32_dim256_t8_8pq_8subd_half.cu |   41 -
 ...int32_dim256_t8_8pq_8subd_half_fp8false.cu |   31 +
 ...uint32_dim256_t8_8pq_8subd_half_fp8true.cu |   31 +
 ...d_int8_uint32_dim512_t16_8pq_8subd_half.cu |   41 -
 ...nt32_dim512_t16_8pq_8subd_half_fp8false.cu |   31 +
 ...int32_dim512_t16_8pq_8subd_half_fp8true.cu |   31 +
 ...nt32_dim512_t32_8pq_2subd_half_fp8false.cu |   31 +
 ...nt32_dim512_t32_8pq_2subd_half_fp8true.cu} |    5 +-
 ...nt32_dim512_t32_8pq_4subd_half_fp8false.cu |   31 +
 ...nt32_dim512_t32_8pq_4subd_half_fp8true.cu} |    5 +-
 ...ded_int8_uint32_dim64_t4_8pq_2subd_half.cu |   41 -
 ...uint32_dim64_t4_8pq_2subd_half_fp8false.cu |   31 +
 ..._uint32_dim64_t4_8pq_2subd_half_fp8true.cu |   31 +
 ...ded_int8_uint32_dim64_t4_8pq_4subd_half.cu |   41 -
 ...uint32_dim64_t4_8pq_4subd_half_fp8false.cu |   31 +
 ..._uint32_dim64_t4_8pq_4subd_half_fp8true.cu |   31 +
 ...uint8_uint32_dim1024_t32_8pq_8subd_half.cu |   41 -
 ...t32_dim1024_t32_8pq_8subd_half_fp8false.cu |   31 +
 ...nt32_dim1024_t32_8pq_8subd_half_fp8true.cu |   31 +
 ...d_uint8_uint32_dim128_t4_8pq_8subd_half.cu |   41 -
 ...int32_dim128_t4_8pq_8subd_half_fp8false.cu |   31 +
 ...int32_dim128_t4_8pq_8subd_half_fp8true.cu} |    9 +-
 ...int32_dim128_t8_8pq_2subd_half_fp8false.cu |   31 +
 ...int32_dim128_t8_8pq_2subd_half_fp8true.cu} |    5 +-
 ...int32_dim128_t8_8pq_4subd_half_fp8false.cu |   31 +
 ...uint32_dim128_t8_8pq_4subd_half_fp8true.cu |   31 +
 ...nt32_dim256_t16_8pq_2subd_half_fp8false.cu |   31 +
 ...nt32_dim256_t16_8pq_2subd_half_fp8true.cu} |    5 +-
 ...nt32_dim256_t16_8pq_4subd_half_fp8false.cu |   31 +
 ...nt32_dim256_t16_8pq_4subd_half_fp8true.cu} |    5 +-
 ...d_uint8_uint32_dim256_t8_8pq_8subd_half.cu |   41 -
 ...int32_dim256_t8_8pq_8subd_half_fp8false.cu |   31 +
 ...uint32_dim256_t8_8pq_8subd_half_fp8true.cu |   31 +
 ..._uint8_uint32_dim512_t16_8pq_8subd_half.cu |   41 -
 ...nt32_dim512_t16_8pq_8subd_half_fp8false.cu |   31 +
 ...int32_dim512_t16_8pq_8subd_half_fp8true.cu |   31 +
 ...nt32_dim512_t32_8pq_2subd_half_fp8false.cu |   31 +
 ...nt32_dim512_t32_8pq_2subd_half_fp8true.cu} |    5 +-
 ...nt32_dim512_t32_8pq_4subd_half_fp8false.cu |   31 +
 ...nt32_dim512_t32_8pq_4subd_half_fp8true.cu} |    5 +-
 ...ed_uint8_uint32_dim64_t4_8pq_2subd_half.cu |   41 -
 ...uint32_dim64_t4_8pq_2subd_half_fp8false.cu |   31 +
 ..._uint32_dim64_t4_8pq_2subd_half_fp8true.cu |   31 +
 ...ed_uint8_uint32_dim64_t4_8pq_4subd_half.cu |   41 -
 ...uint32_dim64_t4_8pq_4subd_half_fp8false.cu |   31 +
 ..._uint32_dim64_t4_8pq_4subd_half_fp8true.cu |   31 +
 126 files changed, 3949 insertions(+), 1301 deletions(-)
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8true.cu} (82%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8false.cu} (82%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8true.cu} (82%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8true.cu} (82%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8false.cu} (82%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8true.cu} (82%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8false.cu} (82%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8true.cu} (82%)
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8false.cu} (82%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8true.cu} (82%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8false.cu} (82%)
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8true.cu} (82%)
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu} (82%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu} (82%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu} (82%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu} (82%)
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu} (82%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu} (82%)
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu} (82%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu} (82%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu} (82%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu} (82%)
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu} (82%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
 rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu} (82%)
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f0bdb78987..3d6fb22558 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -253,54 +253,102 @@ if(NOT BUILD_CPU_ONLY)
     src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu
     src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu
     src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu
-    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
+    src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
     src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu
     src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu
     src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
index 32a57766da..c0fef4f0cf 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
@@ -83,7 +83,28 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            float,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           64,
+                                           8,
+                                           2,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           2,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            128,
@@ -92,7 +113,8 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            float,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            16,
                                            256,
@@ -101,7 +123,28 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            float,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           2,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           2,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            32,
                                            512,
@@ -110,7 +153,18 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            float,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           64,
+                                           8,
+                                           4,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            4,
                                            64,
@@ -119,7 +173,8 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            float,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            128,
@@ -128,7 +183,28 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            float,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           4,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           4,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            16,
                                            256,
@@ -137,7 +213,18 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            float,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           4,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            32,
                                            512,
@@ -146,7 +233,8 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            float,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            4,
                                            128,
@@ -155,7 +243,28 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            float,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           128,
+                                           8,
+                                           8,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           256,
+                                           8,
+                                           8,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            256,
@@ -164,7 +273,18 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            float,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           512,
+                                           8,
+                                           8,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            16,
                                            512,
@@ -173,7 +293,8 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            float,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            32,
                                            1024,
@@ -182,7 +303,18 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            float,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           1024,
+                                           8,
+                                           8,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float,
+                                           false>;
 extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
                                                 8,
                                                 128,
@@ -245,7 +377,28 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            half,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           64,
+                                           8,
+                                           2,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           2,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            128,
@@ -254,7 +407,8 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            half,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            16,
                                            256,
@@ -263,7 +417,28 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            half,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           2,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           2,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            32,
                                            512,
@@ -272,7 +447,18 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            half,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           64,
+                                           8,
+                                           4,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            4,
                                            64,
@@ -281,7 +467,8 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            half,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            128,
@@ -290,7 +477,28 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            half,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           4,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           4,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            16,
                                            256,
@@ -299,7 +507,18 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            half,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           4,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            32,
                                            512,
@@ -308,7 +527,8 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            half,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            4,
                                            128,
@@ -317,7 +537,28 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            half,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           128,
+                                           8,
+                                           8,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           256,
+                                           8,
+                                           8,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            256,
@@ -326,7 +567,18 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            half,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           512,
+                                           8,
+                                           8,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            16,
                                            512,
@@ -335,7 +587,8 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            half,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            32,
                                            1024,
@@ -344,7 +597,18 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            half,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           1024,
+                                           8,
+                                           8,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float,
+                                           false>;
 extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
                                                 8,
                                                 128,
@@ -407,7 +671,28 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            int8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           64,
+                                           8,
+                                           2,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           2,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            128,
@@ -416,7 +701,8 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            int8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            16,
                                            256,
@@ -425,7 +711,28 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            int8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           2,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           2,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            32,
                                            512,
@@ -434,7 +741,18 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            int8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           64,
+                                           8,
+                                           4,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            4,
                                            64,
@@ -443,7 +761,8 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            int8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            128,
@@ -452,7 +771,28 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            int8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           4,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           4,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            16,
                                            256,
@@ -461,7 +801,18 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            int8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           4,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            32,
                                            512,
@@ -470,7 +821,8 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            int8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            4,
                                            128,
@@ -479,7 +831,28 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            int8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           128,
+                                           8,
+                                           8,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           256,
+                                           8,
+                                           8,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            256,
@@ -488,7 +861,18 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            int8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           512,
+                                           8,
+                                           8,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            16,
                                            512,
@@ -497,7 +881,8 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            int8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            32,
                                            1024,
@@ -506,7 +891,18 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            int8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           1024,
+                                           8,
+                                           8,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float,
+                                           false>;
 extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
                                                 8,
                                                 128,
@@ -569,7 +965,28 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            uint8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           64,
+                                           8,
+                                           2,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           2,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            128,
@@ -578,7 +995,8 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            uint8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            16,
                                            256,
@@ -587,7 +1005,28 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            uint8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           2,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           2,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            32,
                                            512,
@@ -596,7 +1035,18 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            uint8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           64,
+                                           8,
+                                           4,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            4,
                                            64,
@@ -605,7 +1055,8 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            uint8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            128,
@@ -614,7 +1065,28 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            uint8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           4,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           4,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            16,
                                            256,
@@ -623,7 +1095,18 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            uint8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           4,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            32,
                                            512,
@@ -632,7 +1115,8 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            uint8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            4,
                                            128,
@@ -641,7 +1125,28 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            uint8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           4,
+                                           128,
+                                           8,
+                                           8,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           256,
+                                           8,
+                                           8,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            8,
                                            256,
@@ -650,7 +1155,18 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            uint8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           512,
+                                           8,
+                                           8,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float,
+                                           true>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            16,
                                            512,
@@ -659,7 +1175,8 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            uint8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           false>;
 extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            32,
                                            1024,
@@ -668,7 +1185,18 @@ extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                            half,
                                            uint8_t,
                                            uint32_t,
-                                           float>;
+                                           float,
+                                           true>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           1024,
+                                           8,
+                                           8,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float,
+                                           false>;
 extern template struct standard_descriptor_spec<DistanceType::BitwiseHamming,
                                                 8,
                                                 128,
@@ -698,18 +1226,39 @@ extern template struct instance_selector<
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      float,
+                      uint32_t,
+                      float,
+                      false>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, half, uint32_t, float>,
@@ -719,18 +1268,30 @@ extern template struct instance_selector<
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, half, uint32_t, float, false>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, int8_t, uint32_t, float>,
@@ -740,18 +1301,93 @@ extern template struct instance_selector<
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      2,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      2,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      4,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      4,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      512,
+                      8,
+                      8,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, uint8_t, uint32_t, float>,
@@ -761,18 +1397,174 @@ extern template struct instance_selector<
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, uint8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      8,
+                      128,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, uint8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      8,
+                      128,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      4,
+                      128,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      8,
+                      256,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      512,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      512,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 32, 512, uint8_t, uint32_t, float>>;
@@ -787,18 +1579,39 @@ using descriptor_instances = instance_selector<
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      float,
+                      uint32_t,
+                      float,
+                      false>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, half, uint32_t, float>,
@@ -808,18 +1621,30 @@ using descriptor_instances = instance_selector<
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, half, uint32_t, float, false>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, int8_t, uint32_t, float>,
@@ -829,18 +1654,93 @@ using descriptor_instances = instance_selector<
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      2,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      2,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      4,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      4,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      512,
+                      8,
+                      8,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, uint8_t, uint32_t, float>,
@@ -850,18 +1750,174 @@ using descriptor_instances = instance_selector<
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, uint8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      8,
+                      128,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, uint8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      8,
+                      128,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      4,
+                      128,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      8,
+                      256,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      512,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      512,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 32, 512, uint8_t, uint32_t, float>>;
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu
index fa708f2a4f..ec4eec28ac 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu
@@ -28,18 +28,39 @@ template struct instance_selector<
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      float,
+                      uint32_t,
+                      float,
+                      false>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, half, uint32_t, float>,
@@ -49,18 +70,30 @@ template struct instance_selector<
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, half, uint32_t, float, false>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, int8_t, uint32_t, float>,
@@ -70,18 +103,93 @@ template struct instance_selector<
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      2,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      2,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      4,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      4,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      512,
+                      8,
+                      8,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
   standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, uint8_t, uint32_t, float>,
@@ -91,18 +199,174 @@ template struct instance_selector<
   standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, uint8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      8,
+                      128,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, uint8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      8,
+                      128,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      4,
+                      128,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      8,
+                      256,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      512,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      512,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 8, 128, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 16, 256, uint8_t, uint32_t, float>,
   standard_descriptor_spec<DistanceType::BitwiseHamming, 32, 512, uint8_t, uint32_t, float>>;
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
index da16bac177..0365145022 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
@@ -89,15 +89,16 @@
             for code_book_t in code_book_types:
                 for pq_bit in pq_bits:
                     for metric in ['L2Expanded']:
-                        path = f"compute_distance_vpq_{metric}_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}.cu"
-                        includes = '#include "compute_distance_vpq-impl.cuh"'
-                        params = f"{metric_prefix}{metric}, {team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}"
-                        spec = f"vpq_descriptor_spec<{params}>"
-                        content = f"""template struct {spec};"""
-                        specs.append(spec)
-                        with open(path, "w") as f:
-                            f.write(template.format(includes=includes, content=content))
-                            cmake_list.append(f"  src/neighbors/detail/cagra/{path}")
+                        for enable_fp8 in ['true', 'false']:
+                            path = f"compute_distance_vpq_{metric}_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}_fp8{enable_fp8}.cu"
+                            includes = '#include "compute_distance_vpq-impl.cuh"'
+                            params = f"{metric_prefix}{metric}, {team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}, {enable_fp8}"
+                            spec = f"vpq_descriptor_spec<{params}>"
+                            content = f"""template struct {spec};"""
+                            specs.append(spec)
+                            with open(path, "w") as f:
+                                f.write(template.format(includes=includes, content=content))
+                                cmake_list.append(f"  src/neighbors/detail/cagra/{path}")
 
 # CAGRA (Binary Hamming distance)
 for (mxdim, team) in mxdim_team:
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 13566627fb..5de2478702 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -45,7 +45,7 @@ template <cuvs::distance::DistanceType Metric,
           typename DataT,
           typename IndexT,
           typename DistanceT,
-          bool EnableFP8 = true>
+          bool EnableFP8>
 struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, IndexT, DistanceT> {
   using base_type   = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
   using CODE_BOOK_T = CodebookT;
@@ -481,7 +481,8 @@ template <cuvs::distance::DistanceType Metric,
           typename CodebookT,
           typename DataT,
           typename IndexT,
-          typename DistanceT>
+          typename DistanceT,
+          bool EnableFP8>
 RAFT_KERNEL __launch_bounds__(1, 1)
   vpq_dataset_descriptor_init_kernel(dataset_descriptor_base_t<DataT, IndexT, DistanceT>* out,
                                      const std::uint8_t* encoded_dataset_ptr,
@@ -499,7 +500,8 @@ RAFT_KERNEL __launch_bounds__(1, 1)
                                                  CodebookT,
                                                  DataT,
                                                  IndexT,
-                                                 DistanceT>;
+                                                 DistanceT,
+                                                 EnableFP8>;
   using base_type = typename desc_type::base_type;
   new (out) desc_type(
     reinterpret_cast<typename base_type::setup_workspace_type*>(&setup_workspace_vpq<desc_type>),
@@ -520,7 +522,8 @@ template <cuvs::distance::DistanceType Metric,
           typename CodebookT,
           typename DataT,
           typename IndexT,
-          typename DistanceT>
+          typename DistanceT,
+          bool EnableFP8>
 dataset_descriptor_host<DataT, IndexT, DistanceT>
 vpq_descriptor_spec<Metric,
                     TeamSize,
@@ -530,7 +533,8 @@ vpq_descriptor_spec<Metric,
                     CodebookT,
                     DataT,
                     IndexT,
-                    DistanceT>::init_(const cagra::search_params& params,
+                    DistanceT,
+                    EnableFP8>::init_(const cagra::search_params& params,
                                       const std::uint8_t* encoded_dataset_ptr,
                                       uint32_t encoded_dataset_dim,
                                       const CodebookT* vq_code_book_ptr,
@@ -546,7 +550,8 @@ vpq_descriptor_spec<Metric,
                                                  CodebookT,
                                                  DataT,
                                                  IndexT,
-                                                 DistanceT>;
+                                                 DistanceT,
+                                                 EnableFP8>;
   using base_type = typename desc_type::base_type;
 
   desc_type dd_host{nullptr,
@@ -568,7 +573,8 @@ vpq_descriptor_spec<Metric,
                                                         CodebookT,
                                                         DataT,
                                                         IndexT,
-                                                        DistanceT>
+                                                        DistanceT,
+                                                        EnableFP8>
                        <<<1, 1, 0, stream>>>(dev_ptr,
                                              encoded_dataset_ptr,
                                              encoded_dataset_dim,
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
index 2b69a1cef4..ece7323907 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
@@ -21,7 +21,8 @@ template <cuvs::distance::DistanceType Metric,
           typename CodebookT,
           typename DataT,
           typename IndexT,
-          typename DistanceT>
+          typename DistanceT,
+          bool EnableFP8>
 struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
   using base_type = instance_spec<DataT, IndexT, DistanceT>;
   using typename base_type::data_type;
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu
deleted file mode 100644
index 5c458a281a..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    1024,
-                                    8,
-                                    8,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
new file mode 100644
index 0000000000..0eeba4602c
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    1024,
+                                    8,
+                                    8,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
new file mode 100644
index 0000000000..4a059f133e
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    1024,
+                                    8,
+                                    8,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu
deleted file mode 100644
index d5579a2be0..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    128,
-                                    8,
-                                    8,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
new file mode 100644
index 0000000000..314f233573
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    128,
+                                    8,
+                                    8,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
new file mode 100644
index 0000000000..369b44f743
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    128,
+                                    8,
+                                    8,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
new file mode 100644
index 0000000000..87927cd478
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    2,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
new file mode 100644
index 0000000000..33232e7a64
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    2,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
new file mode 100644
index 0000000000..f7290b2b5e
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    4,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
new file mode 100644
index 0000000000..d4b0360c01
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    4,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
new file mode 100644
index 0000000000..07b9021ad7
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    2,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
new file mode 100644
index 0000000000..92aafecd4f
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    2,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
new file mode 100644
index 0000000000..75f433ed4a
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    4,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
new file mode 100644
index 0000000000..25cdfcf44b
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    4,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu
deleted file mode 100644
index ae33cdc65a..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    256,
-                                    8,
-                                    8,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
new file mode 100644
index 0000000000..12c1166902
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    256,
+                                    8,
+                                    8,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
new file mode 100644
index 0000000000..4fd44ce5a8
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    256,
+                                    8,
+                                    8,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu
deleted file mode 100644
index dcd1d6a074..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    512,
-                                    8,
-                                    8,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
new file mode 100644
index 0000000000..c2a3b9f565
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    512,
+                                    8,
+                                    8,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
new file mode 100644
index 0000000000..29a694b72d
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    512,
+                                    8,
+                                    8,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
new file mode 100644
index 0000000000..48782764f2
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    2,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
new file mode 100644
index 0000000000..da99ab9173
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    2,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
new file mode 100644
index 0000000000..0164636430
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    4,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
new file mode 100644
index 0000000000..d6918aab34
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    4,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu
deleted file mode 100644
index 740ad40f21..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    2,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
new file mode 100644
index 0000000000..9ba5ae5005
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;  // presumably supplies DistanceType used below
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    4,  // matches "t4" in this file's name
+                                    64,  // matches "dim64"
+                                    8,  // matches "8pq"
+                                    2,  // matches "2subd"
+                                    half,  // matches the trailing "half" token
+                                    float,  // matches the "float" token after the metric
+                                    uint32_t,  // matches "uint32"
+                                    float,  // has no counterpart in the file name
+                                    false>;  // matches "fp8false"
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
new file mode 100644
index 0000000000..b9a4f4ebdf
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;  // presumably supplies DistanceType used below
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    4,  // matches "t4" in this file's name
+                                    64,  // matches "dim64"
+                                    8,  // matches "8pq"
+                                    2,  // matches "2subd"
+                                    half,  // matches the trailing "half" token
+                                    float,  // matches the "float" token after the metric
+                                    uint32_t,  // matches "uint32"
+                                    float,  // has no counterpart in the file name
+                                    true>;  // matches "fp8true"
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu
deleted file mode 100644
index 6a01a5c0d5..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    4,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
new file mode 100644
index 0000000000..65a1455dca
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;  // presumably supplies DistanceType used below
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    4,  // matches "t4" in this file's name
+                                    64,  // matches "dim64"
+                                    8,  // matches "8pq"
+                                    4,  // matches "4subd"
+                                    half,  // matches the trailing "half" token
+                                    float,  // matches the "float" token after the metric
+                                    uint32_t,  // matches "uint32"
+                                    float,  // has no counterpart in the file name
+                                    false>;  // matches "fp8false"
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
new file mode 100644
index 0000000000..fc41ff9109
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;  // presumably supplies DistanceType used below
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    4,  // matches "t4" in this file's name
+                                    64,  // matches "dim64"
+                                    8,  // matches "8pq"
+                                    4,  // matches "4subd"
+                                    half,  // matches the trailing "half" token
+                                    float,  // matches the "float" token after the metric
+                                    uint32_t,  // matches "uint32"
+                                    float,  // has no counterpart in the file name
+                                    true>;  // matches "fp8true"
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu
deleted file mode 100644
index a9766124f8..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    1024,
-                                    8,
-                                    8,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
new file mode 100644
index 0000000000..ed1f9afc26
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;  // presumably supplies DistanceType used below
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    32,  // matches "t32" in this file's name
+                                    1024,  // matches "dim1024"
+                                    8,  // matches "8pq" (position as in sibling instantiations)
+                                    8,  // matches "8subd" (position as in sibling instantiations)
+                                    half,  // matches the trailing "half" token
+                                    half,  // matches the "half" token after the metric
+                                    uint32_t,  // matches "uint32"
+                                    float,  // has no counterpart in the file name
+                                    false>;  // matches "fp8false"
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
new file mode 100644
index 0000000000..37fd1ad8c5
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;  // presumably supplies DistanceType used below
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    32,  // matches "t32" in this file's name
+                                    1024,  // matches "dim1024"
+                                    8,  // matches "8pq" (position as in sibling instantiations)
+                                    8,  // matches "8subd" (position as in sibling instantiations)
+                                    half,  // matches the trailing "half" token
+                                    half,  // matches the "half" token after the metric
+                                    uint32_t,  // matches "uint32"
+                                    float,  // has no counterpart in the file name
+                                    true>;  // matches "fp8true"
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu
deleted file mode 100644
index c5d6c72a6f..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    128,
-                                    8,
-                                    8,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
new file mode 100644
index 0000000000..0a50234576
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;  // presumably supplies DistanceType used below
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    4,  // matches "t4" in this file's name
+                                    128,  // matches "dim128"
+                                    8,  // matches "8pq" (position as in sibling instantiations)
+                                    8,  // matches "8subd" (position as in sibling instantiations)
+                                    half,  // matches the trailing "half" token
+                                    half,  // matches the "half" token after the metric
+                                    uint32_t,  // matches "uint32"
+                                    float,  // has no counterpart in the file name
+                                    false>;  // matches "fp8false"
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
index 680f594261..56b4a2f6fd 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -18,13 +18,14 @@ namespace cuvs::neighbors::cagra::detail {
 
 using namespace cuvs::distance;
 template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
+                                    4,
                                     128,
                                     8,
-                                    4,
+                                    8,
+                                    half,
                                     half,
-                                    float,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
index bbbc147de7..02c0559dd9 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -25,6 +25,7 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     half,
                                     half,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    false>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
index 9b29bb8ffc..fa6c5305d2 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -23,8 +23,9 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     8,
                                     2,
                                     half,
-                                    float,
+                                    half,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
new file mode 100644
index 0000000000..4680d19ab9
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;  // presumably supplies DistanceType used below
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    8,  // matches "t8" in this file's name
+                                    128,  // matches "dim128"
+                                    8,  // matches "8pq"
+                                    4,  // matches "4subd"
+                                    half,  // matches the trailing "half" token
+                                    half,  // matches the "half" token after the metric
+                                    uint32_t,  // matches "uint32"
+                                    float,  // has no counterpart in the file name
+                                    false>;  // matches "fp8false"
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
index b0883184c1..11e75f61b6 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -25,6 +25,7 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     half,
                                     half,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
index 6d7850cf8c..42bb7660d9 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -25,6 +25,7 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     half,
                                     half,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    false>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
index 08cd7590bc..520e36c602 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -23,8 +23,9 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     8,
                                     2,
                                     half,
-                                    float,
+                                    half,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
index 040fa4456d..023bed430d 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -25,6 +25,7 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     half,
                                     half,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    false>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
index 6610d1d87b..c40e843fa5 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -23,8 +23,9 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     8,
                                     4,
                                     half,
-                                    float,
+                                    half,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu
deleted file mode 100644
index 2114941e3e..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    256,
-                                    8,
-                                    8,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
new file mode 100644
index 0000000000..f2e07f0c5e
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    256,
+                                    8,
+                                    8,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
new file mode 100644
index 0000000000..123c117c32
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    256,
+                                    8,
+                                    8,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu
deleted file mode 100644
index 0b78982890..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    512,
-                                    8,
-                                    8,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
new file mode 100644
index 0000000000..4d94ea3c71
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    512,
+                                    8,
+                                    8,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
new file mode 100644
index 0000000000..9a55456931
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    512,
+                                    8,
+                                    8,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
index 70ae484456..8fac7c2659 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -25,6 +25,7 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     half,
                                     half,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    false>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
index e251d13331..c83b911d36 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -23,8 +23,9 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     8,
                                     2,
                                     half,
-                                    float,
+                                    half,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
index c0f889af28..2b801907b9 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -25,6 +25,7 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     half,
                                     half,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    false>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
index 0b37928ab7..c07ede51e9 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -23,8 +23,9 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     8,
                                     4,
                                     half,
-                                    float,
+                                    half,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu
deleted file mode 100644
index be699ca98a..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    2,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
new file mode 100644
index 0000000000..2652edfc8c
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    2,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
new file mode 100644
index 0000000000..95aadfc5d9
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    2,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu
deleted file mode 100644
index 36592482e1..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    4,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
new file mode 100644
index 0000000000..85f46ec0f5
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    4,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
new file mode 100644
index 0000000000..db6c599e14
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    4,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu
deleted file mode 100644
index e2d68ae772..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    1024,
-                                    8,
-                                    8,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
new file mode 100644
index 0000000000..b9b38960af
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    1024,
+                                    8,
+                                    8,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
new file mode 100644
index 0000000000..1bc6a46138
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    1024,
+                                    8,
+                                    8,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu
deleted file mode 100644
index 65cdfb0998..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    128,
-                                    8,
-                                    8,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
new file mode 100644
index 0000000000..4b856ff203
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    128,
+                                    8,
+                                    8,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
index d59e7c9078..2e84b879e8 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -18,13 +18,14 @@ namespace cuvs::neighbors::cagra::detail {
 
 using namespace cuvs::distance;
 template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
+                                    4,
                                     128,
                                     8,
-                                    4,
+                                    8,
                                     half,
                                     int8_t,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
new file mode 100644
index 0000000000..1a03321b2f
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    2,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
index f4bb7d1e31..b46995999d 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -25,6 +25,7 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     half,
                                     int8_t,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
new file mode 100644
index 0000000000..05d9febaeb
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    4,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
new file mode 100644
index 0000000000..4e3a5322d3
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    4,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
new file mode 100644
index 0000000000..99a955fcba
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    2,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
index ff0672de06..b0eb39d62b 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -25,6 +25,7 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     half,
                                     int8_t,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
new file mode 100644
index 0000000000..e8fe498589
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    4,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
index bc160382be..e24dc1ef20 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -25,6 +25,7 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     half,
                                     int8_t,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu
deleted file mode 100644
index bf24d343fc..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    256,
-                                    8,
-                                    8,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
new file mode 100644
index 0000000000..8c40f8482e
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    256,
+                                    8,
+                                    8,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
new file mode 100644
index 0000000000..b857508f52
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    256,
+                                    8,
+                                    8,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu
deleted file mode 100644
index 6cfa5ede30..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    512,
-                                    8,
-                                    8,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
new file mode 100644
index 0000000000..cc3e33adc1
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    512,
+                                    8,
+                                    8,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
new file mode 100644
index 0000000000..de0860b278
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    512,
+                                    8,
+                                    8,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
new file mode 100644
index 0000000000..c07ce1ff7a
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    2,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
index 894a1eae7b..ff7158f5d2 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -25,6 +25,7 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     half,
                                     int8_t,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
new file mode 100644
index 0000000000..ea1a6e975b
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    4,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
index 4aa48daee0..c21c9c5c10 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -25,6 +25,7 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     half,
                                     int8_t,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu
deleted file mode 100644
index aedfb0ef44..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    2,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
new file mode 100644
index 0000000000..b707ad056b
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    2,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
new file mode 100644
index 0000000000..9c273805d9
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    2,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu
deleted file mode 100644
index 42a56de4a5..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    4,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
new file mode 100644
index 0000000000..3fa2ef8170
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    4,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
new file mode 100644
index 0000000000..c1d4456e2d
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    4,
+                                    half,
+                                    int8_t,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu
deleted file mode 100644
index 6217a0047c..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    1024,
-                                    8,
-                                    8,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
new file mode 100644
index 0000000000..1e09109eb8
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    1024,
+                                    8,
+                                    8,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
new file mode 100644
index 0000000000..9ea862c9bc
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    1024,
+                                    8,
+                                    8,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu
deleted file mode 100644
index 6d06771052..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    128,
-                                    8,
-                                    8,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
new file mode 100644
index 0000000000..bfcc48f462
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    128,
+                                    8,
+                                    8,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
index b0e824d788..238572cf5f 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -18,13 +18,14 @@ namespace cuvs::neighbors::cagra::detail {
 
 using namespace cuvs::distance;
 template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
+                                    4,
                                     128,
                                     8,
-                                    4,
+                                    8,
                                     half,
                                     uint8_t,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
new file mode 100644
index 0000000000..58698a9760
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    2,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
index fc9f5043ac..8388bae580 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -25,6 +25,7 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     half,
                                     uint8_t,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
new file mode 100644
index 0000000000..584a58fcf1
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    4,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
new file mode 100644
index 0000000000..2f8b58b9e1
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    4,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
new file mode 100644
index 0000000000..b735134e70
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    2,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
index b7755c2d17..71d93ebe04 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -25,6 +25,7 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     half,
                                     uint8_t,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
new file mode 100644
index 0000000000..ba28f84414
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    4,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
index 5457ea76e7..70653e69e2 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -25,6 +25,7 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     half,
                                     uint8_t,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu
deleted file mode 100644
index 4da8b5c0d7..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    256,
-                                    8,
-                                    8,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
new file mode 100644
index 0000000000..81a29015de
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    256,
+                                    8,
+                                    8,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
new file mode 100644
index 0000000000..6254aae41a
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    256,
+                                    8,
+                                    8,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu
deleted file mode 100644
index c1f63841b4..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    512,
-                                    8,
-                                    8,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
new file mode 100644
index 0000000000..2223290eff
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    512,
+                                    8,
+                                    8,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
new file mode 100644
index 0000000000..f3f7c0ae07
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    512,
+                                    8,
+                                    8,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
new file mode 100644
index 0000000000..cc487728cd
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    2,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
index 4225ea81a3..0da175b065 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -25,6 +25,7 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     half,
                                     uint8_t,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
new file mode 100644
index 0000000000..690b8a90f7
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    4,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
similarity index 82%
rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
index dfcecd31b3..d3c5e032f8 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -25,6 +25,7 @@ template struct vpq_descriptor_spec<DistanceType::L2Expanded,
                                     half,
                                     uint8_t,
                                     uint32_t,
-                                    float>;
+                                    float,
+                                    true>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu
deleted file mode 100644
index 3d458c0c94..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    2,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
new file mode 100644
index 0000000000..b5ae8f18d8
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    2,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
new file mode 100644
index 0000000000..97f100c53f
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    2,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu
deleted file mode 100644
index 6f59b47bbe..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    4,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
new file mode 100644
index 0000000000..f17eae07db
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    4,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    false>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
new file mode 100644
index 0000000000..b94a11d287
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    4,
+                                    64,
+                                    8,
+                                    4,
+                                    half,
+                                    uint8_t,
+                                    uint32_t,
+                                    float,
+                                    true>;
+
+}  // namespace cuvs::neighbors::cagra::detail

From 35259e3fe5476abb7ab244b08f310108dadb71b3 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 12 Nov 2025 13:09:01 +0900
Subject: [PATCH 035/119] Add `smem_dtype` option

---
 cpp/include/cuvs/neighbors/cagra.hpp                    | 6 ++++++
 cpp/src/neighbors/detail/cagra/cagra_search.cuh         | 4 ++++
 cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp | 6 ++++++
 3 files changed, 16 insertions(+)

diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index 6192b263c3..5458bbccc7 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -194,6 +194,8 @@ enum class search_algo {
 
 enum class hash_mode { HASH = 0, SMALL = 1, AUTO = 100 };
 
+enum class internal_dtype { F16 = 0, E5M2 = 1, AUTO = 100 };
+
 struct search_params : cuvs::neighbors::search_params {
   /** Maximum number of queries to search at the same time (batch size). Auto select when 0.*/
   size_t max_queries = 0;
@@ -267,6 +269,10 @@ struct search_params : cuvs::neighbors::search_params {
    * negative, in which case the filtering rate is automatically calculated.
    */
   float filtering_rate = -1.0;
+
+  /** Data type of the query vector and codebook table in shared memory. Currently, only VPQ
+   * supports FP8 (E5M2). */
+  internal_dtype smem_dtype = internal_dtype::AUTO;
 };
 
 /**
diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
index 26e0aafd2d..1213736a21 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
@@ -148,6 +148,10 @@ void search_main(raft::resources const& res,
   // Dispatch search parameters based on the dataset kind.
   if (auto* strided_dset = dynamic_cast<const strided_dataset<T, ds_idx_type>*>(&index.data());
       strided_dset != nullptr) {
+    if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::AUTO &&
+        params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::F16) {
+      RAFT_LOG_WARN("In this search mode, only AUTO or F16 are supported as the smem_dtype.");
+    }
     // Search using a plain (strided) row-major dataset
     RAFT_EXPECTS(index.metric() != cuvs::distance::DistanceType::CosineExpanded ||
                    index.dataset_norms().has_value(),
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
index ece7323907..0f55b3efb2 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
@@ -64,12 +64,18 @@ struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
                        const DatasetT& dataset,
                        cuvs::distance::DistanceType metric) -> double
   {
+    const auto fp8_natively_supported = raft::getComputeCapability().first >= 9;
+    const auto use_fp8 =
+      params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::E5M2 ||
+      (params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::AUTO && fp8_natively_supported);
+
     // If explicit team_size is specified and doesn't match the instance, discard it
     if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; }
     if (cuvs::distance::DistanceType::L2Expanded != metric) { return -1.0; }
     // Match codebook params
     if (dataset.pq_bits() != PqBits) { return -1.0; }
     if (dataset.pq_len() != PqLen) { return -1.0; }
+    if (use_fp8 != EnableFP8) { return -1.0; }
     // Otherwise, favor the closest dataset dimensionality.
     constexpr std::uint32_t preferred_load_elmes_per_thread =
       16; /*magic number that is good based on experiments.*/

From 7639d0205f8b20726e381be87735dbcdbcd8a10d Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 12 Nov 2025 13:29:37 +0900
Subject: [PATCH 036/119] Remove unnecessary include

---
 cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
index 61b9e595fc..7f38342461 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -21,7 +21,6 @@
 
 #include <cub/cub.cuh>
 #include <cuda_fp16.h>
-#include <cuda_fp8.h>
 
 namespace cuvs::neighbors::ivf_pq::detail {
 
@@ -100,4 +99,5 @@ struct fp_8bit {
     return r;
   }
 };
+
 }  // namespace cuvs::neighbors::ivf_pq::detail

From 710232ae9124bab5195118877d7e92ab203171e9 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 12 Nov 2025 15:30:41 +0900
Subject: [PATCH 037/119] Remove unnecessary files

---
 ..._L2Expanded_dim128_t8_uint32_t_uint64_t.cu | 41 -------------------
 ...L2Expanded_dim256_t16_uint32_t_uint64_t.cu | 41 -------------------
 ...L2Expanded_dim512_t32_uint32_t_uint64_t.cu | 41 -------------------
 ...q_L2Expanded_dim64_t4_uint32_t_uint64_t.cu | 41 -------------------
 4 files changed, 164 deletions(-)
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu
deleted file mode 100644
index 10ddfc0163..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vrabitq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                                        8,
-                                        128,
-                                        float,
-                                        uint32_t,
-                                        uint32_t,
-                                        uint64_t,
-                                        float,
-                                        float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu
deleted file mode 100644
index e057457a6a..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vrabitq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                                        16,
-                                        256,
-                                        float,
-                                        uint32_t,
-                                        uint32_t,
-                                        uint64_t,
-                                        float,
-                                        float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu
deleted file mode 100644
index c30bd76785..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vrabitq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                                        32,
-                                        512,
-                                        float,
-                                        uint32_t,
-                                        uint32_t,
-                                        uint64_t,
-                                        float,
-                                        float>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu
deleted file mode 100644
index 472dd9821f..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vrabitq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vrabitq_descriptor_spec<DistanceType::L2Expanded,
-                                        4,
-                                        64,
-                                        float,
-                                        uint32_t,
-                                        uint32_t,
-                                        uint64_t,
-                                        float,
-                                        float>;
-
-}  // namespace cuvs::neighbors::cagra::detail

From d183be67dbe8d07dc6ed8657701e6b1cb4d63ac0 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Thu, 13 Nov 2025 01:16:28 +0900
Subject: [PATCH 038/119] Remove unnecessary file

---
 cpp/tests/neighbors/vpq_utils.cuh | 77 -------------------------------
 1 file changed, 77 deletions(-)
 delete mode 100644 cpp/tests/neighbors/vpq_utils.cuh

diff --git a/cpp/tests/neighbors/vpq_utils.cuh b/cpp/tests/neighbors/vpq_utils.cuh
deleted file mode 100644
index 383e5ef063..0000000000
--- a/cpp/tests/neighbors/vpq_utils.cuh
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <cuvs/neighbors/common.hpp>
-
-namespace cuvs::neighbors {
-template <class data_t, class math_t>
-__global__ void decode_vpq_dataset_kernel(data_t* const decoded_dataset_ptr,
-                                          const uint32_t ldd,
-                                          const math_t* const vq_codebook_ptr,
-                                          const uint32_t ldv,
-                                          const math_t* const pq_codebook_ptr,
-                                          const uint32_t pq_subspace_dim,
-                                          const uint32_t pq_table_size,
-                                          const uint32_t dataset_dim,
-                                          const size_t dataset_size,
-                                          const uint8_t* const data_ptr,
-                                          const uint32_t ldi)
-{
-  constexpr uint32_t warp_size = 32;
-  const size_t batch_id        = (blockIdx.x * blockDim.x + threadIdx.x) / warp_size;
-  if (batch_id >= dataset_size) { return; }
-
-  const auto local_data_ptr = data_ptr + ldi * batch_id;
-  const auto vq_code        = *reinterpret_cast<const uint32_t*>(local_data_ptr);
-  const auto pq_code_ptr    = local_data_ptr + sizeof(uint32_t);
-  const auto vq_vec_ptr     = vq_codebook_ptr + vq_code * ldv;
-  auto local_dst_ptr        = decoded_dataset_ptr + batch_id * ldd;
-
-  const auto lane_id = threadIdx.x % warp_size;
-  for (uint32_t i = lane_id; i < dataset_dim; i += warp_size) {
-    const auto pq_code = pq_code_ptr[i / pq_subspace_dim];
-    const auto pq_v    = pq_codebook_ptr[pq_code * pq_subspace_dim + (i % pq_subspace_dim)];
-
-    local_dst_ptr[i] = static_cast<data_t>(vq_vec_ptr[i]) + static_cast<data_t>(pq_v);
-  }
-}
-
-template <class data_t, class math_t>
-void decode_vpq_dataset(raft::device_matrix_view<data_t, int64_t> decoded_dataset,
-                        const cuvs::neighbors::vpq_dataset<math_t, int64_t>& vpq_dataset,
-                        cudaStream_t cuda_stream)
-{
-  const auto dataset_size = decoded_dataset.extent(0);
-  RAFT_EXPECTS(vpq_dataset.data.extent(0) == dataset_size, "Dataset sizes mismatch");
-
-  constexpr uint32_t block_size  = 256;
-  constexpr uint32_t warp_size   = 32;
-  constexpr int64_t vecs_per_cta = block_size / warp_size;
-  const auto grid_size = raft::div_rounding_up_safe(decoded_dataset.extent(0), vecs_per_cta);
-
-  decode_vpq_dataset_kernel<data_t, math_t>
-    <<<grid_size, block_size, 0, cuda_stream>>>(decoded_dataset.data_handle(),
-                                                decoded_dataset.stride(0),
-                                                vpq_dataset.vq_code_book.data_handle(),
-                                                vpq_dataset.vq_code_book.stride(0),
-                                                vpq_dataset.pq_code_book.data_handle(),
-                                                vpq_dataset.pq_len(),
-                                                1u << vpq_dataset.pq_bits(),
-                                                vpq_dataset.dim(),
-                                                dataset_size,
-                                                vpq_dataset.data.data_handle(),
-                                                vpq_dataset.data.stride(0));
-}
-}  // namespace cuvs::neighbors

From e7d3d42c5de6a512f48163cffdbf5dfb146b10f7 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Thu, 13 Nov 2025 01:36:12 +0900
Subject: [PATCH 039/119] Revert "Remove unnecessary file"

This reverts commit d183be67dbe8d07dc6ed8657701e6b1cb4d63ac0.
---
 cpp/tests/neighbors/vpq_utils.cuh | 77 +++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 cpp/tests/neighbors/vpq_utils.cuh

diff --git a/cpp/tests/neighbors/vpq_utils.cuh b/cpp/tests/neighbors/vpq_utils.cuh
new file mode 100644
index 0000000000..383e5ef063
--- /dev/null
+++ b/cpp/tests/neighbors/vpq_utils.cuh
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cuvs/neighbors/common.hpp>
+
+namespace cuvs::neighbors {
+template <class data_t, class math_t>
+__global__ void decode_vpq_dataset_kernel(data_t* const decoded_dataset_ptr,
+                                          const uint32_t ldd,
+                                          const math_t* const vq_codebook_ptr,
+                                          const uint32_t ldv,
+                                          const math_t* const pq_codebook_ptr,
+                                          const uint32_t pq_subspace_dim,
+                                          const uint32_t pq_table_size,
+                                          const uint32_t dataset_dim,
+                                          const size_t dataset_size,
+                                          const uint8_t* const data_ptr,
+                                          const uint32_t ldi)
+{
+  constexpr uint32_t warp_size = 32;
+  const size_t batch_id        = (blockIdx.x * blockDim.x + threadIdx.x) / warp_size;
+  if (batch_id >= dataset_size) { return; }
+
+  const auto local_data_ptr = data_ptr + ldi * batch_id;
+  const auto vq_code        = *reinterpret_cast<const uint32_t*>(local_data_ptr);
+  const auto pq_code_ptr    = local_data_ptr + sizeof(uint32_t);
+  const auto vq_vec_ptr     = vq_codebook_ptr + vq_code * ldv;
+  auto local_dst_ptr        = decoded_dataset_ptr + batch_id * ldd;
+
+  const auto lane_id = threadIdx.x % warp_size;
+  for (uint32_t i = lane_id; i < dataset_dim; i += warp_size) {
+    const auto pq_code = pq_code_ptr[i / pq_subspace_dim];
+    const auto pq_v    = pq_codebook_ptr[pq_code * pq_subspace_dim + (i % pq_subspace_dim)];
+
+    local_dst_ptr[i] = static_cast<data_t>(vq_vec_ptr[i]) + static_cast<data_t>(pq_v);
+  }
+}
+
+template <class data_t, class math_t>
+void decode_vpq_dataset(raft::device_matrix_view<data_t, int64_t> decoded_dataset,
+                        const cuvs::neighbors::vpq_dataset<math_t, int64_t>& vpq_dataset,
+                        cudaStream_t cuda_stream)
+{
+  const auto dataset_size = decoded_dataset.extent(0);
+  RAFT_EXPECTS(vpq_dataset.data.extent(0) == dataset_size, "Dataset sizes mismatch");
+
+  constexpr uint32_t block_size  = 256;
+  constexpr uint32_t warp_size   = 32;
+  constexpr int64_t vecs_per_cta = block_size / warp_size;
+  const auto grid_size = raft::div_rounding_up_safe(decoded_dataset.extent(0), vecs_per_cta);
+
+  decode_vpq_dataset_kernel<data_t, math_t>
+    <<<grid_size, block_size, 0, cuda_stream>>>(decoded_dataset.data_handle(),
+                                                decoded_dataset.stride(0),
+                                                vpq_dataset.vq_code_book.data_handle(),
+                                                vpq_dataset.vq_code_book.stride(0),
+                                                vpq_dataset.pq_code_book.data_handle(),
+                                                vpq_dataset.pq_len(),
+                                                1u << vpq_dataset.pq_bits(),
+                                                vpq_dataset.dim(),
+                                                dataset_size,
+                                                vpq_dataset.data.data_handle(),
+                                                vpq_dataset.data.stride(0));
+}
+}  // namespace cuvs::neighbors

From 24089bc21fc215ca3e413001cd516d90eee903ae Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Thu, 13 Nov 2025 01:37:01 +0900
Subject: [PATCH 040/119] Fix Copyright

---
 cpp/tests/neighbors/vpq_utils.cuh | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/cpp/tests/neighbors/vpq_utils.cuh b/cpp/tests/neighbors/vpq_utils.cuh
index 383e5ef063..8ceb371413 100644
--- a/cpp/tests/neighbors/vpq_utils.cuh
+++ b/cpp/tests/neighbors/vpq_utils.cuh
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
  */
 #include <cuvs/neighbors/common.hpp>
 

From 13725b301aaab5c85b8a0e1b7f9d84e2ea2e70c3 Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Fri, 6 Feb 2026 05:28:57 -0800
Subject: [PATCH 041/119] add a new max_node_id parameter to the CAGRA search
 API, allowing users to constrain random seed node selection to a subset of
 the dataset. This is useful when the graph is smaller than the dataset, such
 as during iterative build with compression.

---
 cpp/include/cuvs/neighbors/cagra.hpp          |  7 +++++++
 .../neighbors/detail/cagra/device_common.hpp  |  6 ++++--
 .../cagra/search_multi_cta_kernel-inl.cuh     |  9 ++++++---
 .../cagra/search_single_cta_kernel-inl.cuh    | 20 +++++++++++++------
 4 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index 9b9e8eb0e6..ef55507869 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -247,6 +247,13 @@ struct search_params : cuvs::neighbors::search_params {
   /** Bit mask used for initial random seed node selection. */
   uint64_t rand_xor_mask = 0x128394;
 
+  /** 
+   * Maximum node ID for random seed selection. 
+   * When > 0, random seeds are constrained to [0, max_node_id) instead of [0, dataset_size).
+   * This is useful when the graph is smaller than the dataset (e.g., iterative build with compression).
+   * Default 0 means no constraint (use dataset_size).
+   */
+  uint32_t max_node_id = 0;
   /** Whether to use the persistent version of the kernel (only SINGLE_CTA is supported a.t.m.) */
   bool persistent = false;
   /** Persistent kernel: time in seconds before the kernel stops if no requests received. */
diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp
index 8a5bb6ba1f..df22b28081 100644
--- a/cpp/src/neighbors/detail/cagra/device_common.hpp
+++ b/cpp/src/neighbors/detail/cagra/device_common.hpp
@@ -102,11 +102,13 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes(
   IndexT* __restrict__ traversed_hash_ptr,
   const uint32_t traversed_hash_bitlen,
   const uint32_t block_id   = 0,
-  const uint32_t num_blocks = 1)
+  const uint32_t num_blocks = 1,
+  const IndexT max_node_id = 0)
 {
   const auto team_size_bits = dataset_desc.team_size_bitshift_from_smem();
   const auto max_i = raft::round_up_safe<uint32_t>(num_pickup, warp_size >> team_size_bits);
   const auto compute_distance = dataset_desc.compute_distance_impl;
+  const IndexT seed_index_limit = max_node_id > 0 ? max_node_id : dataset_desc.size;
 
   for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += (blockDim.x >> team_size_bits)) {
     const bool valid_i = (i < num_pickup);
@@ -122,7 +124,7 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes(
         if (seed_ptr && (gid < num_seeds)) {
           seed_index = seed_ptr[gid];
         } else {
-          seed_index = device::xorshift64(gid ^ rand_xor_mask) % dataset_desc.size;
+          seed_index = device::xorshift64(gid ^ rand_xor_mask) % seed_index_limit;
         }
       }
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
index c8b885dffe..916767c00b 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
@@ -193,7 +193,8 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
   const uint32_t min_iteration,
   const uint32_t max_iteration,
   uint32_t* const num_executed_iterations, /* stats */
-  SAMPLE_FILTER_T sample_filter)
+  SAMPLE_FILTER_T sample_filter,
+  const typename DATASET_DESCRIPTOR_T::INDEX_T max_node_id = 0)
 {
   using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
   using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
@@ -281,7 +282,8 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
                                            local_traversed_hashmap_ptr,
                                            traversed_hash_bitlen,
                                            block_id,
-                                           num_blocks);
+                                           num_blocks,
+                                           max_node_id);
   __syncthreads();
   _CLK_REC(clk_compute_1st_distance);
 
@@ -627,7 +629,8 @@ void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dat
                                                        ps.min_iterations,
                                                        ps.max_iterations,
                                                        num_executed_iterations,
-                                                       sample_filter);
+                                                       sample_filter,
+                                                       ps.max_node_id);
 }
 
 }  // namespace multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
index 404817e582..956f9ac3e4 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
@@ -702,7 +702,8 @@ RAFT_DEVICE_INLINE_FUNCTION void search_core(
   const std::uint32_t small_hash_bitlen,
   const std::uint32_t small_hash_reset_interval,
   const std::uint32_t query_id,
-  SAMPLE_FILTER_T sample_filter)
+  SAMPLE_FILTER_T sample_filter,
+  const typename DATASET_DESCRIPTOR_T::INDEX_T max_node_id = 0)
 {
   using LOAD_T = device::LOAD_128BIT_T;
 
@@ -791,7 +792,10 @@ RAFT_DEVICE_INLINE_FUNCTION void search_core(
                                            local_visited_hashmap_ptr,
                                            hash_bitlen,
                                            (INDEX_T*)nullptr,
-                                           0);
+                                           0,
+                                           0,
+                                           1,
+                                           max_node_id);
   __syncthreads();
   _CLK_REC(clk_compute_1st_distance);
 
@@ -1124,7 +1128,8 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
   const std::uint32_t hash_bitlen,
   const std::uint32_t small_hash_bitlen,
   const std::uint32_t small_hash_reset_interval,
-  SAMPLE_FILTER_T sample_filter)
+  SAMPLE_FILTER_T sample_filter,
+  const typename DATASET_DESCRIPTOR_T::INDEX_T max_node_id = 0)
 {
   const auto query_id = blockIdx.y;
   search_core<TOPK_BY_BITONIC_SORT,
@@ -1155,7 +1160,8 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
                                small_hash_bitlen,
                                small_hash_reset_interval,
                                query_id,
-                               sample_filter);
+                               sample_filter,
+                               max_node_id);
 }
 
 // To make sure we avoid false sharing on both CPU and GPU, we enforce cache line size to the
@@ -1317,7 +1323,8 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel_p(
                                  small_hash_bitlen,
                                  small_hash_reset_interval,
                                  query_id,
-                                 sample_filter);
+                                 sample_filter,
+                                 0);  // TODO: persistent kernel doesn't support max_node_id yet
 
     // make sure all writes are visible even for the host
     //     (e.g. when result buffers are in pinned memory)
@@ -2341,7 +2348,8 @@ control is returned in this thread (in persistent_runner_t constructor), so we'r
                                                            hash_bitlen,
                                                            small_hash_bitlen,
                                                            small_hash_reset_interval,
-                                                           sample_filter);
+                                                           sample_filter,
+                                                           ps.max_node_id);
     RAFT_CUDA_TRY(cudaPeekAtLastError());
   }
 }

From 0746446836771c80ed535f7d0a566ba1f8c18ac6 Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Fri, 6 Feb 2026 06:51:31 -0800
Subject: [PATCH 042/119] Changed the max node id parameter name to graph_size
 for clarity; removed max_node_id from the search parameters structure

---
 cpp/include/cuvs/neighbors/cagra.hpp                |  7 -------
 cpp/src/neighbors/detail/cagra/device_common.hpp    |  4 ++--
 .../detail/cagra/search_multi_cta_kernel-inl.cuh    |  6 +++---
 .../detail/cagra/search_single_cta_kernel-inl.cuh   | 13 ++++++-------
 4 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index ef55507869..9b9e8eb0e6 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -247,13 +247,6 @@ struct search_params : cuvs::neighbors::search_params {
   /** Bit mask used for initial random seed node selection. */
   uint64_t rand_xor_mask = 0x128394;
 
-  /** 
-   * Maximum node ID for random seed selection. 
-   * When > 0, random seeds are constrained to [0, max_node_id) instead of [0, dataset_size).
-   * This is useful when the graph is smaller than the dataset (e.g., iterative build with compression).
-   * Default 0 means no constraint (use dataset_size).
-   */
-  uint32_t max_node_id = 0;
   /** Whether to use the persistent version of the kernel (only SINGLE_CTA is supported a.t.m.) */
   bool persistent = false;
   /** Persistent kernel: time in seconds before the kernel stops if no requests received. */
diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp
index df22b28081..8cbbe1d366 100644
--- a/cpp/src/neighbors/detail/cagra/device_common.hpp
+++ b/cpp/src/neighbors/detail/cagra/device_common.hpp
@@ -103,12 +103,12 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes(
   const uint32_t traversed_hash_bitlen,
   const uint32_t block_id   = 0,
   const uint32_t num_blocks = 1,
-  const IndexT max_node_id = 0)
+  const IndexT graph_size = 0)
 {
   const auto team_size_bits = dataset_desc.team_size_bitshift_from_smem();
   const auto max_i = raft::round_up_safe<uint32_t>(num_pickup, warp_size >> team_size_bits);
   const auto compute_distance = dataset_desc.compute_distance_impl;
-  const IndexT seed_index_limit = max_node_id > 0 ? max_node_id : dataset_desc.size;
+  const IndexT seed_index_limit = graph_size > 0 ? graph_size : dataset_desc.size;
 
   for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += (blockDim.x >> team_size_bits)) {
     const bool valid_i = (i < num_pickup);
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
index 916767c00b..a2e9a43ff0 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
@@ -194,7 +194,7 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
   const uint32_t max_iteration,
   uint32_t* const num_executed_iterations, /* stats */
   SAMPLE_FILTER_T sample_filter,
-  const typename DATASET_DESCRIPTOR_T::INDEX_T max_node_id = 0)
+  const typename DATASET_DESCRIPTOR_T::INDEX_T graph_size = 0)
 {
   using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
   using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
@@ -283,7 +283,7 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
                                            traversed_hash_bitlen,
                                            block_id,
                                            num_blocks,
-                                           max_node_id);
+                                           graph_size);
   __syncthreads();
   _CLK_REC(clk_compute_1st_distance);
 
@@ -630,7 +630,7 @@ void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dat
                                                        ps.max_iterations,
                                                        num_executed_iterations,
                                                        sample_filter,
-                                                       ps.max_node_id);
+                                                       static_cast<uint32_t>(graph.extent(0)));
 }
 
 }  // namespace multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
index 956f9ac3e4..5d465c25b5 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
@@ -703,7 +703,7 @@ RAFT_DEVICE_INLINE_FUNCTION void search_core(
   const std::uint32_t small_hash_reset_interval,
   const std::uint32_t query_id,
   SAMPLE_FILTER_T sample_filter,
-  const typename DATASET_DESCRIPTOR_T::INDEX_T max_node_id = 0)
+  const typename DATASET_DESCRIPTOR_T::INDEX_T graph_size = 0)
 {
   using LOAD_T = device::LOAD_128BIT_T;
 
@@ -795,7 +795,7 @@ RAFT_DEVICE_INLINE_FUNCTION void search_core(
                                            0,
                                            0,
                                            1,
-                                           max_node_id);
+                                           graph_size);
   __syncthreads();
   _CLK_REC(clk_compute_1st_distance);
 
@@ -1129,7 +1129,7 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
   const std::uint32_t small_hash_bitlen,
   const std::uint32_t small_hash_reset_interval,
   SAMPLE_FILTER_T sample_filter,
-  const typename DATASET_DESCRIPTOR_T::INDEX_T max_node_id = 0)
+  const typename DATASET_DESCRIPTOR_T::INDEX_T graph_size = 0)
 {
   const auto query_id = blockIdx.y;
   search_core<TOPK_BY_BITONIC_SORT,
@@ -1161,7 +1161,7 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
                                small_hash_reset_interval,
                                query_id,
                                sample_filter,
-                               max_node_id);
+                               graph_size);
 }
 
 // To make sure we avoid false sharing on both CPU and GPU, we enforce cache line size to the
@@ -1323,8 +1323,7 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel_p(
                                  small_hash_bitlen,
                                  small_hash_reset_interval,
                                  query_id,
-                                 sample_filter,
-                                 0);  // TODO: persistent kernel doesn't support max_node_id yet
+                                 sample_filter);
 
     // make sure all writes are visible even for the host
     //     (e.g. when result buffers are in pinned memory)
@@ -2349,7 +2348,7 @@ control is returned in this thread (in persistent_runner_t constructor), so we'r
                                                            small_hash_bitlen,
                                                            small_hash_reset_interval,
                                                            sample_filter,
-                                                           ps.max_node_id);
+                                                           static_cast<uint32_t>(graph.extent(0)));
     RAFT_CUDA_TRY(cudaPeekAtLastError());
   }
 }

From 70a69d922ec757a7093cecde07eb5a87f3c44de2 Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Fri, 6 Feb 2026 08:55:07 -0800
Subject: [PATCH 043/119] Add graph-smaller-than-dataset test; pass graph_size to random_pickup

---
 .../detail/cagra/search_multi_kernel.cuh      |  16 +-
 cpp/tests/CMakeLists.txt                      |   1 +
 .../bug_graph_smaller_than_dataset.cu         | 159 ++++++++++++++++++
 3 files changed, 171 insertions(+), 5 deletions(-)
 create mode 100644 cpp/tests/neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu

diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
index f7d353d864..0ee7439ac6 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -104,7 +104,8 @@ RAFT_KERNEL random_pickup_kernel(
   typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr,  // [num_queries, ldr]
   const std::uint32_t ldr,                                                // (*) ldr >= num_pickup
   typename DATASET_DESCRIPTOR_T::INDEX_T* const visited_hashmap_ptr,  // [num_queries, 1 << bitlen]
-  const std::uint32_t hash_bitlen)
+  const std::uint32_t hash_bitlen,
+  const typename DATASET_DESCRIPTOR_T::INDEX_T graph_size = 0)
 {
   using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
   using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
@@ -119,6 +120,8 @@ RAFT_KERNEL random_pickup_kernel(
   dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id);
   __syncthreads();
 
+  const INDEX_T seed_index_limit = graph_size > 0 ? graph_size : dataset_desc->size;
+
   INDEX_T best_index_team_local;
   DISTANCE_T best_norm2_team_local = utils::get_max_value<DISTANCE_T>();
   for (unsigned i = 0; i < num_distilation; i++) {
@@ -128,7 +131,7 @@ RAFT_KERNEL random_pickup_kernel(
     } else {
       // Chose a seed node randomly
       seed_index =
-        device::xorshift64((global_team_index ^ rand_xor_mask) * (i + 1)) % dataset_desc->size;
+        device::xorshift64((global_team_index ^ rand_xor_mask) * (i + 1)) % seed_index_limit;
     }
 
     DISTANCE_T norm2 = dataset_desc->compute_distance(seed_index, true);
@@ -166,7 +169,8 @@ void random_pickup(const dataset_descriptor_host<DataT, IndexT, DistanceT>& data
                    std::size_t ldr,                  // (*) ldr >= num_pickup
                    IndexT* visited_hashmap_ptr,      // [num_queries, 1 << bitlen]
                    std::uint32_t hash_bitlen,
-                   cudaStream_t cuda_stream)
+                   cudaStream_t cuda_stream,
+                   IndexT graph_size = 0)
 {
   const auto block_size                = 256u;
   const auto num_teams_per_threadblock = block_size / dataset_desc.team_size;
@@ -185,7 +189,8 @@ void random_pickup(const dataset_descriptor_host<DataT, IndexT, DistanceT>& data
     result_distances_ptr,
     ldr,
     visited_hashmap_ptr,
-    hash_bitlen);
+    hash_bitlen,
+    graph_size);
 }
 
 template <class INDEX_T>
@@ -826,7 +831,8 @@ struct search
                                             result_buffer_allocation_size,
                                             hashmap.data(),
                                             hash_bitlen,
-                                            stream);
+                                            stream,
+                                            static_cast<IndexT>(this->dataset_size));
 
     unsigned iter = 0;
     while (1) {
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 9fc620b4cb..3643b2e12d 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -156,6 +156,7 @@ ConfigureTest(
 ConfigureTest(
   NAME NEIGHBORS_ANN_CAGRA_TEST_BUGS
   PATH neighbors/ann_cagra/bug_extreme_inputs_oob.cu neighbors/ann_cagra/bug_multi_cta_crash.cu
+       neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu
   GPUS 1
   PERCENT 100
 )
diff --git a/cpp/tests/neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu b/cpp/tests/neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu
new file mode 100644
index 0000000000..8a29779bbd
--- /dev/null
+++ b/cpp/tests/neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu
@@ -0,0 +1,159 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <gtest/gtest.h>
+
+#include <cuvs/neighbors/cagra.hpp>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/random/rng.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <cstdint>
+
+namespace cuvs::neighbors::cagra {
+
+/**
+ * @brief Test verifying graph.extent(0) is used for random seed selection
+ * 
+ * This test ensures that CAGRA search kernels correctly use graph.extent(0)
+ * (graph size) rather than dataset.size for random seed node selection.
+ * 
+ * The bug: random seed selection previously used dataset_desc.size, which
+ * could cause OOB access if the graph size differed from dataset size
+ * (e.g., in CAGRA-Q iterative builds with compression).
+ * 
+ * The fix: kernels now receive graph.extent(0) as graph_size parameter,
+ * ensuring seeds are always within valid graph node range [0, graph_size).
+ */
+class cagra_graph_smaller_than_dataset_test : public ::testing::Test {
+ public:
+  using data_type  = float;
+  using index_type = uint32_t;
+
+ protected:
+  void run()
+  {
+    // Create a dataset with 10000 points
+    constexpr int64_t n_dataset = 10000;
+    constexpr int64_t n_dim     = 128;
+    constexpr int64_t n_queries = 100;
+    constexpr int64_t k         = 10;
+
+    // Build index normally
+    auto dataset = raft::make_device_matrix<data_type, int64_t>(res, n_dataset, n_dim);
+    raft::random::RngState r(1234ULL);
+    raft::random::uniform(res, r, dataset.data_handle(), n_dataset * n_dim, data_type(-1), data_type(1));
+
+    cagra::index_params index_params;
+    index_params.graph_degree              = 32;
+    index_params.intermediate_graph_degree = 64;
+    
+    auto index = cagra::build(res, index_params, raft::make_const_mdspan(dataset.view()));
+    raft::resource::sync_stream(res);
+
+    // Get the graph from the index
+    auto original_graph = index.graph();
+    ASSERT_EQ(original_graph.extent(0), n_dataset);
+
+    // Recreate the bug scenario: LARGE dataset, SMALL graph
+    // (like iterative_build_graph does in intermediate iterations)
+    constexpr int64_t n_graph = n_dataset / 2;  // Only 5000 nodes in graph
+    
+    // Step 1: Build index on SMALL subset (5000 points)
+    auto small_dataset_view = raft::make_device_matrix_view<const data_type, int64_t>(
+      dataset.data_handle(), n_graph, n_dim);
+    
+    cagra::index_params small_index_params;
+    small_index_params.graph_degree = 32;
+    auto small_index = cagra::build(res, small_index_params, small_dataset_view);
+    raft::resource::sync_stream(res);
+    
+    // Step 2: Update to FULL dataset (10000 points) but keep small graph (5000 nodes)
+    // This creates the exact bug scenario: dataset.size=10000, graph.extent(0)=5000
+    small_index.update_dataset(res, raft::make_const_mdspan(dataset.view()));
+    
+    // Verify the mismatch - THIS IS THE BUG SCENARIO!
+    ASSERT_EQ(small_index.graph().extent(0), n_graph);      // Graph has 5000 nodes
+    ASSERT_EQ(small_index.size(), n_dataset);                // Dataset has 10000 points
+    ASSERT_NE(small_index.graph().extent(0), small_index.size()); // Mismatch!
+
+    // Create queries
+    auto queries = raft::make_device_matrix<data_type, int64_t>(res, n_queries, n_dim);
+    raft::random::uniform(res, r, queries.data_handle(), n_queries * n_dim, data_type(-1), data_type(1));
+
+    // Allocate output
+    auto neighbors = raft::make_device_matrix<index_type, int64_t>(res, n_queries, k);
+    auto distances = raft::make_device_matrix<data_type, int64_t>(res, n_queries, k);
+
+    // Setup search params
+    cagra::search_params search_params;
+    search_params.itopk_size = 64;
+    search_params.search_width = 1;
+    search_params.max_iterations = 10;
+    search_params.algo = cagra::search_algo::SINGLE_CTA;
+
+    // THIS SHOULD NOT CRASH OR CAUSE OOB ACCESS
+    // Before fix: random seeds use dataset.size (10000) -> tries to access graph[7000] -> CRASH!
+    // After fix: random seeds use graph.extent(0) (5000) -> only accesses graph[0-4999] -> SAFE!
+    cagra::search(res,
+                  search_params,
+                  small_index,
+                  raft::make_const_mdspan(queries.view()),
+                  neighbors.view(),
+                  distances.view());
+
+    raft::resource::sync_stream(res);
+
+    // Verify results are valid (neighbors should be < graph size)
+    auto neighbors_host = raft::make_host_matrix<index_type, int64_t>(n_queries, k);
+    raft::copy(neighbors_host.data_handle(),
+               neighbors.data_handle(),
+               n_queries * k,
+               raft::resource::get_cuda_stream(res));
+    raft::resource::sync_stream(res);
+
+    // All neighbor indices should be valid (< n_graph)
+    for (int64_t i = 0; i < n_queries * k; i++) {
+      ASSERT_LT(neighbors_host.data_handle()[i], n_graph)
+        << "Neighbor index " << neighbors_host.data_handle()[i]
+        << " is >= graph size " << n_graph;
+    }
+
+    // Test with MULTI_CTA algorithm as well (also had the same bug)
+    search_params.algo = cagra::search_algo::MULTI_CTA;
+    
+    cagra::search(res,
+                  search_params,
+                  small_index,
+                  raft::make_const_mdspan(queries.view()),
+                  neighbors.view(),
+                  distances.view());
+
+    raft::resource::sync_stream(res);
+
+    // Verify again
+    raft::copy(neighbors_host.data_handle(),
+               neighbors.data_handle(),
+               n_queries * k,
+               raft::resource::get_cuda_stream(res));
+    raft::resource::sync_stream(res);
+
+    for (int64_t i = 0; i < n_queries * k; i++) {
+      ASSERT_LT(neighbors_host.data_handle()[i], n_graph)
+        << "Neighbor index " << neighbors_host.data_handle()[i]
+        << " is >= graph size " << n_graph << " (MULTI_CTA)";
+    }
+  }
+
+ private:
+  raft::resources res;
+};
+
+TEST_F(cagra_graph_smaller_than_dataset_test, search_with_smaller_graph) { this->run(); }
+
+}  // namespace cuvs::neighbors::cagra

From f428e54bf64ad1ba1f67ae01be6ce68742ac85ed Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Fri, 6 Feb 2026 08:55:34 -0800
Subject: [PATCH 044/119] minor pre-commit changes

---
 .../detail/cagra/search_multi_kernel.cuh      |  2 +-
 .../bug_graph_smaller_than_dataset.cu         | 45 ++++++++++---------
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
index 0ee7439ac6..045c63fe59 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
diff --git a/cpp/tests/neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu b/cpp/tests/neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu
index 8a29779bbd..b06c1cba92 100644
--- a/cpp/tests/neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu
+++ b/cpp/tests/neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu
@@ -19,14 +19,14 @@ namespace cuvs::neighbors::cagra {
 
 /**
  * @brief Test verifying graph.extent(0) is used for random seed selection
- * 
+ *
  * This test ensures that CAGRA search kernels correctly use graph.extent(0)
  * (graph size) rather than dataset.size for random seed node selection.
- * 
+ *
  * The bug: random seed selection previously used dataset_desc.size, which
  * could cause OOB access if the graph size differed from dataset size
  * (e.g., in CAGRA-Q iterative builds with compression).
- * 
+ *
  * The fix: kernels now receive graph.extent(0) as graph_size parameter,
  * ensuring seeds are always within valid graph node range [0, graph_size).
  */
@@ -47,12 +47,13 @@ class cagra_graph_smaller_than_dataset_test : public ::testing::Test {
     // Build index normally
     auto dataset = raft::make_device_matrix<data_type, int64_t>(res, n_dataset, n_dim);
     raft::random::RngState r(1234ULL);
-    raft::random::uniform(res, r, dataset.data_handle(), n_dataset * n_dim, data_type(-1), data_type(1));
+    raft::random::uniform(
+      res, r, dataset.data_handle(), n_dataset * n_dim, data_type(-1), data_type(1));
 
     cagra::index_params index_params;
     index_params.graph_degree              = 32;
     index_params.intermediate_graph_degree = 64;
-    
+
     auto index = cagra::build(res, index_params, raft::make_const_mdspan(dataset.view()));
     raft::resource::sync_stream(res);
 
@@ -63,28 +64,29 @@ class cagra_graph_smaller_than_dataset_test : public ::testing::Test {
     // Recreate the bug scenario: LARGE dataset, SMALL graph
     // (like iterative_build_graph does in intermediate iterations)
     constexpr int64_t n_graph = n_dataset / 2;  // Only 5000 nodes in graph
-    
+
     // Step 1: Build index on SMALL subset (5000 points)
     auto small_dataset_view = raft::make_device_matrix_view<const data_type, int64_t>(
       dataset.data_handle(), n_graph, n_dim);
-    
+
     cagra::index_params small_index_params;
     small_index_params.graph_degree = 32;
-    auto small_index = cagra::build(res, small_index_params, small_dataset_view);
+    auto small_index                = cagra::build(res, small_index_params, small_dataset_view);
     raft::resource::sync_stream(res);
-    
+
     // Step 2: Update to FULL dataset (10000 points) but keep small graph (5000 nodes)
     // This creates the exact bug scenario: dataset.size=10000, graph.extent(0)=5000
     small_index.update_dataset(res, raft::make_const_mdspan(dataset.view()));
-    
+
     // Verify the mismatch - THIS IS THE BUG SCENARIO!
-    ASSERT_EQ(small_index.graph().extent(0), n_graph);      // Graph has 5000 nodes
-    ASSERT_EQ(small_index.size(), n_dataset);                // Dataset has 10000 points
-    ASSERT_NE(small_index.graph().extent(0), small_index.size()); // Mismatch!
+    ASSERT_EQ(small_index.graph().extent(0), n_graph);             // Graph has 5000 nodes
+    ASSERT_EQ(small_index.size(), n_dataset);                      // Dataset has 10000 points
+    ASSERT_NE(small_index.graph().extent(0), small_index.size());  // Mismatch!
 
     // Create queries
     auto queries = raft::make_device_matrix<data_type, int64_t>(res, n_queries, n_dim);
-    raft::random::uniform(res, r, queries.data_handle(), n_queries * n_dim, data_type(-1), data_type(1));
+    raft::random::uniform(
+      res, r, queries.data_handle(), n_queries * n_dim, data_type(-1), data_type(1));
 
     // Allocate output
     auto neighbors = raft::make_device_matrix<index_type, int64_t>(res, n_queries, k);
@@ -92,10 +94,10 @@ class cagra_graph_smaller_than_dataset_test : public ::testing::Test {
 
     // Setup search params
     cagra::search_params search_params;
-    search_params.itopk_size = 64;
-    search_params.search_width = 1;
+    search_params.itopk_size     = 64;
+    search_params.search_width   = 1;
     search_params.max_iterations = 10;
-    search_params.algo = cagra::search_algo::SINGLE_CTA;
+    search_params.algo           = cagra::search_algo::SINGLE_CTA;
 
     // THIS SHOULD NOT CRASH OR CAUSE OOB ACCESS
     // Before fix: random seeds use dataset.size (10000) -> tries to access graph[7000] -> CRASH!
@@ -120,13 +122,12 @@ class cagra_graph_smaller_than_dataset_test : public ::testing::Test {
     // All neighbor indices should be valid (< n_graph)
     for (int64_t i = 0; i < n_queries * k; i++) {
       ASSERT_LT(neighbors_host.data_handle()[i], n_graph)
-        << "Neighbor index " << neighbors_host.data_handle()[i]
-        << " is >= graph size " << n_graph;
+        << "Neighbor index " << neighbors_host.data_handle()[i] << " is >= graph size " << n_graph;
     }
 
     // Test with MULTI_CTA algorithm as well (also had the same bug)
     search_params.algo = cagra::search_algo::MULTI_CTA;
-    
+
     cagra::search(res,
                   search_params,
                   small_index,
@@ -145,8 +146,8 @@ class cagra_graph_smaller_than_dataset_test : public ::testing::Test {
 
     for (int64_t i = 0; i < n_queries * k; i++) {
       ASSERT_LT(neighbors_host.data_handle()[i], n_graph)
-        << "Neighbor index " << neighbors_host.data_handle()[i]
-        << " is >= graph size " << n_graph << " (MULTI_CTA)";
+        << "Neighbor index " << neighbors_host.data_handle()[i] << " is >= graph size " << n_graph
+        << " (MULTI_CTA)";
     }
   }
 

From 9b1dff803df5831b11edf6c8ff6fd8773439cde7 Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Tue, 13 Jan 2026 09:12:51 -0800
Subject: [PATCH 045/119] started updating the index building for iterative
 cagra q build

---
 cpp/.clangd                                   |  65 ----
 cpp/.clangd_headers/cuda_runtime.h            |  52 +++
 cpp/bench/ann/src/common/benchmark.hpp        |   1 +
 .../neighbors/detail/cagra/cagra_build.cuh    | 139 +++++---
 .../neighbors/detail/cagra/cagra_search.cuh   |   2 +
 .../detail/cagra/compute_distance.hpp         |   2 +
 .../detail/cagra/search_multi_kernel.cuh      |   2 +
 cpp/tests/neighbors/ann_cagra.cuh             | 311 +++++++++---------
 python/cuvs_bench/cuvs_bench/run/__main__.py  |   1 +
 python/cuvs_bench/cuvs_bench/run/run.py       |   2 +
 10 files changed, 315 insertions(+), 262 deletions(-)
 delete mode 100644 cpp/.clangd
 create mode 100644 cpp/.clangd_headers/cuda_runtime.h

diff --git a/cpp/.clangd b/cpp/.clangd
deleted file mode 100644
index 7c4fe036dd..0000000000
--- a/cpp/.clangd
+++ /dev/null
@@ -1,65 +0,0 @@
-# https://clangd.llvm.org/config
-
-# Apply a config conditionally to all C files
-If:
-  PathMatch: .*\.(c|h)$
-
----
-
-# Apply a config conditionally to all C++ files
-If:
-  PathMatch: .*\.(c|h)pp
-
----
-
-# Apply a config conditionally to all CUDA files
-If:
-  PathMatch: .*\.cuh?
-CompileFlags:
-  Add:
-    - "-x"
-    - "cuda"
-    # No error on unknown CUDA versions
-    - "-Wno-unknown-cuda-version"
-    # Allow variadic CUDA functions
-    - "-Xclang=-fcuda-allow-variadic-functions"
-Diagnostics:
-  Suppress:
-    - "variadic_device_fn"
-    - "attributes_not_allowed"
-
----
-
-# Tweak the clangd parse settings for all files
-CompileFlags:
-  Add:
-    # report all errors
-    - "-ferror-limit=0"
-    - "-fmacro-backtrace-limit=0"
-    - "-ftemplate-backtrace-limit=0"
-    # Skip the CUDA version check
-    - "--no-cuda-version-check"
-  Remove:
-    # remove gcc's -fcoroutines
-    - -fcoroutines
-    # remove nvc++ flags unknown to clang
-    - "-gpu=*"
-    - "-stdpar*"
-    # remove nvcc flags unknown to clang
-    - "-arch*"
-    - "-gencode*"
-    - "--generate-code*"
-    - "-ccbin*"
-    - "-t=*"
-    - "--threads*"
-    - "-Xptxas*"
-    - "-Xcudafe*"
-    - "-Xfatbin*"
-    - "-Xcompiler*"
-    - "--diag-suppress*"
-    - "--diag_suppress*"
-    - "--compiler-options*"
-    - "--expt-extended-lambda"
-    - "--expt-relaxed-constexpr"
-    - "-forward-unknown-to-host-compiler"
-    - "-Werror=cross-execution-space-call"
diff --git a/cpp/.clangd_headers/cuda_runtime.h b/cpp/.clangd_headers/cuda_runtime.h
new file mode 100644
index 0000000000..b3bdecf5a3
--- /dev/null
+++ b/cpp/.clangd_headers/cuda_runtime.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <cstdint>
+#include <cstddef>
+
+// Basic CUDA types needed for RAFT/cuVS analysis
+enum cudaError_t { cudaSuccess = 0, cudaErrorMemoryAllocation = 2 };
+enum cudaMemcpyKind { cudaMemcpyHostToHost = 0, cudaMemcpyHostToDevice = 1, cudaMemcpyDeviceToHost = 2, cudaMemcpyDeviceToDevice = 3 };
+
+// Global memory type enum (often used outside the struct)
+// Moved out of struct to avoid type mismatch errors
+enum cudaMemoryType { 
+    cudaMemoryTypeHost = 1, 
+    cudaMemoryTypeDevice = 2, 
+    cudaMemoryTypeManaged = 3, 
+    cudaMemoryTypeUnregistered = 4 
+};
+
+struct cudaPointerAttributes {
+    // Remove internal enum definition to avoid "different enumeration types" error
+    // Use the global enum type
+    enum cudaMemoryType type;
+    int device;
+    void* devicePointer;
+    void* hostPointer;
+    int isManaged;
+};
+
+typedef struct CUstream_st* cudaStream_t;
+typedef struct cudaDeviceProp* cudaDeviceProp_t; // Incomplete type is usually enough for pointers
+
+// Stub functions (declarations only)
+inline cudaError_t cudaPointerGetAttributes(cudaPointerAttributes* attributes, const void* ptr) { return cudaSuccess; }
+inline cudaError_t cudaMalloc(void** devPtr, size_t size) { return cudaSuccess; }
+inline cudaError_t cudaFree(void* devPtr) { return cudaSuccess; }
+inline cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) { return cudaSuccess; }
+inline cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0) { return cudaSuccess; }
+inline cudaError_t cudaStreamSynchronize(cudaStream_t stream) { return cudaSuccess; }
+inline cudaError_t cudaDeviceSynchronize() { return cudaSuccess; }
+
+// Error handling stubs
+inline cudaError_t cudaGetLastError() { return cudaSuccess; }
+inline cudaError_t cudaPeekAtLastError() { return cudaSuccess; }
+inline const char* cudaGetErrorName(cudaError_t error) { return "cudaSuccess"; }
+inline const char* cudaGetErrorString(cudaError_t error) { return "no error"; }
+
+// Defines that might be checked
+#define __CUDACC__ 1
+#define __host__
+#define __device__
+#define __global__
+#define __forceinline__ inline
diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index 22859e9ab8..1d5239ec9b 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -647,6 +647,7 @@ inline auto run_main(int argc, char** argv) -> int
   char* conf_path = argv[--argc];
   std::ifstream conf_stream(conf_path);
 
+
   for (int i = 1; i < argc; i++) {
     if (parse_bool_flag(argv[i], "--force", force_overwrite) ||
         parse_bool_flag(argv[i], "--build", build_mode) ||
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 97d7bb1bac..77846f202b 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -1972,6 +1972,62 @@ struct mmap_owner {
   size_t size_;
 };
 
+template <typename T, typename IdxT>
+void search_and_optimize(raft::resources const& res,
+                         const cuvs::neighbors::cagra::search_params& search_params,
+                         const index<T, IdxT>& idx,
+                         raft::device_matrix_view<const T, int64_t> dev_query_view,
+                         raft::device_matrix_view<IdxT, int64_t> dev_neighbors,
+                         raft::device_matrix_view<float, int64_t> dev_distances,
+                         raft::host_matrix_view<IdxT, int64_t> neighbors_view,
+                         raft::host_matrix<IdxT, int64_t>& cagra_graph,  // reallocated here
+                         size_t curr_query_size,  // #queries == rows of the rebuilt graph
+                         size_t next_graph_degree,  // columns of the rebuilt graph
+                         size_t curr_topk,  // neighbors requested per query
+                         uint64_t max_chunk_size,  // forwarded to batch_load_iterator
+                         bool flag_last,  // enables params.guarantee_connectivity
+                         const index_params& params)
+{
+  // One build step: search `idx` with curr_query_size queries taken from dev_query_view (split
+  // into batches by batch_load_iterator), then rebuild cagra_graph from the neighbors found.
+  cuvs::spatial::knn::detail::utils::batch_load_iterator<T> query_batch(
+    dev_query_view.data_handle(),
+    curr_query_size,
+    dev_query_view.extent(1),
+    max_chunk_size,
+    raft::resource::get_cuda_stream(res),
+    raft::resource::get_workspace_resource(res));
+  for (const auto& batch : query_batch) {
+    auto batch_dev_query_view = raft::make_device_matrix_view<const T, int64_t>(
+      batch.data(), batch.size(), dev_query_view.extent(1));
+    auto batch_dev_neighbors_view = raft::make_device_matrix_view<IdxT, int64_t>(
+      dev_neighbors.data_handle(), batch.size(), curr_topk);
+    auto batch_dev_distances_view = raft::make_device_matrix_view<float, int64_t>(
+      dev_distances.data_handle(), batch.size(), curr_topk);
+    // Both result views start at the base of the device buffers, so every batch reuses them.
+    cuvs::neighbors::cagra::search(res,
+                                   search_params,
+                                   idx,
+                                   batch_dev_query_view,
+                                   batch_dev_neighbors_view,
+                                   batch_dev_distances_view);
+    // Copy this batch's neighbors into its rows (from batch.offset()) of the host result matrix.
+    auto batch_neighbors_view = raft::make_host_matrix_view<IdxT, int64_t>(
+      neighbors_view.data_handle() + batch.offset() * curr_topk, batch.size(), curr_topk);
+    raft::copy(batch_neighbors_view.data_handle(),
+               batch_dev_neighbors_view.data_handle(),
+               batch_neighbors_view.size(),
+               raft::resource::get_cuda_stream(res));
+  }
+
+  // Optimize graph: release the old graph, allocate the next one and let optimize() fill it.
+  auto next_graph_size = curr_query_size;
+  cagra_graph          = raft::make_host_matrix<IdxT, int64_t>(0, 0);  // delete existing graph
+  cagra_graph = raft::make_host_matrix<IdxT, int64_t>(next_graph_size, next_graph_degree);
+  optimize<IdxT>(
+    res, neighbors_view, cagra_graph.view(), flag_last ? params.guarantee_connectivity : 0);
+}
+
 template <typename T,
           typename IdxT = uint32_t,
           typename Accessor =
@@ -2062,6 +2118,24 @@ auto iterative_build_graph(
 
   bool flag_last       = false;
   auto curr_graph_size = initial_graph_size;
+
+  // Generate the compressed index once if compression is enabled
+  std::optional<index<T, IdxT>> idx_opt;
+  if (params.compression.has_value()) {
+    auto start = std::chrono::high_resolution_clock::now();
+    RAFT_EXPECTS(params.metric == cuvs::distance::DistanceType::L2Expanded,
+                 "VPQ compression is only supported with L2Expanded distance metric");
+    idx_opt.emplace(res, params.metric);
+    // The graph is attached later, once per build iteration, via update_graph().
+    idx_opt->update_dataset(
+      res,
+      // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later
+      cuvs::neighbors::vpq_build<decltype(dev_dataset), half, int64_t>(
+        res, *params.compression, dev_dataset));
+    auto end = std::chrono::high_resolution_clock::now();
+    auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+    RAFT_LOG_INFO("# VPQ compression time: %.3lf sec", static_cast<double>(elapsed_ms) / 1000);
+  }
   while (true) {
     auto start           = std::chrono::high_resolution_clock::now();
     auto curr_query_size = std::min(2 * curr_graph_size, final_graph_size);
@@ -2097,9 +2171,13 @@ auto iterative_build_graph(
     // search results (neighbors).
     auto dev_dataset_view = raft::make_device_matrix_view<const T, int64_t>(
       dev_dataset.data_handle(), (int64_t)curr_graph_size, dev_dataset.extent(1));
-
-    auto idx = index<T, IdxT>(
-      res, params.metric, dev_dataset_view, raft::make_const_mdspan(cagra_graph.view()));
+    // No compression, create mdspan index
+    if (!params.compression.has_value()) {
+      idx_opt.emplace(res, params.metric, dev_dataset_view, raft::make_const_mdspan(cagra_graph.view()));
+    } else {
+      idx_opt->update_graph(res, raft::make_const_mdspan(cagra_graph.view()));
+    }
+    const auto& idx = *idx_opt;
 
     auto dev_query_view = raft::make_device_matrix_view<const T, int64_t>(
       dev_dataset.data_handle(), (int64_t)curr_query_size, dev_dataset.extent(1));
@@ -2107,44 +2185,22 @@ auto iterative_build_graph(
     auto neighbors_view =
       raft::make_host_matrix_view<IdxT, int64_t>(neighbors_ptr, curr_query_size, curr_topk);
 
-    // Search.
-    // Since there are many queries, divide them into batches and search them.
-    cuvs::spatial::knn::detail::utils::batch_load_iterator<T> query_batch(
-      dev_query_view.data_handle(),
-      curr_query_size,
-      dev_query_view.extent(1),
-      max_chunk_size,
-      raft::resource::get_cuda_stream(res),
-      raft::resource::get_workspace_resource(res));
-    for (const auto& batch : query_batch) {
-      auto batch_dev_query_view = raft::make_device_matrix_view<const T, int64_t>(
-        batch.data(), batch.size(), dev_query_view.extent(1));
-      auto batch_dev_neighbors_view = raft::make_device_matrix_view<IdxT, int64_t>(
-        dev_neighbors.data_handle(), batch.size(), curr_topk);
-      auto batch_dev_distances_view = raft::make_device_matrix_view<float, int64_t>(
-        dev_distances.data_handle(), batch.size(), curr_topk);
-
-      cuvs::neighbors::cagra::search(res,
-                                     search_params,
-                                     idx,
-                                     batch_dev_query_view,
-                                     batch_dev_neighbors_view,
-                                     batch_dev_distances_view);
-
-      auto batch_neighbors_view = raft::make_host_matrix_view<IdxT, int64_t>(
-        neighbors_view.data_handle() + batch.offset() * curr_topk, batch.size(), curr_topk);
-      raft::copy(batch_neighbors_view.data_handle(),
-                 batch_dev_neighbors_view.data_handle(),
-                 batch_neighbors_view.size(),
-                 raft::resource::get_cuda_stream(res));
-    }
-
-    // Optimize graph
-    auto next_graph_size = curr_query_size;
-    cagra_graph          = raft::make_host_matrix<IdxT, int64_t>(0, 0);  // delete existing grahp
-    cagra_graph = raft::make_host_matrix<IdxT, int64_t>(next_graph_size, next_graph_degree);
-    optimize<IdxT>(
-      res, neighbors_view, cagra_graph.view(), flag_last ? params.guarantee_connectivity : 0);
+    
+
+    search_and_optimize(res,
+                        search_params,
+                        idx,
+                        dev_query_view,
+                        dev_neighbors.view(),
+                        dev_distances.view(),
+                        neighbors_view,
+                        cagra_graph,
+                        curr_query_size,
+                        next_graph_degree,
+                        curr_topk,
+                        max_chunk_size,
+                        flag_last,
+                        params);
 
     auto end        = std::chrono::high_resolution_clock::now();
     auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
@@ -2152,6 +2208,7 @@ auto iterative_build_graph(
 
     if (flag_last) { break; }
     flag_last       = (curr_graph_size == final_graph_size);
+    auto next_graph_size = curr_query_size;
     curr_graph_size = next_graph_size;
   }
 
diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
index 2d383a2429..a80afdca95 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
@@ -153,6 +153,7 @@ void search_main(raft::resources const& res,
   if (auto* strided_dset = dynamic_cast<const strided_dataset<T, ds_idx_type>*>(&index.data());
       strided_dset != nullptr) {
     // Search using a plain (strided) row-major dataset
+    RAFT_LOG_INFO("Searching with strided dataset");
     RAFT_EXPECTS(index.metric() != cuvs::distance::DistanceType::CosineExpanded ||
                    index.dataset_norms().has_value(),
                  "Dataset norms must be provided for CosineExpanded metric");
@@ -179,6 +180,7 @@ void search_main(raft::resources const& res,
     RAFT_FAIL("FP32 VPQ dataset support is coming soon");
   } else if (auto* vpq_dset = dynamic_cast<const vpq_dataset<half, ds_idx_type>*>(&index.data());
              vpq_dset != nullptr) {
+    RAFT_LOG_INFO("Searching with VPQ dataset");
     auto desc = dataset_descriptor_init_with_cache<T, graph_idx_type, DistanceT>(
       res, params, *vpq_dset, index.metric(), nullptr);
     search_main_core<T, graph_idx_type, DistanceT, CagraSampleFilterT, IdxT, OutputIdxT>(
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
index f9974fa3df..c314d4d0a3 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
@@ -239,11 +239,13 @@ struct dataset_descriptor_host {
     template <typename InitF>
     state(InitF init, size_t size) : ready{false}, value{std::make_tuple(init, size)}
     {
+      // RAFT_LOG_INFO("trying to create a descriptor state %p", reinterpret_cast<std::uintptr_t>(this));
     }
 
     ~state() noexcept
     {
       if (std::holds_alternative<ready_t>(value)) {
+        // RAFT_LOG_INFO("trying to free descriptor state %p", reinterpret_cast<std::uintptr_t>(this));
         auto& [ptr, stream] = std::get<ready_t>(value);
         RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(ptr, stream));
       }
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
index 045c63fe59..e3ef7bacc1 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -118,6 +118,8 @@ RAFT_KERNEL random_pickup_kernel(
   if (global_team_index >= num_pickup) { return; }
   extern __shared__ uint8_t smem[];
   dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id);
+  // The random seed index limit (seed_index_limit) is defined right after the barrier below,
+  // from graph_size with dataset_desc->size as the fallback; it must not be declared twice.
   __syncthreads();
 
   const INDEX_T seed_index_limit = graph_size > 0 ? graph_size : dataset_desc->size;
diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh
index beb379e44d..77dbcb683c 100644
--- a/cpp/tests/neighbors/ann_cagra.cuh
+++ b/cpp/tests/neighbors/ann_cagra.cuh
@@ -1455,75 +1455,74 @@ inline std::vector<AnnCagraInputs> generate_inputs()
   inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
 
   // Additional distances tested with a single search algo.
-  inputs2 = raft::util::itertools::product<AnnCagraInputs>(
-    {1, 100},
-    {1000},
-    {8},
-    {1, 16},  // k
-    {graph_build_algo::NN_DESCENT},
-    {search_algo::SINGLE_CTA},
-    {0},  // query size
-    {0},
-    {256},
-    {1},
-    {cuvs::distance::DistanceType::InnerProduct,
-     cuvs::distance::DistanceType::BitwiseHamming,
-     cuvs::distance::DistanceType::CosineExpanded},
-    {false},
-    {true},
-    {false},
-    {0.995},
-    {std::optional<float>{std::nullopt}},
-    {std::optional<vpq_params>{std::nullopt}},
-    {std::optional<bool>{std::nullopt}},
-    {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL});
-  inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
-
-  // Corner cases for small datasets
-  inputs2 = raft::util::itertools::product<AnnCagraInputs>(
-    {2},
-    {3, 6, 31, 32, 64, 101},
-    {1, 10},
-    {2},  // k
-    {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT},
-    {search_algo::SINGLE_CTA, search_algo::MULTI_CTA, search_algo::MULTI_KERNEL},
-    {0},  // query size
-    {0},
-    {256},
-    {1},
-    {cuvs::distance::DistanceType::L2Expanded},
-    {false},
-    {true},
-    {true},
-    {0.995},
-    {std::optional<float>{std::nullopt}},
-    {std::optional<vpq_params>{std::nullopt}},
-    {std::optional<bool>{std::nullopt}},
-    {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL,
-     cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL});
-  inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
+  // inputs2 = raft::util::itertools::product<AnnCagraInputs>(
+  //   {1, 100},
+  //   {1000},
+  //   {8},
+  //   {1, 16},  // k
+  //   {graph_build_algo::NN_DESCENT},
+  //   {search_algo::SINGLE_CTA},
+  //   {0},  // query size
+  //   {0},
+  //   {256},
+  //   {1},
+  //   {cuvs::distance::DistanceType::InnerProduct,
+  //    cuvs::distance::DistanceType::BitwiseHamming,
+  //    cuvs::distance::DistanceType::CosineExpanded},
+  //   {false},
+  //   {true},
+  //   {false},
+  //   {0.995},
+  //   {std::optional<float>{std::nullopt}},
+  //   {std::optional<vpq_params>{std::nullopt}},
+  //   {std::optional<bool>{std::nullopt}},
+  //   {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL});
+  // inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
+
+  // // Corner cases for small datasets
+  // inputs2 = raft::util::itertools::product<AnnCagraInputs>(
+  //   {2},
+  //   {3, 6, 31, 32, 64, 101},
+  //   {1, 10},
+  //   {2},  // k
+  //   {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT},
+  //   {search_algo::SINGLE_CTA, search_algo::MULTI_CTA, search_algo::MULTI_KERNEL},
+  //   {0},  // query size
+  //   {0},
+  //   {256},
+  //   {1},
+  //   {cuvs::distance::DistanceType::L2Expanded},
+  //   {false},
+  //   {true},
+  //   {true},
+  //   {0.995},
+  //   {std::optional<float>{std::nullopt}},
+  //   {std::optional<vpq_params>{std::nullopt}},
+  //   {std::optional<bool>{std::nullopt}},
+  //   {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL,
+  //    cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL});
+  // inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
 
   // Varying dim and build algo.
   inputs2 = raft::util::itertools::product<AnnCagraInputs>(
     {100},
-    {1000},
-    {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 1024},  // dim
+    {1000000},
+    {768},  // dim
     {16},                                                    // k
-    {graph_build_algo::IVF_PQ,
-     graph_build_algo::NN_DESCENT,
+    {
+      //graph_build_algo::IVF_PQ,
+     //graph_build_algo::NN_DESCENT,
      graph_build_algo::ITERATIVE_CAGRA_SEARCH},
     {search_algo::AUTO},
     {10},
     {0},
     {64},
     {1},
-    {cuvs::distance::DistanceType::L2Expanded,
-     cuvs::distance::DistanceType::InnerProduct,
-     cuvs::distance::DistanceType::BitwiseHamming},
+    {cuvs::distance::DistanceType::L2Expanded},
     {false},
     {true},
     {false},
-    {0.995},
+    {0.01},
     {std::optional<float>{std::nullopt}},
     {std::optional<vpq_params>{std::nullopt}},
     {std::optional<bool>{std::nullopt}},
@@ -1532,107 +1531,107 @@ inline std::vector<AnnCagraInputs> generate_inputs()
   inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
 
   // Varying team_size, graph_build_algo
-  inputs2 = raft::util::itertools::product<AnnCagraInputs>(
-    {100},
-    {1000},
-    {64},
-    {16},
-    {graph_build_algo::IVF_PQ,
-     graph_build_algo::NN_DESCENT,
-     graph_build_algo::ITERATIVE_CAGRA_SEARCH},
-    {search_algo::AUTO},
-    {10},
-    {0},  // team_size
-    {64},
-    {1},
-    {cuvs::distance::DistanceType::L2Expanded,
-     cuvs::distance::DistanceType::InnerProduct,
-     cuvs::distance::DistanceType::BitwiseHamming,
-     cuvs::distance::DistanceType::CosineExpanded},
-    {false},
-    {false},
-    {false},
-    {0.995},
-    {std::optional<float>{std::nullopt}},
-    {std::optional<vpq_params>{std::nullopt}},
-    {std::optional<bool>{std::nullopt}},
-    {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL,
-     cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL});
-  inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
-
-  // Vary team size only.
-  inputs2 = raft::util::itertools::product<AnnCagraInputs>(
-    {100},
-    {1000},
-    {64},
-    {16},
-    {graph_build_algo::NN_DESCENT},
-    {search_algo::AUTO},
-    {10},
-    {8, 16, 32},  // team_size
-    {64},
-    {1},
-    {cuvs::distance::DistanceType::L2Expanded,
-     cuvs::distance::DistanceType::InnerProduct,
-     cuvs::distance::DistanceType::BitwiseHamming,
-     cuvs::distance::DistanceType::CosineExpanded},
-    {false},
-    {false},
-    {false},
-    {0.995},
-    {std::optional<float>{std::nullopt}},
-    {std::optional<vpq_params>{std::nullopt}},
-    {std::optional<bool>{std::nullopt}},
-    {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL});
-  inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
-
-  // Varying n_rows, host_dataset
-  inputs2 = raft::util::itertools::product<AnnCagraInputs>(
-    {100},
-    {10000},
-    {32},
-    {10},
-    {graph_build_algo::AUTO},
-    {search_algo::AUTO},
-    {10},
-    {0},  // team_size
-    {64},
-    {1},
-    {cuvs::distance::DistanceType::L2Expanded, cuvs::distance::DistanceType::InnerProduct},
-    {false, true},
-    {false},
-    {true},
-    {0.985},
-    {std::optional<float>{std::nullopt}},
-    {std::optional<vpq_params>{std::nullopt}},
-    {std::optional<bool>{std::nullopt}},
-    {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL,
-     cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL});
-  inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
-
-  // A few PQ configurations.
-  // Varying dim, vq_n_centers
-  inputs2 = raft::util::itertools::product<AnnCagraInputs>(
-    {100},
-    {10000},
-    {64, 128, 192, 256, 512, 1024},  // dim
-    {16},                            // k
-    {graph_build_algo::IVF_PQ},
-    {search_algo::AUTO},
-    {10},
-    {0},
-    {64},
-    {1},
-    {cuvs::distance::DistanceType::L2Expanded},
-    {false},
-    {true},
-    {false},
-    {0.6},
-    {std::optional<float>{std::nullopt}},
-    {std::optional<vpq_params>{std::nullopt}},
-    {std::optional<bool>{std::nullopt}},
-    {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL,
-     cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL});  // don't demand high recall
+  // inputs2 = raft::util::itertools::product<AnnCagraInputs>(
+  //   {100},
+  //   {1000},
+  //   {64},
+  //   {16},
+  //   {graph_build_algo::IVF_PQ,
+  //    graph_build_algo::NN_DESCENT,
+  //    graph_build_algo::ITERATIVE_CAGRA_SEARCH},
+  //   {search_algo::AUTO},
+  //   {10},
+  //   {0},  // team_size
+  //   {64},
+  //   {1},
+  //   {cuvs::distance::DistanceType::L2Expanded,
+  //    cuvs::distance::DistanceType::InnerProduct,
+  //    cuvs::distance::DistanceType::BitwiseHamming,
+  //    cuvs::distance::DistanceType::CosineExpanded},
+  //   {false},
+  //   {false},
+  //   {false},
+  //   {0.995},
+  //   {std::optional<float>{std::nullopt}},
+  //   {std::optional<vpq_params>{std::nullopt}},
+  //   {std::optional<bool>{std::nullopt}},
+  //   {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL,
+  //    cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL});
+  // inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
+
+  // // Vary team size only.
+  // inputs2 = raft::util::itertools::product<AnnCagraInputs>(
+  //   {100},
+  //   {1000},
+  //   {64},
+  //   {16},
+  //   {graph_build_algo::NN_DESCENT},
+  //   {search_algo::AUTO},
+  //   {10},
+  //   {8, 16, 32},  // team_size
+  //   {64},
+  //   {1},
+  //   {cuvs::distance::DistanceType::L2Expanded,
+  //    cuvs::distance::DistanceType::InnerProduct,
+  //    cuvs::distance::DistanceType::BitwiseHamming,
+  //    cuvs::distance::DistanceType::CosineExpanded},
+  //   {false},
+  //   {false},
+  //   {false},
+  //   {0.995},
+  //   {std::optional<float>{std::nullopt}},
+  //   {std::optional<vpq_params>{std::nullopt}},
+  //   {std::optional<bool>{std::nullopt}},
+  //   {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL});
+  // inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
+
+  // // Varying n_rows, host_dataset
+  // inputs2 = raft::util::itertools::product<AnnCagraInputs>(
+  //   {100},
+  //   {10000},
+  //   {32},
+  //   {10},
+  //   {graph_build_algo::AUTO},
+  //   {search_algo::AUTO},
+  //   {10},
+  //   {0},  // team_size
+  //   {64},
+  //   {1},
+  //   {cuvs::distance::DistanceType::L2Expanded, cuvs::distance::DistanceType::InnerProduct},
+  //   {false, true},
+  //   {false},
+  //   {true},
+  //   {0.985},
+  //   {std::optional<float>{std::nullopt}},
+  //   {std::optional<vpq_params>{std::nullopt}},
+  //   {std::optional<bool>{std::nullopt}},
+  //   {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL,
+  //    cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL});
+  // inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
+
+  // // A few PQ configurations.
+  // // Varying dim, vq_n_centers
+  // inputs2 = raft::util::itertools::product<AnnCagraInputs>(
+  //   {100},
+  //   {10000},
+  //   {64, 128, 192, 256, 512, 1024},  // dim
+  //   {16},                            // k
+  //   {graph_build_algo::IVF_PQ},
+  //   {search_algo::AUTO},
+  //   {10},
+  //   {0},
+  //   {64},
+  //   {1},
+  //   {cuvs::distance::DistanceType::L2Expanded},
+  //   {false},
+  //   {true},
+  //   {false},
+  //   {0.6},
+  //   {std::optional<float>{std::nullopt}},
+  //   {std::optional<vpq_params>{std::nullopt}},
+  //   {std::optional<bool>{std::nullopt}},
+  //   {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL,
+  //    cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL});  // don't demand high recall
                                                                 // without refinement
   for (uint32_t pq_len : {2}) {  // for now, only pq_len = 2 is supported, more options coming  soon
     for (uint32_t vq_n_centers : {100, 1000}) {
diff --git a/python/cuvs_bench/cuvs_bench/run/__main__.py b/python/cuvs_bench/cuvs_bench/run/__main__.py
index ef1bf06d4e..5abd19d7c7 100644
--- a/python/cuvs_bench/cuvs_bench/run/__main__.py
+++ b/python/cuvs_bench/cuvs_bench/run/__main__.py
@@ -199,6 +199,7 @@ def main(
 
     """
 
+    print("config ?")
     if not data_export:
         run_benchmark(**locals())
 
diff --git a/python/cuvs_bench/cuvs_bench/run/run.py b/python/cuvs_bench/cuvs_bench/run/run.py
index 8830c89622..9597ff5245 100644
--- a/python/cuvs_bench/cuvs_bench/run/run.py
+++ b/python/cuvs_bench/cuvs_bench/run/run.py
@@ -637,6 +637,8 @@ def run_benchmark(
     conf_file = prepare_conf_file(dataset_conf, subset_size, count, batch_size)
     algos_conf_fs = gather_algorithm_configs(scripts_path, configuration)
 
+    
+
     allowed_algos = algorithms.split(",") if algorithms else None
     allowed_groups = groups.split(",") if groups else None
     allowed_algo_groups = (

From 71a4a0901cde840baa14b4b72eb15e2606e1e003 Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Tue, 13 Jan 2026 09:15:59 -0800
Subject: [PATCH 046/119] removed temp files

---
 cpp/.clangd_headers/cuda_runtime.h | 52 ------------------------------
 1 file changed, 52 deletions(-)
 delete mode 100644 cpp/.clangd_headers/cuda_runtime.h

diff --git a/cpp/.clangd_headers/cuda_runtime.h b/cpp/.clangd_headers/cuda_runtime.h
deleted file mode 100644
index b3bdecf5a3..0000000000
--- a/cpp/.clangd_headers/cuda_runtime.h
+++ /dev/null
@@ -1,52 +0,0 @@
-#pragma once
-
-#include <cstdint>
-#include <cstddef>
-
-// Basic CUDA types needed for RAFT/cuVS analysis
-enum cudaError_t { cudaSuccess = 0, cudaErrorMemoryAllocation = 2 };
-enum cudaMemcpyKind { cudaMemcpyHostToHost = 0, cudaMemcpyHostToDevice = 1, cudaMemcpyDeviceToHost = 2, cudaMemcpyDeviceToDevice = 3 };
-
-// Global memory type enum (often used outside the struct)
-// Moved out of struct to avoid type mismatch errors
-enum cudaMemoryType { 
-    cudaMemoryTypeHost = 1, 
-    cudaMemoryTypeDevice = 2, 
-    cudaMemoryTypeManaged = 3, 
-    cudaMemoryTypeUnregistered = 4 
-};
-
-struct cudaPointerAttributes {
-    // Remove internal enum definition to avoid "different enumeration types" error
-    // Use the global enum type
-    enum cudaMemoryType type;
-    int device;
-    void* devicePointer;
-    void* hostPointer;
-    int isManaged;
-};
-
-typedef struct CUstream_st* cudaStream_t;
-typedef struct cudaDeviceProp* cudaDeviceProp_t; // Incomplete type is usually enough for pointers
-
-// Stub functions (declarations only)
-inline cudaError_t cudaPointerGetAttributes(cudaPointerAttributes* attributes, const void* ptr) { return cudaSuccess; }
-inline cudaError_t cudaMalloc(void** devPtr, size_t size) { return cudaSuccess; }
-inline cudaError_t cudaFree(void* devPtr) { return cudaSuccess; }
-inline cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) { return cudaSuccess; }
-inline cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0) { return cudaSuccess; }
-inline cudaError_t cudaStreamSynchronize(cudaStream_t stream) { return cudaSuccess; }
-inline cudaError_t cudaDeviceSynchronize() { return cudaSuccess; }
-
-// Error handling stubs
-inline cudaError_t cudaGetLastError() { return cudaSuccess; }
-inline cudaError_t cudaPeekAtLastError() { return cudaSuccess; }
-inline const char* cudaGetErrorName(cudaError_t error) { return "cudaSuccess"; }
-inline const char* cudaGetErrorString(cudaError_t error) { return "no error"; }
-
-// Defines that might be checked
-#define __CUDACC__ 1
-#define __host__
-#define __device__
-#define __global__
-#define __forceinline__ inline

From ff8174d0e5647eff349a48f28f22ffe53ff72333 Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Wed, 14 Jan 2026 00:49:37 -0800
Subject: [PATCH 047/119] started implementing cagra q index computation
 outside of the loop

---
 cpp/.clangd_headers/cuda_runtime.h            | 52 +++++++++++++++++++
 .../detail/cagra/search_multi_kernel.cuh      |  2 +-
 2 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 cpp/.clangd_headers/cuda_runtime.h

diff --git a/cpp/.clangd_headers/cuda_runtime.h b/cpp/.clangd_headers/cuda_runtime.h
new file mode 100644
index 0000000000..b3bdecf5a3
--- /dev/null
+++ b/cpp/.clangd_headers/cuda_runtime.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <cstdint>
+#include <cstddef>
+
+// Basic CUDA types needed for RAFT/cuVS analysis
+enum cudaError_t { cudaSuccess = 0, cudaErrorMemoryAllocation = 2 };
+enum cudaMemcpyKind { cudaMemcpyHostToHost = 0, cudaMemcpyHostToDevice = 1, cudaMemcpyDeviceToHost = 2, cudaMemcpyDeviceToDevice = 3 };
+
+// Global memory type enum (often used outside the struct)
+// Moved out of struct to avoid type mismatch errors
+enum cudaMemoryType { 
+    cudaMemoryTypeHost = 1, 
+    cudaMemoryTypeDevice = 2, 
+    cudaMemoryTypeManaged = 3, 
+    cudaMemoryTypeUnregistered = 4 
+};
+
+struct cudaPointerAttributes {
+    // Remove internal enum definition to avoid "different enumeration types" error
+    // Use the global enum type
+    enum cudaMemoryType type;
+    int device;
+    void* devicePointer;
+    void* hostPointer;
+    int isManaged;
+};
+
+typedef struct CUstream_st* cudaStream_t;
+typedef struct cudaDeviceProp* cudaDeviceProp_t; // Incomplete type is usually enough for pointers
+
+// Stub functions (declarations only)
+inline cudaError_t cudaPointerGetAttributes(cudaPointerAttributes* attributes, const void* ptr) { return cudaSuccess; }
+inline cudaError_t cudaMalloc(void** devPtr, size_t size) { return cudaSuccess; }
+inline cudaError_t cudaFree(void* devPtr) { return cudaSuccess; }
+inline cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) { return cudaSuccess; }
+inline cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0) { return cudaSuccess; }
+inline cudaError_t cudaStreamSynchronize(cudaStream_t stream) { return cudaSuccess; }
+inline cudaError_t cudaDeviceSynchronize() { return cudaSuccess; }
+
+// Error handling stubs
+inline cudaError_t cudaGetLastError() { return cudaSuccess; }
+inline cudaError_t cudaPeekAtLastError() { return cudaSuccess; }
+inline const char* cudaGetErrorName(cudaError_t error) { return "cudaSuccess"; }
+inline const char* cudaGetErrorString(cudaError_t error) { return "no error"; }
+
+// Defines that might be checked
+#define __CUDACC__ 1
+#define __host__
+#define __device__
+#define __global__
+#define __forceinline__ inline
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
index e3ef7bacc1..bd5383d04a 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -119,7 +119,7 @@ RAFT_KERNEL random_pickup_kernel(
   extern __shared__ uint8_t smem[];
   dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id);
   // Set the resulting random index limit to the modulo wrap value if it is set
-  INDEX_T seed_index_limit = mod_wrap > 0 ? mod_wrap : dataset_desc->size;
+  INDEX_T seed_index_limit = graph_size > 0 ? graph_size : dataset_desc->size;
   __syncthreads();
 
   const INDEX_T seed_index_limit = graph_size > 0 ? graph_size : dataset_desc->size;

From 74f5894a7eac8c0c2d56e0c3cdff6add2824585e Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Wed, 14 Jan 2026 00:53:51 -0800
Subject: [PATCH 048/119] removed stub file

---
 cpp/.clangd_headers/cuda_runtime.h | 52 ------------------------------
 1 file changed, 52 deletions(-)
 delete mode 100644 cpp/.clangd_headers/cuda_runtime.h

diff --git a/cpp/.clangd_headers/cuda_runtime.h b/cpp/.clangd_headers/cuda_runtime.h
deleted file mode 100644
index b3bdecf5a3..0000000000
--- a/cpp/.clangd_headers/cuda_runtime.h
+++ /dev/null
@@ -1,52 +0,0 @@
-#pragma once
-
-#include <cstdint>
-#include <cstddef>
-
-// Basic CUDA types needed for RAFT/cuVS analysis
-enum cudaError_t { cudaSuccess = 0, cudaErrorMemoryAllocation = 2 };
-enum cudaMemcpyKind { cudaMemcpyHostToHost = 0, cudaMemcpyHostToDevice = 1, cudaMemcpyDeviceToHost = 2, cudaMemcpyDeviceToDevice = 3 };
-
-// Global memory type enum (often used outside the struct)
-// Moved out of struct to avoid type mismatch errors
-enum cudaMemoryType { 
-    cudaMemoryTypeHost = 1, 
-    cudaMemoryTypeDevice = 2, 
-    cudaMemoryTypeManaged = 3, 
-    cudaMemoryTypeUnregistered = 4 
-};
-
-struct cudaPointerAttributes {
-    // Remove internal enum definition to avoid "different enumeration types" error
-    // Use the global enum type
-    enum cudaMemoryType type;
-    int device;
-    void* devicePointer;
-    void* hostPointer;
-    int isManaged;
-};
-
-typedef struct CUstream_st* cudaStream_t;
-typedef struct cudaDeviceProp* cudaDeviceProp_t; // Incomplete type is usually enough for pointers
-
-// Stub functions (declarations only)
-inline cudaError_t cudaPointerGetAttributes(cudaPointerAttributes* attributes, const void* ptr) { return cudaSuccess; }
-inline cudaError_t cudaMalloc(void** devPtr, size_t size) { return cudaSuccess; }
-inline cudaError_t cudaFree(void* devPtr) { return cudaSuccess; }
-inline cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) { return cudaSuccess; }
-inline cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0) { return cudaSuccess; }
-inline cudaError_t cudaStreamSynchronize(cudaStream_t stream) { return cudaSuccess; }
-inline cudaError_t cudaDeviceSynchronize() { return cudaSuccess; }
-
-// Error handling stubs
-inline cudaError_t cudaGetLastError() { return cudaSuccess; }
-inline cudaError_t cudaPeekAtLastError() { return cudaSuccess; }
-inline const char* cudaGetErrorName(cudaError_t error) { return "cudaSuccess"; }
-inline const char* cudaGetErrorString(cudaError_t error) { return "no error"; }
-
-// Defines that might be checked
-#define __CUDACC__ 1
-#define __host__
-#define __device__
-#define __global__
-#define __forceinline__ inline

From 5d7c4f46a666f9dfdedbfa66b3bf3d06c76f48fb Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Wed, 14 Jan 2026 01:11:04 -0800
Subject: [PATCH 049/119] updated gitignore

---
 .gitignore | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 3627558ff5..317c28d997 100644
--- a/.gitignore
+++ b/.gitignore
@@ -72,7 +72,9 @@ docs/source/_static/rust
 
 # clang tooling
 compile_commands.json
-.clangd/
+
+
+
 
 # serialized ann indexes
 brute_force_index

From c6dd661e0c45db723e1717cdd6ec5c8f938dceb2 Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Wed, 14 Jan 2026 08:27:32 -0800
Subject: [PATCH 050/119] cagra q index now is only calculated once in the
 iterative build

---
 cpp/include/cuvs/neighbors/cagra.hpp                      | 7 +++++++
 cpp/src/neighbors/detail/cagra/cagra_build.cuh            | 6 +++++-
 cpp/src/neighbors/detail/cagra/cagra_search.cuh           | 8 ++++++--
 cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh    | 4 ++--
 .../detail/cagra/search_single_cta_kernel-inl.cuh         | 3 ++-
 5 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index 9b9e8eb0e6..ef55507869 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -247,6 +247,13 @@ struct search_params : cuvs::neighbors::search_params {
   /** Bit mask used for initial random seed node selection. */
   uint64_t rand_xor_mask = 0x128394;
 
+  /**
+   * Maximum node ID (exclusive) for random seed selection.
+   * When > 0, random seeds are constrained to [0, max_node_id) instead of [0, dataset_size).
+   * Useful when the graph is smaller than the dataset (e.g., iterative build with compression).
+   * Default 0 means no constraint (use dataset_size).
+   */
+  uint32_t max_node_id = 0;
   /** Whether to use the persistent version of the kernel (only SINGLE_CTA is supported a.t.m.) */
   bool persistent = false;
   /** Persistent kernel: time in seconds before the kernel stops if no requests received. */
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 77846f202b..70a29fe379 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -1990,6 +1990,7 @@ void search_and_optimize(raft::resources const& res,
 {
   // Search.
   // Since there are many queries, divide them into batches and search them.
+  RAFT_LOG_DEBUG("search_and_optimize: search_params.max_node_id=%u", search_params.max_node_id);
   cuvs::spatial::knn::detail::utils::batch_load_iterator<T> query_batch(
     dev_query_view.data_handle(),
     curr_query_size,
@@ -2185,7 +2186,10 @@ auto iterative_build_graph(
     auto neighbors_view =
       raft::make_host_matrix_view<IdxT, int64_t>(neighbors_ptr, curr_query_size, curr_topk);
 
-    
+    // Set max_node_id to constrain random seed selection to valid graph nodes
+    search_params.max_node_id = static_cast<uint32_t>(curr_graph_size);
+    RAFT_LOG_DEBUG("iterative_build: Setting search_params.max_node_id=%u (curr_graph_size=%lu)",
+                   search_params.max_node_id, curr_graph_size);
 
     search_and_optimize(res,
                         search_params,
diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
index a80afdca95..0efb7cfd45 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
@@ -51,6 +51,8 @@ void search_main_core(
   raft::device_matrix_view<DistanceT, int64_t, raft::row_major> distances,
   CagraSampleFilterT sample_filter = CagraSampleFilterT())
 {
+  RAFT_LOG_DEBUG("search_main_core: max_node_id=%u, graph.extent(0)=%lu", 
+                 params.max_node_id, graph.extent(0));
   static_assert(std::is_same_v<IndexT, uint32_t>,
                 "Only uint32_t is supported as the graph element type (internal index type)");
   RAFT_LOG_DEBUG("# dataset size = %lu, dim = %lu\n",
@@ -72,11 +74,13 @@ void search_main_core(
     topk,
     queries.extent(1));
 
+  RAFT_LOG_DEBUG("search_main_core: creating plan with max_node_id=%u", params.max_node_id);
   using CagraSampleFilterT_s = typename CagraSampleFilterT_Selector<CagraSampleFilterT>::type;
   std::unique_ptr<
     search_plan_impl<DataT, IndexT, DistanceT, CagraSampleFilterT_s, SourceIdxT, OutputIdxT>>
     plan = factory<DataT, IndexT, DistanceT, CagraSampleFilterT_s, SourceIdxT, OutputIdxT>::create(
       res, params, dataset_desc, queries.extent(1), graph.extent(0), graph.extent(1), topk);
+  RAFT_LOG_DEBUG("search_main_core: plan created, plan->max_node_id=%u", plan->max_node_id);
 
   plan->check(topk);
 
@@ -153,7 +157,7 @@ void search_main(raft::resources const& res,
   if (auto* strided_dset = dynamic_cast<const strided_dataset<T, ds_idx_type>*>(&index.data());
       strided_dset != nullptr) {
     // Search using a plain (strided) row-major dataset
-    RAFT_LOG_INFO("Searching with strided dataset");
+    RAFT_LOG_DEBUG("Searching with strided dataset");
     RAFT_EXPECTS(index.metric() != cuvs::distance::DistanceType::CosineExpanded ||
                    index.dataset_norms().has_value(),
                  "Dataset norms must be provided for CosineExpanded metric");
@@ -180,7 +184,7 @@ void search_main(raft::resources const& res,
     RAFT_FAIL("FP32 VPQ dataset support is coming soon");
   } else if (auto* vpq_dset = dynamic_cast<const vpq_dataset<half, ds_idx_type>*>(&index.data());
              vpq_dset != nullptr) {
-    RAFT_LOG_INFO("Searching with VPQ dataset");
+    RAFT_LOG_DEBUG("Searching with VPQ dataset");
     auto desc = dataset_descriptor_init_with_cache<T, graph_idx_type, DistanceT>(
       res, params, *vpq_dset, index.metric(), nullptr);
     search_main_core<T, graph_idx_type, DistanceT, CagraSampleFilterT, IdxT, OutputIdxT>(
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
index bd5383d04a..c8d94edb82 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -178,6 +178,7 @@ void random_pickup(const dataset_descriptor_host<DataT, IndexT, DistanceT>& data
   const auto num_teams_per_threadblock = block_size / dataset_desc.team_size;
   const dim3 grid_size((num_pickup + num_teams_per_threadblock - 1) / num_teams_per_threadblock,
                        num_queries);
+  RAFT_LOG_DEBUG("max_node_id: %d", mod_wrap);
 
   random_pickup_kernel<<<grid_size, block_size, dataset_desc.smem_ws_size_in_bytes, cuda_stream>>>(
     dataset_desc.dev_ptr(cuda_stream),
@@ -833,8 +834,7 @@ struct search
                                             result_buffer_allocation_size,
                                             hashmap.data(),
                                             hash_bitlen,
-                                            stream,
-                                            static_cast<IndexT>(this->dataset_size));
+                                            stream);
 
     unsigned iter = 0;
     while (1) {
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
index 5d465c25b5..b0ea9113d7 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
@@ -1323,7 +1323,8 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel_p(
                                  small_hash_bitlen,
                                  small_hash_reset_interval,
                                  query_id,
-                                 sample_filter);
+                                 sample_filter,
+                                 0);  // TODO: persistent kernel doesn't support max_node_id yet
 
     // make sure all writes are visible even for the host
     //     (e.g. when result buffers are in pinned memory)

From 83580cd7bc57226fed054fb6ab58790a41fdcd3d Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Mon, 9 Feb 2026 05:35:00 -0800
Subject: [PATCH 051/119] fixed rebasing artefact

---
 cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
index c8d94edb82..c8bd9db324 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -118,12 +118,10 @@ RAFT_KERNEL random_pickup_kernel(
   if (global_team_index >= num_pickup) { return; }
   extern __shared__ uint8_t smem[];
   dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id);
-  // Set the resulting random index limit to the modulo wrap value if it is set
-  INDEX_T seed_index_limit = graph_size > 0 ? graph_size : dataset_desc->size;
   __syncthreads();
 
+  // Limit random seed indices to graph_size when it is set (> 0), otherwise to the dataset size
   const INDEX_T seed_index_limit = graph_size > 0 ? graph_size : dataset_desc->size;
-
   INDEX_T best_index_team_local;
   DISTANCE_T best_norm2_team_local = utils::get_max_value<DISTANCE_T>();
   for (unsigned i = 0; i < num_distilation; i++) {

From 13b1f77b8000053d0c8a2170b390f9707fc6fbc1 Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Mon, 16 Feb 2026 05:54:19 -0800
Subject: [PATCH 052/119] addressed comments regarding type cast

---
 cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh  | 2 +-
 cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
index a2e9a43ff0..92331b0eb6 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
@@ -630,7 +630,7 @@ void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dat
                                                        ps.max_iterations,
                                                        num_executed_iterations,
                                                        sample_filter,
-                                                       static_cast<uint32_t>(graph.extent(0)));
+                                                       static_cast<IndexT>(graph.extent(0)));
 }
 
 }  // namespace multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
index 5d465c25b5..32bdcf07ad 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
@@ -2348,7 +2348,7 @@ control is returned in this thread (in persistent_runner_t constructor), so we'r
                                                            small_hash_bitlen,
                                                            small_hash_reset_interval,
                                                            sample_filter,
-                                                           static_cast<uint32_t>(graph.extent(0)));
+                                                           static_cast<IndexT>(graph.extent(0)));
     RAFT_CUDA_TRY(cudaPeekAtLastError());
   }
 }

From 609b0f3db6643c0a62dd89d59202ec4a43b339c5 Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Mon, 16 Feb 2026 18:52:40 +0000
Subject: [PATCH 053/119] prune kernel smem

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 41efa1686f..270e76838a 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -166,14 +166,20 @@ __global__ void kern_prune(const IdxT* const knn_graph,  // [graph_chunk_size, g
                            uint64_t* const stats)
 {
   __shared__ uint32_t smem_num_detour[MAX_DEGREE];
+  extern __shared__ unsigned char smem_buf[];
+  IdxT* const smem_knn_iA_neighbors = reinterpret_cast<IdxT*>(smem_buf);
+
   uint64_t* const num_retain = stats;
   uint64_t* const num_full   = stats + 1;
 
   const uint64_t iA = blockIdx.x + (batch_size * batch_id);
   if (iA >= graph_size) { return; }
+
+  // Load this node's neighbor row into shared memory to reduce global reads
   for (uint32_t k = threadIdx.x; k < graph_degree; k += blockDim.x) {
-    smem_num_detour[k] = 0;
-    if (knn_graph[k + ((uint64_t)graph_degree * iA)] == iA) {
+    smem_num_detour[k]       = 0;
+    smem_knn_iA_neighbors[k] = knn_graph[k + ((uint64_t)graph_degree * iA)];
+    if (smem_knn_iA_neighbors[k] == iA) {
       // Lower the priority of self-edge
       smem_num_detour[k] = graph_degree;
     }
@@ -182,14 +188,14 @@ __global__ void kern_prune(const IdxT* const knn_graph,  // [graph_chunk_size, g
 
   // count number of detours (A->D->B)
   for (uint32_t kAD = 0; kAD < graph_degree - 1; kAD++) {
-    const uint64_t iD = knn_graph[kAD + (graph_degree * iA)];
+    const uint64_t iD = smem_knn_iA_neighbors[kAD];
     if (iD >= graph_size) { continue; }
     for (uint32_t kDB = threadIdx.x; kDB < graph_degree; kDB += blockDim.x) {
       const uint64_t iB_candidate = knn_graph[kDB + ((uint64_t)graph_degree * iD)];
       for (uint32_t kAB = kAD + 1; kAB < graph_degree; kAB++) {
         // if ( kDB < kAB )
         {
-          const uint64_t iB = knn_graph[kAB + (graph_degree * iA)];
+          const uint64_t iB = smem_knn_iA_neighbors[kAB];
           if (iB == iB_candidate) {
             atomicAdd(smem_num_detour + kAB, 1);
             break;
@@ -1298,9 +1304,10 @@ void optimize(
       RAFT_CUDA_TRY(cudaMemsetAsync(
         dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, raft::resource::get_cuda_stream(res)));
 
+      const size_t prune_smem_size = knn_graph_degree * sizeof(IdxT);
       for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
         kern_prune<MAX_DEGREE, IdxT>
-          <<<blocks_prune, threads_prune, 0, raft::resource::get_cuda_stream(res)>>>(
+          <<<blocks_prune, threads_prune, prune_smem_size, raft::resource::get_cuda_stream(res)>>>(
             d_input_graph.data_handle(),
             graph_size,
             knn_graph_degree,

From a320e0e90453527e156da007bb96dc00de3898c0 Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Wed, 18 Feb 2026 16:26:54 +0000
Subject: [PATCH 054/119] reduce copies within reverse graph compute

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 61 ++++++++++++-------
 1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 270e76838a..f3b0f0778e 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -244,6 +244,29 @@ __global__ void kern_make_rev_graph(const IdxT* const dest_nodes,     // [graph_
   }
 }
 
+// Build reverse graph from column k of output_graph (avoids per-column host fill and H2D copy).
+template <class IdxT>
+__global__ void kern_make_rev_graph_column(const IdxT* const output_graph,  // [graph_size, degree]
+                                           IdxT* const rev_graph,
+                                           uint32_t* const rev_graph_count,
+                                           const uint32_t graph_size,
+                                           const uint32_t degree,
+                                           const uint32_t k)
+{
+  const uint64_t tid  = threadIdx.x + (blockDim.x * blockIdx.x);
+  const uint64_t tnum = blockDim.x * gridDim.x;
+
+  for (uint64_t src_id = tid; src_id < graph_size; src_id += tnum) {
+    const IdxT dest_id = output_graph[k + (static_cast<uint64_t>(degree) * src_id)];
+    if (dest_id >= graph_size) continue;
+
+    const uint32_t pos = atomicAdd(rev_graph_count + dest_id, 1);
+    if (pos < degree) {
+      rev_graph[(static_cast<uint64_t>(degree) * dest_id) + pos] = static_cast<IdxT>(src_id);
+    }
+  }
+}
+
 template <class IdxT, class LabelT>
 __device__ __host__ LabelT get_root_label(IdxT i, const LabelT* label)
 {
@@ -1444,32 +1467,26 @@ void optimize(
                                   graph_size * sizeof(uint32_t),
                                   raft::resource::get_cuda_stream(res)));
 
-    auto dest_nodes = raft::make_host_vector<IdxT, int64_t>(graph_size);
-    auto d_dest_nodes =
-      raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
-
-    for (uint64_t k = 0; k < output_graph_degree; k++) {
-#pragma omp parallel for
-      for (uint64_t i = 0; i < graph_size; i++) {
-        // dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)];
-        dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)];
-      }
-      raft::resource::sync_stream(res);
-
-      raft::copy(d_dest_nodes.data_handle(),
-                 dest_nodes.data_handle(),
-                 graph_size,
-                 raft::resource::get_cuda_stream(res));
+    // Copy full output graph to device once; kernel indexes by column k (no per-column H2D copy).
+    // TODO: depending on available device memory, this may need to be split into multiple copies.
+    auto d_output_graph = raft::make_device_mdarray<IdxT>(
+      res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
+    raft::copy(d_output_graph.data_handle(),
+               output_graph_ptr,
+               static_cast<size_t>(graph_size) * output_graph_degree,
+               raft::resource::get_cuda_stream(res));
 
-      dim3 threads(256, 1, 1);
-      dim3 blocks(1024, 1, 1);
-      kern_make_rev_graph<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
-        d_dest_nodes.data_handle(),
+    dim3 threads(256, 1, 1);
+    dim3 blocks(1024, 1, 1);
+    for (uint32_t k = 0; k < output_graph_degree; k++) {
+      kern_make_rev_graph_column<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
+        d_output_graph.data_handle(),
         d_rev_graph.data_handle(),
         d_rev_graph_count.data_handle(),
         graph_size,
-        output_graph_degree);
-      RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u    \r", k, output_graph_degree);
+        output_graph_degree,
+        k);
+      RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %u / %u    \r", k, output_graph_degree);
     }
 
     raft::resource::sync_stream(res);

From 6d1a6187f2cc138c4941608d4d9c746a11e0d774 Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Thu, 19 Feb 2026 23:07:23 +0000
Subject: [PATCH 055/119] optimize() draft move more compute to GPU

---
 .../neighbors/detail/cagra/cagra_build.cuh    |   2 +
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 864 ++++++++++++------
 2 files changed, 590 insertions(+), 276 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 97d7bb1bac..152b603286 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -822,6 +822,8 @@ inline std::pair<size_t, size_t> optimize_workspace_size(size_t n_rows,
                                                          size_t index_size,
                                                          bool mst_optimize = false)
 {
+  // TODO: revisit this workspace estimate now that optimize() moves more compute to the GPU
+
   // MST optimization memory (host only)
   size_t mst_host = n_rows * index_size;  // mst_graph_num_edges
   if (mst_optimize) {
diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index f3b0f0778e..f2cd79ecb6 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -161,8 +161,8 @@ __global__ void kern_prune(const IdxT* const knn_graph,  // [graph_chunk_size, g
                            const uint32_t degree,
                            const uint32_t batch_size,
                            const uint32_t batch_id,
-                           uint8_t* const detour_count,          // [graph_chunk_size, graph_degree]
-                           uint32_t* const num_no_detour_edges,  // [graph_size]
+                           uint8_t* const detour_count,          // [batch_size, graph_degree]
+                           uint32_t* const num_no_detour_edges,  // [batch_size]
                            uint64_t* const stats)
 {
   __shared__ uint32_t smem_num_detour[MAX_DEGREE];
@@ -172,7 +172,9 @@ __global__ void kern_prune(const IdxT* const knn_graph,  // [graph_chunk_size, g
   uint64_t* const num_retain = stats;
   uint64_t* const num_full   = stats + 1;
 
-  const uint64_t iA = blockIdx.x + (batch_size * batch_id);
+  const uint64_t iA       = blockIdx.x + (batch_size * batch_id);
+  const uint64_t iA_batch = iA % static_cast<uint64_t>(batch_size);
+
   if (iA >= graph_size) { return; }
 
   // Load this node's neighbor row into shared memory to reduce global reads
@@ -208,7 +210,7 @@ __global__ void kern_prune(const IdxT* const knn_graph,  // [graph_chunk_size, g
 
   uint32_t num_edges_no_detour = 0;
   for (uint32_t k = threadIdx.x; k < graph_degree; k += blockDim.x) {
-    detour_count[k + (graph_degree * iA)] = min(smem_num_detour[k], (uint32_t)255);
+    detour_count[k + (graph_degree * iA_batch)] = min(smem_num_detour[k], (uint32_t)255);
     if (smem_num_detour[k] == 0) { num_edges_no_detour++; }
   }
   num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 1);
@@ -219,7 +221,7 @@ __global__ void kern_prune(const IdxT* const knn_graph,  // [graph_chunk_size, g
   num_edges_no_detour = min(num_edges_no_detour, degree);
 
   if (threadIdx.x == 0) {
-    num_no_detour_edges[iA] = num_edges_no_detour;
+    num_no_detour_edges[iA_batch] = num_edges_no_detour;
     atomicAdd((unsigned long long int*)num_retain, (unsigned long long int)num_edges_no_detour);
     if (num_edges_no_detour >= degree) { atomicAdd((unsigned long long int*)num_full, 1); }
   }
@@ -244,26 +246,179 @@ __global__ void kern_make_rev_graph(const IdxT* const dest_nodes,     // [graph_
   }
 }
 
-// Build reverse graph from column k of output_graph (avoids per-column host fill and H2D copy).
+// Select output_graph_degree neighbors with smallest detour count per node (writes to device).
+// One warp per node; ties on the count go to the lower position in the kNN row. Dynamic shared
+// memory must hold knn_graph_degree * num_warps uint32_t followed by as many uint16_t.
 template <class IdxT>
-__global__ void kern_make_rev_graph_column(const IdxT* const output_graph,  // [graph_size, degree]
-                                           IdxT* const rev_graph,
-                                           uint32_t* const rev_graph_count,
-                                           const uint32_t graph_size,
-                                           const uint32_t degree,
-                                           const uint32_t k)
+__global__ void kern_select_smallest_detour_neighbors(
+  const IdxT* const knn_graph,
+  uint64_t graph_size,
+  uint64_t knn_graph_degree,
+  uint64_t output_graph_degree,
+  const uint8_t* const d_detour_count,  // [batch_size, graph_degree]
+  IdxT* output_graph_ptr,               // [batch_size, output_graph_degree]
+  const uint32_t batch_size,
+  const uint32_t batch_id)
 {
-  const uint64_t tid  = threadIdx.x + (blockDim.x * blockIdx.x);
-  const uint64_t tnum = blockDim.x * gridDim.x;
+  // FIXME: this does not really work for num_warps > 1
+  constexpr unsigned warp_mask = 0xffffffff;
+  // Marks selected slots; must exceed every real count (<= 255) so no slot is picked twice.
+  constexpr uint32_t consumed  = 256;
+  const uint32_t num_warps     = blockDim.x / raft::WarpSize;
+  extern __shared__ unsigned char smem_buf[];
+  uint32_t* smem_indices = reinterpret_cast<uint32_t*>(smem_buf);
+  uint16_t* smem_detour_count =
+    reinterpret_cast<uint16_t*>(&smem_indices[knn_graph_degree * num_warps]);
 
-  for (uint64_t src_id = tid; src_id < graph_size; src_id += tnum) {
-    const IdxT dest_id = output_graph[k + (static_cast<uint64_t>(degree) * src_id)];
-    if (dest_id >= graph_size) continue;
+  const uint32_t wid     = threadIdx.x / raft::WarpSize;
+  const uint32_t lane_id = threadIdx.x % raft::WarpSize;
+  const uint64_t nid     = static_cast<uint64_t>(blockIdx.x) * num_warps +
+                       (static_cast<uint64_t>(batch_size) * batch_id * num_warps) + wid;
 
-    const uint32_t pos = atomicAdd(rev_graph_count + dest_id, 1);
-    if (pos < degree) {
-      rev_graph[(static_cast<uint64_t>(degree) * dest_id) + pos] = static_cast<IdxT>(src_id);
+  const uint64_t nid_batch = nid % static_cast<uint64_t>(batch_size);
+  if (nid >= graph_size) return;
+  for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) {
+    smem_detour_count[(knn_graph_degree * wid) + k] =
+      d_detour_count[nid_batch * knn_graph_degree + k];
+    smem_indices[(knn_graph_degree * wid) + k] = k;
+  }
+  __syncwarp(warp_mask);
+
+  for (uint32_t i = 0; i < output_graph_degree; i++) {
+    uint32_t local_min = consumed;  // consumed slots never compare below this
+    uint32_t local_idx = 0xffffffff;
+    for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) {
+      uint32_t c = smem_detour_count[(knn_graph_degree * wid) + k];
+      if (c < local_min) {
+        local_min = c;
+        local_idx = smem_indices[(knn_graph_degree * wid) + k];
+      }
+    }
+    uint32_t local_min_with_tag = (local_min << 16) | local_idx;  // order: count, then slot
+    for (int offset = raft::WarpSize / 2; offset > 0; offset /= 2) {
+      uint32_t other     = __shfl_down_sync(warp_mask, local_min_with_tag, offset);
+      local_min_with_tag = (local_min_with_tag <= other) ? local_min_with_tag : other;
+    }
+    const uint32_t warp_local_idx = __shfl_sync(warp_mask, local_min_with_tag, 0) & 0xffff;
+    if (local_idx == warp_local_idx) {  // only the lane that owns the winning slot
+      output_graph_ptr[nid_batch * output_graph_degree + i] =
+        knn_graph[knn_graph_degree * nid + warp_local_idx];
+      smem_detour_count[knn_graph_degree * wid + warp_local_idx] = consumed;
+    }
+    __syncwarp(warp_mask);
+  }
+}
+
+// Helper functions for merging the graph
+// Warp-wide lookup: the smallest i < num with array[i] == val, or num when val is absent.
+// Every lane of the warp must call this together; the reduction uses the full lane mask.
+template <typename T>
+__device__ unsigned int warp_pos_in_array(T val, const T* array, uint64_t num)
+{
+  // Lane l probes slots l, l + 32, ... and remembers its first match only.
+  unsigned int first_hit = num;
+  for (uint64_t slot = threadIdx.x % 32; slot < num; slot += 32) {
+    if (!(val == array[slot])) { continue; }
+    first_hit = slot;
+    break;
+  }
+  return __reduce_min_sync(0xffffffff, first_hit);
+}
+
+// Shifts array[0 .. num-1] up by one slot (array[num] is overwritten; array[0] keeps its value).
+template <typename T>
+__device__ void thread_shift_array(T* array, uint64_t num)
+{
+  // Copy from the top down so every element is read before its slot is overwritten.
+  for (uint64_t dst = num; dst != 0; --dst) { array[dst] = array[dst - 1]; }
+}
+
+// Merges, with one warp per node, the node's pruned neighbor list with its MST edges and its
+// reverse edges. smem_buf needs max(output_graph_degree, mst_graph_degree) IdxT entries.
+// NOTE(review): the staging list is not offset by wid, so this assumes one warp per block.
+template <typename IdxT>
+__global__ void kern_merge_graph(IdxT* output_graph,
+                                 const IdxT* const rev_graph,
+                                 uint32_t* const rev_graph_count,  // [graph_size]
+                                 const uint32_t graph_size,
+                                 const uint32_t output_graph_degree,
+                                 const IdxT* const mst_graph,
+                                 const uint32_t mst_graph_degree,
+                                 const uint32_t* const mst_graph_num_edges_ptr,
+                                 const uint32_t batch_size,
+                                 const uint32_t batch_id,
+                                 bool guarantee_connectivity,
+                                 bool* check_num_protected_edges)
+{
+  extern __shared__ unsigned char smem_buf[];
+  IdxT* smem_sorted_output_graph = reinterpret_cast<IdxT*>(smem_buf);
+  const uint32_t wid       = threadIdx.x / 32;
+  const uint32_t lane_id   = threadIdx.x % 32;
+  const uint32_t num_warps = blockDim.x / 32;
+  // Widen before multiplying; the all-32-bit product could wrap before the 64-bit store.
+  const uint64_t nid       = static_cast<uint64_t>(blockIdx.x) * num_warps +
+                       (static_cast<uint64_t>(batch_size) * batch_id * num_warps) + wid;
+  if (nid >= graph_size) { return; }
+  // NOTE(review): every warp re-sets the flag, which may overwrite another warp's false.
+  if (lane_id == 0) check_num_protected_edges[0] = true;
+  const auto mst_graph_num_edges = mst_graph_num_edges_ptr[nid];
+  // If guarantee_connectivity == true, build a temporary list: MST edges, then new pruned edges.
+  if (guarantee_connectivity) {
+    for (uint32_t i = lane_id; i < mst_graph_degree; i += 32) {
+      smem_sorted_output_graph[i] = mst_graph[nid * mst_graph_degree + i];
     }
+    __syncwarp();
+    for (uint32_t pruned_j = 0, output_j = mst_graph_num_edges;
+         (pruned_j < output_graph_degree) && (output_j < output_graph_degree);
+         pruned_j++) {
+      const auto v     = output_graph[output_graph_degree * nid + pruned_j];
+      unsigned int dup = 0;
+      for (uint32_t m = lane_id; m < output_j; m += 32) {
+        if (v == smem_sorted_output_graph[m]) {
+          dup = 1;
+          break;
+        }
+      }
+      unsigned int warp_dup = __ballot_sync(0xffffffff, dup);
+      if (warp_dup == 0) {
+        if (lane_id == 0) smem_sorted_output_graph[output_j] = v;
+        output_j++;
+      }
+      __syncwarp();
+    }
+  }
+  else {
+    for (uint32_t i = lane_id; i < output_graph_degree; i += 32) {
+      smem_sorted_output_graph[i] = output_graph[output_graph_degree * nid + i];
+    }
+    __syncwarp();
+  }
+  // Reverse edges below never displace the first num_protected_edges staged entries.
+  const auto num_protected_edges = max(mst_graph_num_edges, output_graph_degree / 2);
+  if (num_protected_edges > output_graph_degree) { check_num_protected_edges[0] = false; }
+  // NOTE(review): this exit skips the write-back at the end of the kernel -- confirm intended.
+  if (num_protected_edges == output_graph_degree) { return; }
+
+  auto kr = min(rev_graph_count[nid], output_graph_degree);
+  while (kr) {
+    kr -= 1;
+    if (rev_graph[kr + (output_graph_degree * nid)] < graph_size) {
+      uint64_t pos = warp_pos_in_array<IdxT>(
+        rev_graph[kr + (output_graph_degree * nid)], smem_sorted_output_graph, output_graph_degree);
+      if (pos < num_protected_edges) { continue; }
+      uint64_t num_shift = pos - num_protected_edges;
+      // Not staged yet: shift the whole unprotected tail, which drops its last entry.
+      if (pos >= output_graph_degree) { num_shift = output_graph_degree - num_protected_edges - 1; }
+      if (lane_id == 0) {
+        thread_shift_array<IdxT>(smem_sorted_output_graph + num_protected_edges, num_shift);
+        smem_sorted_output_graph[num_protected_edges] = rev_graph[kr + (output_graph_degree * nid)];
+      }
+      __syncwarp();
+    }
+  }
+
+  for (uint32_t i = lane_id; i < output_graph_degree; i += 32) {
+    output_graph[(output_graph_degree * nid) + i] = smem_sorted_output_graph[i];
   }
 }
 
@@ -737,11 +892,11 @@ void mst_opt_update_graph(IdxT* mst_graph_ptr,
 //   an approximate MST.
 // * If the input kNN graph is disconnected, random connection is added to the largest cluster.
 //
-template <typename IdxT = uint32_t>
+template <typename IdxT, typename InputMatrixView, typename OutputMatrixView, typename VectorView>
 void mst_optimization(raft::resources const& res,
-                      raft::host_matrix_view<IdxT, int64_t, raft::row_major> input_graph,
-                      raft::host_matrix_view<IdxT, int64_t, raft::row_major> output_graph,
-                      raft::host_vector_view<uint32_t, int64_t> mst_graph_num_edges,
+                      InputMatrixView input_graph,
+                      OutputMatrixView output_graph,
+                      VectorView mst_graph_num_edges,
                       bool use_gpu = true)
 {
   if (use_gpu) {
@@ -1185,6 +1340,7 @@ void count_2hop_detours(raft::host_matrix_view<IdxT, int64_t, raft::row_major> k
   }
 }
 
+// TODO allow pinned input for both knn_graph and new_graph
 template <typename IdxT = uint32_t,
           typename g_accessor =
             raft::host_device_accessor<cuda::std::default_accessor<IdxT>, raft::memory_type::host>>
@@ -1213,9 +1369,10 @@ void optimize(
     "cagra::graph::optimize(%zu, %zu, %u)", graph_size, knn_graph_degree, output_graph_degree);
 
   // MST optimization
-  auto mst_graph               = raft::make_host_matrix<IdxT, int64_t, raft::row_major>(0, 0);
-  auto mst_graph_num_edges     = raft::make_host_vector<uint32_t, int64_t>(graph_size);
+  auto mst_graph           = raft::make_pinned_matrix<IdxT, int64_t, raft::row_major>(res, 0, 0);
+  auto mst_graph_num_edges = raft::make_pinned_vector<uint32_t, int64_t>(res, graph_size);
   auto mst_graph_num_edges_ptr = mst_graph_num_edges.data_handle();
+
 #pragma omp parallel for
   for (uint64_t i = 0; i < graph_size; i++) {
     mst_graph_num_edges_ptr[i] = 0;
@@ -1223,10 +1380,10 @@ void optimize(
   if (guarantee_connectivity) {
     raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
       "cagra::graph::optimize/check_connectivity");
-    mst_graph =
-      raft::make_host_matrix<IdxT, int64_t, raft::row_major>(graph_size, output_graph_degree);
+    mst_graph = raft::make_pinned_matrix<IdxT, int64_t, raft::row_major>(
+      res, graph_size, output_graph_degree);
     RAFT_LOG_INFO("MST optimization is used to guarantee graph connectivity.");
-    mst_optimization(res, knn_graph, mst_graph.view(), mst_graph_num_edges.view(), use_gpu);
+    mst_optimization<IdxT>(res, knn_graph, mst_graph.view(), mst_graph_num_edges.view(), use_gpu);
 
     for (uint64_t i = 0; i < graph_size; i++) {
       if (i < 8 || i >= graph_size - 8) {
@@ -1235,6 +1392,37 @@ void optimize(
     }
   }
 
+  uint32_t batch_size =
+    std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
+  const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
+
+  //
+  // If the available device memory is insufficient, do not use the GPU to count
+  // the number of 2-hop detours, but use the CPU.
+  //
+  // TODO: we should decide on a global strategy for this in a single place
+  // it comes down to input memory type and available memory which data should be copied to GPU
+  bool _use_gpu_prune = use_gpu;
+  if (_use_gpu_prune) {
+    try {
+      auto d_detour_count = raft::make_device_mdarray<uint8_t>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, knn_graph_degree));
+      auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size));
+      auto d_output_graph = raft::make_device_mdarray<IdxT>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, output_graph_degree));
+      // TODO we also want to consider pinned memory in case we are short on memory
+      auto d_input_graph = raft::make_device_mdarray<IdxT>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, knn_graph_degree));
+    } catch (std::bad_alloc& e) {
+      RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU");
+      _use_gpu_prune = false;
+    } catch (raft::logic_error& e) {
+      RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU (logic error)");
+      _use_gpu_prune = false;
+    }
+  }
+
   {
     raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
       "cagra::graph::optimize/prune");
@@ -1253,63 +1441,10 @@ void optimize(
     // specified number of edges are picked up for each node, starting with the
     // edge with the lowest number of 2-hop detours.
     //
-    auto detour_count = raft::make_host_matrix<uint8_t, int64_t>(graph_size, knn_graph_degree);
-
-    //
-    // If the available device memory is insufficient, do not use the GPU to count
-    // the number of 2-hop detours, but use the CPU.
-    //
-    bool _use_gpu = use_gpu;
-    if (_use_gpu) {
-      try {
-        auto d_detour_count = raft::make_device_mdarray<uint8_t>(
-          res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, knn_graph_degree));
-        auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
-          res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
-        auto d_input_graph = raft::make_device_mdarray<IdxT>(
-          res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, knn_graph_degree));
-      } catch (std::bad_alloc& e) {
-        RAFT_LOG_DEBUG("Insufficient memory for 2-hop node counting on GPU");
-        _use_gpu = false;
-      } catch (raft::logic_error& e) {
-        RAFT_LOG_DEBUG("Insufficient memory for 2-hop node counting on GPU (logic error)");
-        _use_gpu = false;
-      }
-    }
-    if (_use_gpu) {
-      // Count 2-hop detours on GPU
-      raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-        "cagra::graph::optimize/prune/2-hop-counting-by-GPU");
-      const double time_2hop_count_start = cur_time();
-
-      uint64_t num_keep __attribute__((unused)) = 0;
-      uint64_t num_full __attribute__((unused)) = 0;
-      auto d_detour_count                       = raft::make_device_mdarray<uint8_t>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, knn_graph_degree));
-
-      RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
-                                    0xff,
-                                    graph_size * knn_graph_degree * sizeof(uint8_t),
-                                    raft::resource::get_cuda_stream(res)));
-
-      auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
-      RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(),
-                                    0x00,
-                                    graph_size * sizeof(uint32_t),
-                                    raft::resource::get_cuda_stream(res)));
-
-      auto dev_stats  = raft::make_device_vector<uint64_t>(res, 2);
-      auto host_stats = raft::make_host_vector<uint64_t>(2);
-
+    if (_use_gpu_prune) {
+      // Pruning on GPU
       RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");
 
-      // Copy knn_graph over to device if necessary
-      device_matrix_view_from_host d_input_graph(
-        res,
-        raft::make_host_matrix_view<IdxT, int64_t>(
-          knn_graph.data_handle(), graph_size, knn_graph_degree));
-
       constexpr int MAX_DEGREE = 1024;
       if (knn_graph_degree > MAX_DEGREE) {
         RAFT_FAIL(
@@ -1318,17 +1453,47 @@ void optimize(
           knn_graph_degree,
           MAX_DEGREE);
       }
-      const uint32_t batch_size =
-        std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
-      const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
-      const dim3 threads_prune(32, 1, 1);
-      const dim3 blocks_prune(batch_size, 1, 1);
 
+      const double prune_start = cur_time();
+
+      uint64_t num_keep __attribute__((unused)) = 0;
+      uint64_t num_full __attribute__((unused)) = 0;
+      auto dev_stats                            = raft::make_device_vector<uint64_t>(res, 2);
+      auto host_stats                           = raft::make_host_vector<uint64_t>(2);
       RAFT_CUDA_TRY(cudaMemsetAsync(
         dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, raft::resource::get_cuda_stream(res)));
 
-      const size_t prune_smem_size = knn_graph_degree * sizeof(IdxT);
+      // Copy knn_graph over to device if necessary
+      // TODO: should we use pinned memory if we have issues fitting on GPU?
+      device_matrix_view_from_host d_input_graph(
+        res,
+        raft::make_host_matrix_view<IdxT, int64_t>(
+          knn_graph.data_handle(), graph_size, knn_graph_degree));
+
+      // data structures per batch
+      auto d_detour_count = raft::make_device_mdarray<uint8_t>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, knn_graph_degree));
+      auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size));
+      auto d_output_graph = raft::make_device_mdarray<IdxT>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, output_graph_degree));
+
       for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
+        // initialize the detour_count and num_no_detour_edges for the current batch
+        RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
+                                      0xff,
+                                      batch_size * knn_graph_degree * sizeof(uint8_t),
+                                      raft::resource::get_cuda_stream(res)));
+
+        RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(),
+                                      0x00,
+                                      batch_size * sizeof(uint32_t),
+                                      raft::resource::get_cuda_stream(res)));
+
+        // count 2-hop detours for the current batch
+        const dim3 threads_prune(32, 1, 1);
+        const dim3 blocks_prune(batch_size, 1, 1);
+        const size_t prune_smem_size = knn_graph_degree * sizeof(IdxT);
         kern_prune<MAX_DEGREE, IdxT>
           <<<blocks_prune, threads_prune, prune_smem_size, raft::resource::get_cuda_stream(res)>>>(
             d_input_graph.data_handle(),
@@ -1340,6 +1505,30 @@ void optimize(
             d_detour_count.data_handle(),
             d_num_no_detour_edges.data_handle(),
             dev_stats.data_handle());
+
+        // select smallest-detour neighbors for the current batch
+        const size_t select_smem_size =
+          (knn_graph_degree * knn_graph_degree) * (sizeof(uint16_t) + sizeof(uint32_t));
+        const dim3 threads_select(32, 1, 1);
+        const dim3 blocks_select(batch_size, 1, 1);
+        kern_select_smallest_detour_neighbors<IdxT>
+          <<<blocks_select,
+             threads_select,
+             select_smem_size,
+             raft::resource::get_cuda_stream(res)>>>(d_input_graph.data_handle(),
+                                                     graph_size,
+                                                     knn_graph_degree,
+                                                     output_graph_degree,
+                                                     d_detour_count.data_handle(),
+                                                     d_output_graph.data_handle(),
+                                                     batch_size,
+                                                     i_batch);
+
+        raft::copy(output_graph_ptr,
+                   d_output_graph.data_handle() + i_batch * batch_size * output_graph_degree,
+                   static_cast<size_t>(batch_size) * output_graph_degree,
+                   raft::resource::get_cuda_stream(res));
+
         raft::resource::sync_stream(res);
         RAFT_LOG_DEBUG(
           "# Pruning kNN Graph on GPUs (%.1lf %%)\r",
@@ -1348,96 +1537,93 @@ void optimize(
       raft::resource::sync_stream(res);
       RAFT_LOG_DEBUG("\n");
 
-      raft::copy(detour_count.data_handle(),
-                 d_detour_count.data_handle(),
-                 detour_count.size(),
-                 raft::resource::get_cuda_stream(res));
-
       raft::copy(
         host_stats.data_handle(), dev_stats.data_handle(), 2, raft::resource::get_cuda_stream(res));
       num_keep = host_stats.data_handle()[0];
       num_full = host_stats.data_handle()[1];
 
-      const double time_2hop_count_end = cur_time();
+      const double prune_end = cur_time();
       RAFT_LOG_DEBUG(
-        "# Time for 2-hop detour counting on GPU: %.1lf sec, "
+        "# Time for pruning on GPU: %.1lf sec, "
         "avg_no_detour_edges_per_node: %.2lf/%u, "
         "nodes_with_no_detour_at_all_edges: %.1lf%%",
-        time_2hop_count_end - time_2hop_count_start,
+        prune_end - prune_start,
         (double)num_keep / graph_size,
         output_graph_degree,
         (double)num_full / graph_size * 100);
     } else {
-      // Count 2-hop detours on CPU
-      raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-        "cagra::graph::optimize/prune/2-hop-counting-by-CPU");
-      const double time_2hop_count_start = cur_time();
+      // Pruning on CPU
+      auto detour_count = raft::make_host_matrix<uint8_t, int64_t>(graph_size, knn_graph_degree);
 
-      count_2hop_detours(knn_graph, detour_count.view());
+      {
+        raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+          "cagra::graph::optimize/prune/2-hop-counting-by-CPU");
+        const double time_2hop_count_start = cur_time();
 
-      const double time_2hop_count_end = cur_time();
-      RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec",
-                     time_2hop_count_end - time_2hop_count_start);
-    }
+        count_2hop_detours(knn_graph, detour_count.view());
 
-    // Create pruned kNN graph
-    bool invalid_neighbor_list = false;
+        const double time_2hop_count_end = cur_time();
+        RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec",
+                       time_2hop_count_end - time_2hop_count_start);
+      }
+      bool invalid_neighbor_list = false;
 #pragma omp parallel for
-    for (uint64_t i = 0; i < graph_size; i++) {
-      // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable
-      // count of the neighbors while increasing the target detourable count from zero.
-      uint64_t pk         = 0;
-      uint32_t num_detour = 0;
-      for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) {
-        uint32_t next_num_detour = std::numeric_limits<uint32_t>::max();
-        for (uint64_t k = 0; k < knn_graph_degree; k++) {
-          const auto num_detour_k = detour_count(i, k);
-          // Find the detourable count to check in the next iteration
-          if (num_detour_k > num_detour) {
-            next_num_detour = std::min(static_cast<uint32_t>(num_detour_k), next_num_detour);
-          }
-
-          // Store the neighbor index if its detourable count is equal to `num_detour`.
-          if (num_detour_k != num_detour) { continue; }
+      for (uint64_t i = 0; i < graph_size; i++) {
+        // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable
+        // count of the neighbors while increasing the target detourable count from zero.
+        uint64_t pk         = 0;
+        uint32_t num_detour = 0;
+        for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) {
+          uint32_t next_num_detour = std::numeric_limits<uint32_t>::max();
+          for (uint64_t k = 0; k < knn_graph_degree; k++) {
+            const auto num_detour_k = detour_count(i, k);
+            // Find the detourable count to check in the next iteration
+            if (num_detour_k > num_detour) {
+              next_num_detour = std::min(static_cast<uint32_t>(num_detour_k), next_num_detour);
+            }
 
-          // Check duplication and append
-          const auto candidate_node = knn_graph(i, k);
-          bool dup                  = false;
-          for (uint32_t dk = 0; dk < pk; dk++) {
-            if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) {
-              dup = true;
-              break;
+            // Store the neighbor index if its detourable count is equal to `num_detour`.
+            if (num_detour_k != num_detour) { continue; }
+
+            // Check duplication and append
+            const auto candidate_node = knn_graph(i, k);
+            bool dup                  = false;
+            for (uint32_t dk = 0; dk < pk; dk++) {
+              if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) {
+                dup = true;
+                break;
+              }
             }
-          }
-          if (!dup && candidate_node < graph_size) {
-            output_graph_ptr[i * output_graph_degree + pk] = candidate_node;
-            pk += 1;
+            if (!dup && candidate_node < graph_size) {
+              output_graph_ptr[i * output_graph_degree + pk] = candidate_node;
+              pk += 1;
+            }
+            if (pk >= output_graph_degree) break;
           }
           if (pk >= output_graph_degree) break;
-        }
-        if (pk >= output_graph_degree) break;
 
-        if (next_num_detour == std::numeric_limits<uint32_t>::max()) {
-          // There are no valid edges enough in the initial kNN graph. Break the loop here and catch
-          // the error at the next validation (pk != output_graph_degree).
-          break;
+          if (next_num_detour == std::numeric_limits<uint32_t>::max()) {
+            // There are no valid edges enough in the initial kNN graph. Break the loop here and
+            // catch the error at the next validation (pk != output_graph_degree).
+            break;
+          }
+          num_detour = next_num_detour;
+        }
+        if (pk != output_graph_degree) {
+          RAFT_LOG_DEBUG(
+            "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for "
+            "node %lu in the rank-based node reranking process",
+            output_graph_degree,
+            i);
+          invalid_neighbor_list = true;
         }
-        num_detour = next_num_detour;
-      }
-      if (pk != output_graph_degree) {
-        RAFT_LOG_DEBUG(
-          "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for "
-          "node %lu in the rank-based node reranking process",
-          output_graph_degree,
-          i);
-        invalid_neighbor_list = true;
       }
+      RAFT_EXPECTS(
+        !invalid_neighbor_list,
+        "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too "
+        "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many "
+        "overflows occur during the norm computation between the dataset vectors.");
     }
-    RAFT_EXPECTS(
-      !invalid_neighbor_list,
-      "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too "
-      "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many "
-      "overflows occur during the norm computation between the dataset vectors.");
 
     const double time_prune_end = cur_time();
     RAFT_LOG_DEBUG("# Pruning time: %.1lf ms", (time_prune_end - time_prune_start) * 1000.0);
@@ -1446,155 +1632,281 @@ void optimize(
   auto rev_graph       = raft::make_host_matrix<IdxT, int64_t>(graph_size, output_graph_degree);
   auto rev_graph_count = raft::make_host_vector<uint32_t, int64_t>(graph_size);
 
-  {
-    raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-      "cagra::graph::optimize/reverse");
+  bool _use_gpu_rev_graph = use_gpu;
+  // TODO: should we use pinned memory if we have issues fitting on GPU?
+  if (_use_gpu_rev_graph) {
+    try {
+      auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
+      auto d_dest_nodes =
+        raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
+      auto d_rev_graph = raft::make_device_mdarray<IdxT>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
+    } catch (std::bad_alloc& e) {
+      RAFT_LOG_DEBUG("Insufficient memory for reverse graph on GPU");
+      _use_gpu_rev_graph = false;
+    } catch (raft::logic_error& e) {
+      RAFT_LOG_DEBUG("Insufficient memory for reverse graph on GPU (logic error)");
+      _use_gpu_rev_graph = false;
+    }
+  }
+
+  const double time_make_start = cur_time();
+  if (_use_gpu_rev_graph) {
     //
-    // Make reverse graph
+    // Make reverse graph on GPU
     //
-    const double time_make_start = cur_time();
+    auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
+      res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
 
     device_matrix_view_from_host<IdxT, int64_t> d_rev_graph(res, rev_graph.view());
-    RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph.data_handle(),
-                                  0xff,
-                                  graph_size * output_graph_degree * sizeof(IdxT),
-                                  raft::resource::get_cuda_stream(res)));
+    device_matrix_view_from_host<IdxT, int64_t> d_output_graph(
+      res,
+      raft::make_host_matrix_view<IdxT, int64_t>(
+        output_graph_ptr, graph_size, output_graph_degree));
 
-    auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
-      res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
-    RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph_count.data_handle(),
-                                  0x00,
-                                  graph_size * sizeof(uint32_t),
-                                  raft::resource::get_cuda_stream(res)));
-
-    // Copy full output graph to device once; kernel indexes by column k (no per-column H2D copy).
-    // TODO: depending on available device memory, this may need to be split into multiple copies.
-    auto d_output_graph = raft::make_device_mdarray<IdxT>(
-      res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
-    raft::copy(d_output_graph.data_handle(),
-               output_graph_ptr,
-               static_cast<size_t>(graph_size) * output_graph_degree,
-               raft::resource::get_cuda_stream(res));
+    {
+      raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+        "cagra::graph::optimize/reverse");
+      auto dest_nodes = raft::make_host_vector<IdxT, int64_t>(graph_size);
+      auto d_dest_nodes =
+        raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
 
-    dim3 threads(256, 1, 1);
-    dim3 blocks(1024, 1, 1);
-    for (uint32_t k = 0; k < output_graph_degree; k++) {
-      kern_make_rev_graph_column<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
-        d_output_graph.data_handle(),
-        d_rev_graph.data_handle(),
-        d_rev_graph_count.data_handle(),
-        graph_size,
-        output_graph_degree,
-        k);
-      RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %u / %u    \r", k, output_graph_degree);
-    }
+      RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph.data_handle(),
+                                    0xff,
+                                    graph_size * output_graph_degree * sizeof(IdxT),
+                                    raft::resource::get_cuda_stream(res)));
 
-    raft::resource::sync_stream(res);
-    RAFT_LOG_DEBUG("\n");
+      RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph_count.data_handle(),
+                                    0x00,
+                                    graph_size * sizeof(uint32_t),
+                                    raft::resource::get_cuda_stream(res)));
 
-    if (d_rev_graph.allocated_memory()) {
-      raft::copy(rev_graph.data_handle(),
-                 d_rev_graph.data_handle(),
-                 graph_size * output_graph_degree,
+      for (uint64_t k = 0; k < output_graph_degree; k++) {
+#pragma omp parallel for
+        for (uint64_t i = 0; i < graph_size; i++) {
+          // dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)];
+          dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)];
+        }
+        raft::resource::sync_stream(res);
+
+        raft::copy(d_dest_nodes.data_handle(),
+                   dest_nodes.data_handle(),
+                   graph_size,
+                   raft::resource::get_cuda_stream(res));
+
+        dim3 threads(256, 1, 1);
+        dim3 blocks(1024, 1, 1);
+        kern_make_rev_graph<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
+          d_dest_nodes.data_handle(),
+          d_rev_graph.data_handle(),
+          d_rev_graph_count.data_handle(),
+          graph_size,
+          output_graph_degree);
+        RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u    \r", k, output_graph_degree);
+      }
+
+      raft::resource::sync_stream(res);
+      RAFT_LOG_DEBUG("\n");
+
+      if (d_rev_graph.allocated_memory()) {
+        raft::copy(rev_graph.data_handle(),
+                   d_rev_graph.data_handle(),
+                   graph_size * output_graph_degree,
+                   raft::resource::get_cuda_stream(res));
+      }
+      raft::copy(rev_graph_count.data_handle(),
+                 d_rev_graph_count.data_handle(),
+                 graph_size,
                  raft::resource::get_cuda_stream(res));
+
+      const double time_make_end = cur_time();
+      RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms",
+                     (time_make_end - time_make_start) * 1000.0);
     }
-    raft::copy(rev_graph_count.data_handle(),
-               d_rev_graph_count.data_handle(),
-               graph_size,
-               raft::resource::get_cuda_stream(res));
 
-    const double time_make_end = cur_time();
-    RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms",
-                   (time_make_end - time_make_start) * 1000.0);
-  }
+    {
+      raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+        "cagra::graph::optimize/combine");
 
-  {
-    raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-      "cagra::graph::optimize/combine");
-    //
-    // Create search graphs from MST and pruned and reverse graphs
-    //
-    const double time_replace_start = cur_time();
+      // Merging the prunned graph and the reverse graph
+      const double merge_graph_start = cur_time();
+
+      // Create a boolean variable on the GPU using RAFT device allocator
+      auto d_check_num_protected_edges = raft::make_device_scalar<bool>(res, true);
+
+      const dim3 threads_merge(32, 1, 1);
+      const dim3 blocks_merge(batch_size, 1, 1);
+      const size_t merge_smem_size = (output_graph_degree + output_graph_degree) * sizeof(IdxT);
+      for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
+        kern_merge_graph<IdxT>
+          <<<blocks_merge, threads_merge, merge_smem_size, raft::resource::get_cuda_stream(res)>>>(
+            d_output_graph.data_handle(),
+            d_rev_graph.data_handle(),
+            d_rev_graph_count.data_handle(),
+            graph_size,
+            output_graph_degree,
+            mst_graph.data_handle(),
+            output_graph_degree,
+            mst_graph_num_edges_ptr,
+            batch_size,
+            i_batch,
+            guarantee_connectivity,
+            d_check_num_protected_edges.data_handle());
+      }
+
+      bool check_num_protected_edges = true;
+      raft::copy(&check_num_protected_edges,
+                 d_check_num_protected_edges.data_handle(),
+                 1,
+                 raft::resource::get_cuda_stream(res));
+      raft::resource::sync_stream(res);
+
+      // TODO: is this required?
+      if (d_output_graph.allocated_memory()) {
+        raft::copy(output_graph_ptr,
+                   d_output_graph.data_handle(),
+                   graph_size * output_graph_degree,
+                   raft::resource::get_cuda_stream(res));
+      }
+
+      const auto merge_graph_end = cur_time();
+      RAFT_EXPECTS(check_num_protected_edges,
+                   "Failed to merge the MST, pruned, and reverse edge graphs. "
+                   "Some nodes have too "
+                   "many MST optimization edges.");
+
+      RAFT_LOG_DEBUG("# Time for merging graphs: %.1lf ms",
+                     (merge_graph_end - merge_graph_start) * 1000.0);
+    }
+  } else {
+    {
+      // Make reverse graph on CPU
+      raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+        "cagra::graph::optimize/reverse");
+
+      auto rev_graph_ptr       = rev_graph.data_handle();
+      auto rev_graph_count_ptr = rev_graph_count.data_handle();
 
-    bool check_num_protected_edges = true;
 #pragma omp parallel for
-    for (uint64_t i = 0; i < graph_size; i++) {
-      auto my_rev_graph = rev_graph.data_handle() + (output_graph_degree * i);
-      auto my_out_graph = output_graph_ptr + (output_graph_degree * i);
+      for (uint64_t i = 0; i < graph_size; i++) {
+        rev_graph_count_ptr[i] = 0;
+      }
 
-      // If guarantee_connectivity == true, use a temporal list to merge the neighbor lists of the
-      // graphs.
-      std::vector<IdxT> temp_output_neighbor_list;
-      if (guarantee_connectivity) {
-        temp_output_neighbor_list.resize(output_graph_degree);
-        my_out_graph                   = temp_output_neighbor_list.data();
-        const auto mst_graph_num_edges = mst_graph_num_edges_ptr[i];
-
-        // Set MST graph edges
-        for (uint32_t j = 0; j < mst_graph_num_edges; j++) {
-          my_out_graph[j] = mst_graph(i, j);
+      for (uint32_t k = 0; k < output_graph_degree; k++) {
+#pragma omp parallel for
+        for (uint64_t src_id = 0; src_id < graph_size; src_id++) {
+          const IdxT dest_id =
+            output_graph_ptr[k + (static_cast<uint64_t>(output_graph_degree) * src_id)];
+          if (dest_id >= graph_size) continue;
+          uint32_t pos;
+#pragma omp atomic capture
+          pos = rev_graph_count_ptr[dest_id]++;
+          if (pos < output_graph_degree) {
+            rev_graph_ptr[(static_cast<uint64_t>(output_graph_degree) * dest_id) + pos] =
+              static_cast<IdxT>(src_id);
+          }
         }
+      }
 
-        // Set pruned graph edges
-        for (uint32_t pruned_j = 0, output_j = mst_graph_num_edges;
-             (pruned_j < output_graph_degree) && (output_j < output_graph_degree);
-             pruned_j++) {
-          const auto v = output_graph_ptr[output_graph_degree * i + pruned_j];
-
-          // duplication check
-          bool dup = false;
-          for (uint32_t m = 0; m < output_j; m++) {
-            if (v == my_out_graph[m]) {
-              dup = true;
-              break;
-            }
+      const double time_make_end = cur_time();
+      RAFT_LOG_DEBUG("# Making reverse graph time (CPU): %.1lf ms",
+                     (time_make_end - time_make_start) * 1000.0);
+    }
+
+    {
+      raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+        "cagra::graph::optimize/combine");
+      //
+      // Create search graphs from MST and pruned and reverse graphs
+      //
+      const double time_replace_start = cur_time();
+
+      bool check_num_protected_edges = true;
+#pragma omp parallel for
+      for (uint64_t i = 0; i < graph_size; i++) {
+        auto my_rev_graph = rev_graph.data_handle() + (output_graph_degree * i);
+        auto my_out_graph = output_graph_ptr + (output_graph_degree * i);
+
+        // If guarantee_connectivity == true, use a temporal list to merge the neighbor lists of the
+        // graphs.
+        std::vector<IdxT> temp_output_neighbor_list;
+        if (guarantee_connectivity) {
+          temp_output_neighbor_list.resize(output_graph_degree);
+          my_out_graph                   = temp_output_neighbor_list.data();
+          const auto mst_graph_num_edges = mst_graph_num_edges_ptr[i];
+
+          // Set MST graph edges
+          for (uint32_t j = 0; j < mst_graph_num_edges; j++) {
+            my_out_graph[j] = mst_graph(i, j);
           }
 
-          if (!dup) {
-            my_out_graph[output_j] = v;
-            output_j++;
+          // Set pruned graph edges
+          for (uint32_t pruned_j = 0, output_j = mst_graph_num_edges;
+               (pruned_j < output_graph_degree) && (output_j < output_graph_degree);
+               pruned_j++) {
+            const auto v = output_graph_ptr[output_graph_degree * i + pruned_j];
+
+            // duplication check
+            bool dup = false;
+            for (uint32_t m = 0; m < output_j; m++) {
+              if (v == my_out_graph[m]) {
+                dup = true;
+                break;
+              }
+            }
+
+            if (!dup) {
+              my_out_graph[output_j] = v;
+              output_j++;
+            }
           }
         }
-      }
 
-      const auto num_protected_edges =
-        std::max<uint64_t>(mst_graph_num_edges_ptr[i], output_graph_degree / 2);
-      if (num_protected_edges > output_graph_degree) { check_num_protected_edges = false; }
-      if (num_protected_edges == output_graph_degree) continue;
-
-      // Replace some edges of the output graph with edges of the reverse graph.
-      auto kr = std::min<uint32_t>(rev_graph_count.data_handle()[i], output_graph_degree);
-      while (kr) {
-        kr -= 1;
-        if (my_rev_graph[kr] < graph_size) {
-          uint64_t pos = pos_in_array<IdxT>(my_rev_graph[kr], my_out_graph, output_graph_degree);
-          if (pos < num_protected_edges) { continue; }
-          uint64_t num_shift = pos - num_protected_edges;
-          if (pos >= output_graph_degree) {
-            num_shift = output_graph_degree - num_protected_edges - 1;
+        const auto num_protected_edges =
+          std::max<uint64_t>(mst_graph_num_edges_ptr[i], output_graph_degree / 2);
+        if (num_protected_edges > output_graph_degree) { check_num_protected_edges = false; }
+        if (num_protected_edges == output_graph_degree) continue;
+
+        // Replace some edges of the output graph with edges of the reverse graph.
+        auto kr = std::min<uint32_t>(rev_graph_count.data_handle()[i], output_graph_degree);
+        while (kr) {
+          kr -= 1;
+          if (my_rev_graph[kr] < graph_size) {
+            uint64_t pos = pos_in_array<IdxT>(my_rev_graph[kr], my_out_graph, output_graph_degree);
+            if (pos < num_protected_edges) { continue; }
+            uint64_t num_shift = pos - num_protected_edges;
+            if (pos >= output_graph_degree) {
+              num_shift = output_graph_degree - num_protected_edges - 1;
+            }
+            shift_array<IdxT>(my_out_graph + num_protected_edges, num_shift);
+            my_out_graph[num_protected_edges] = my_rev_graph[kr];
           }
-          shift_array<IdxT>(my_out_graph + num_protected_edges, num_shift);
-          my_out_graph[num_protected_edges] = my_rev_graph[kr];
         }
-      }
 
-      // If guarantee_connectivity == true, move the output neighbor list from the temporal list to
-      // the output list. If false, the copy is not needed because my_out_graph is a pointer to the
-      // output buffer.
-      if (guarantee_connectivity) {
-        for (uint32_t j = 0; j < output_graph_degree; j++) {
-          output_graph_ptr[(output_graph_degree * i) + j] = my_out_graph[j];
+        // If guarantee_connectivity == true, move the output neighbor list from the temporal list
+        // to the output list. If false, the copy is not needed because my_out_graph is a pointer to
+        // the output buffer.
+        if (guarantee_connectivity) {
+          for (uint32_t j = 0; j < output_graph_degree; j++) {
+            output_graph_ptr[(output_graph_degree * i) + j] = my_out_graph[j];
+          }
         }
       }
-    }
-    RAFT_EXPECTS(check_num_protected_edges,
-                 "Failed to merge the MST, pruned, and reverse edge graphs. Some nodes have too "
-                 "many MST optimization edges.");
+      RAFT_EXPECTS(check_num_protected_edges,
+                   "Failed to merge the MST, pruned, and reverse edge graphs. Some nodes have too "
+                   "many MST optimization edges.");
 
-    const double time_replace_end = cur_time();
-    RAFT_LOG_DEBUG("# Replacing edges time: %.1lf ms",
-                   (time_replace_end - time_replace_start) * 1000.0);
+      const double time_replace_end = cur_time();
+      RAFT_LOG_DEBUG("# Replacing edges time: %.1lf ms",
+                     (time_replace_end - time_replace_start) * 1000.0);
+    }
+  }
 
+  // Check stats
+  {
+    raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+      "cagra::graph::optimize/stats");
     /* stats */
     uint64_t num_replaced_edges = 0;
 #pragma omp parallel for reduction(+ : num_replaced_edges)

From 3e9767c930dbdbce7279353eca5ae2e0fcc8586c Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Fri, 20 Feb 2026 01:27:41 -0800
Subject: [PATCH 056/119] Removed max_node_id

---
 cpp/include/cuvs/neighbors/cagra.hpp          |  7 ---
 .../neighbors/detail/cagra/cagra_build.cuh    | 58 +++++++++----------
 .../neighbors/detail/cagra/cagra_search.cuh   |  2 -
 3 files changed, 28 insertions(+), 39 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index d105fa816d..6fd734064c 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -247,13 +247,6 @@ struct search_params : cuvs::neighbors::search_params {
   /** Bit mask used for initial random seed node selection. */
   uint64_t rand_xor_mask = 0x128394;
 
-  /** 
-   * Maximum node ID for random seed selection. 
-   * When > 0, random seeds are constrained to [0, max_node_id) instead of [0, dataset_size).
-   * This is useful when the graph is smaller than the dataset (e.g., iterative build with compression).
-   * Default 0 means no constraint (use dataset_size).
-   */
-  uint32_t max_node_id = 0;
   /** Whether to use the persistent version of the kernel (only SINGLE_CTA is supported a.t.m.) */
   bool persistent = false;
   /** Persistent kernel: time in seconds before the kernel stops if no requests received. */
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 70a29fe379..613b05cb67 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -1990,7 +1990,6 @@ void search_and_optimize(raft::resources const& res,
 {
   // Search.
   // Since there are many queries, divide them into batches and search them.
-  RAFT_LOG_DEBUG("search_and_optimize: search_params.max_node_id=%u", search_params.max_node_id);
   cuvs::spatial::knn::detail::utils::batch_load_iterator<T> query_batch(
     dev_query_view.data_handle(),
     curr_query_size,
@@ -2024,7 +2023,7 @@ void search_and_optimize(raft::resources const& res,
   // Optimize graph
   auto next_graph_size = curr_query_size;
   cagra_graph          = raft::make_host_matrix<IdxT, int64_t>(0, 0);  // delete existing grahp
-  cagra_graph = raft::make_host_matrix<IdxT, int64_t>(next_graph_size, next_graph_degree);
+  cagra_graph          = raft::make_host_matrix<IdxT, int64_t>(next_graph_size, next_graph_degree);
   optimize<IdxT>(
     res, neighbors_view, cagra_graph.view(), flag_last ? params.guarantee_connectivity : 0);
 }
@@ -2125,15 +2124,15 @@ auto iterative_build_graph(
   if (params.compression.has_value()) {
     auto start = std::chrono::high_resolution_clock::now();
     RAFT_EXPECTS(params.metric == cuvs::distance::DistanceType::L2Expanded,
-      "VPQ compression is only supported with L2Expanded distance mertric");
+                 "VPQ compression is only supported with L2Expanded distance mertric");
     idx_opt.emplace(res, params.metric);
-    //idx_opt->update_graph(res, raft::make_const_mdspan(cagra_graph.view()));
+    // idx_opt->update_graph(res, raft::make_const_mdspan(cagra_graph.view()));
     idx_opt->update_dataset(
-    res,
-    // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later
-    cuvs::neighbors::vpq_build<decltype(dev_dataset), half, int64_t>(
-    res, *params.compression, dev_dataset));
-    auto end = std::chrono::high_resolution_clock::now();
+      res,
+      // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later
+      cuvs::neighbors::vpq_build<decltype(dev_dataset), half, int64_t>(
+        res, *params.compression, dev_dataset));
+    auto end        = std::chrono::high_resolution_clock::now();
     auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
     RAFT_LOG_INFO("# VPQ compression time: %.3lf sec", (double)elapsed_ms / 1000);
   }
@@ -2148,25 +2147,28 @@ auto iterative_build_graph(
     // pruning is not used except in the last iteration.
     // (*) The appropriate setting for itopk_size requires careful consideration.
     auto curr_topk       = next_graph_degree + 1;
-    auto curr_itopk_size = next_graph_degree + 32;
+    auto curr_itopk_size = std::max(next_graph_degree + 32, (uint64_t)128);
     if (flag_last) {
       curr_topk       = topk;
       curr_itopk_size = curr_topk + 32;
     }
 
-    RAFT_LOG_INFO(
-      "# graph_size = %lu (%.3lf), graph_degree = %lu, query_size = %lu, itopk = %lu, topk = %lu",
-      (uint64_t)cagra_graph.extent(0),
-      (double)cagra_graph.extent(0) / final_graph_size,
-      (uint64_t)cagra_graph.extent(1),
-      (uint64_t)curr_query_size,
-      (uint64_t)curr_itopk_size,
-      (uint64_t)curr_topk);
+    // RAFT_LOG_INFO(
+    //   "# graph_size = %lu (%.3lf), graph_degree = %lu, query_size = %lu, itopk = %lu, topk =
+    //   %lu", (uint64_t)cagra_graph.extent(0), (double)cagra_graph.extent(0) / final_graph_size,
+    //   (uint64_t)cagra_graph.extent(1),
+    //   (uint64_t)curr_query_size,
+    //   (uint64_t)curr_itopk_size,
+    //   (uint64_t)curr_topk);
 
     cuvs::neighbors::cagra::search_params search_params;
-    search_params.algo        = cuvs::neighbors::cagra::search_algo::AUTO;
-    search_params.max_queries = max_chunk_size;
-    search_params.itopk_size  = curr_itopk_size;
+    search_params.algo           = cuvs::neighbors::cagra::search_algo::AUTO;
+    search_params.max_queries    = max_chunk_size;
+    search_params.itopk_size     = curr_itopk_size;
+    search_params.max_iterations = 8;
+    search_params.search_width   = 1;
+    // This fails. Why?
+    // search_params.persistent = true;
 
     // Create an index (idx), a query view (dev_query_view), and a mdarray for
     // search results (neighbors).
@@ -2174,7 +2176,8 @@ auto iterative_build_graph(
       dev_dataset.data_handle(), (int64_t)curr_graph_size, dev_dataset.extent(1));
     // No compression, create mdspan index
     if (!params.compression.has_value()) {
-      idx_opt.emplace(res, params.metric, dev_dataset_view, raft::make_const_mdspan(cagra_graph.view()));
+      idx_opt.emplace(
+        res, params.metric, dev_dataset_view, raft::make_const_mdspan(cagra_graph.view()));
     } else {
       idx_opt->update_graph(res, raft::make_const_mdspan(cagra_graph.view()));
     }
@@ -2186,11 +2189,6 @@ auto iterative_build_graph(
     auto neighbors_view =
       raft::make_host_matrix_view<IdxT, int64_t>(neighbors_ptr, curr_query_size, curr_topk);
 
-    // Set max_node_id to constrain random seed selection to valid graph nodes
-    search_params.max_node_id = static_cast<uint32_t>(curr_graph_size);
-    RAFT_LOG_DEBUG("iterative_build: Setting search_params.max_node_id=%u (curr_graph_size=%lu)",
-                   search_params.max_node_id, curr_graph_size);
-
     search_and_optimize(res,
                         search_params,
                         idx,
@@ -2208,12 +2206,12 @@ auto iterative_build_graph(
 
     auto end        = std::chrono::high_resolution_clock::now();
     auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
-    RAFT_LOG_INFO("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000);
+    RAFT_LOG_DEBUG("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000);
 
     if (flag_last) { break; }
-    flag_last       = (curr_graph_size == final_graph_size);
+    flag_last            = (curr_graph_size == final_graph_size);
     auto next_graph_size = curr_query_size;
-    curr_graph_size = next_graph_size;
+    curr_graph_size      = next_graph_size;
   }
 
   return cagra_graph;
diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
index 0efb7cfd45..efec4b3f93 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
@@ -51,8 +51,6 @@ void search_main_core(
   raft::device_matrix_view<DistanceT, int64_t, raft::row_major> distances,
   CagraSampleFilterT sample_filter = CagraSampleFilterT())
 {
-  RAFT_LOG_DEBUG("search_main_core: max_node_id=%u, graph.extent(0)=%lu", 
-                 params.max_node_id, graph.extent(0));
   static_assert(std::is_same_v<IndexT, uint32_t>,
                 "Only uint32_t is supported as the graph element type (internal index type)");
   RAFT_LOG_DEBUG("# dataset size = %lu, dim = %lu\n",

From 822faea739f9b77f13642c8201090273e27d32bc Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Fri, 20 Feb 2026 12:14:35 +0000
Subject: [PATCH 057/119] some fixes, cleanup

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 137 ++++++++++--------
 1 file changed, 77 insertions(+), 60 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index f2cd79ecb6..1b7e46e535 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -173,7 +173,7 @@ __global__ void kern_prune(const IdxT* const knn_graph,  // [graph_chunk_size, g
   uint64_t* const num_full   = stats + 1;
 
   const uint64_t iA       = blockIdx.x + (batch_size * batch_id);
-  const uint64_t iA_batch = iA % static_cast<uint64_t>(batch_size);
+  const uint64_t iA_batch = blockIdx.x;
 
   if (iA >= graph_size) { return; }
 
@@ -246,66 +246,69 @@ __global__ void kern_make_rev_graph(const IdxT* const dest_nodes,     // [graph_
   }
 }
 
-// Select output_graph_degree neighbors with smallest detour count per node (writes to device).
-template <class IdxT>
+// For each node, select the output_graph_degree neighbors with the smallest detour counts
+// (pruning update kernel). Expects exactly one warp per block; each block handles one node.
+template <typename IdxT>
 __global__ void kern_select_smallest_detour_neighbors(
-  const IdxT* const knn_graph,
+  const IdxT* const knn_graph,  // [graph_chunk_size, graph_degree]
   uint64_t graph_size,
   uint64_t knn_graph_degree,
   uint64_t output_graph_degree,
-  const uint8_t* const d_detour_count,  // [batch_size, graph_degree]
-  IdxT* output_graph_ptr,               // [batch_size, output_graph_degree]
-  const uint32_t batch_size,
-  const uint32_t batch_id)
+  uint8_t* const d_detour_count,  // [batch_size, graph_degree]
+  IdxT* output_graph_ptr,         // [batch_size, output_graph_degree]
+  const uint32_t batch_size,
+  const uint32_t batch_id,
+  uint32_t* const d_invalid_neighbor_list)  // set to 1 if a node runs out of candidates
 {
-  // FIXME: this does not really work for num_warps > 1
-  constexpr unsigned warp_mask = 0xffffffff;
-  const uint32_t num_warps     = blockDim.x / raft::WarpSize;
-  extern __shared__ unsigned char smem_buf[];
-  uint32_t* smem_indices = reinterpret_cast<uint32_t*>(smem_buf);
-  uint16_t* smem_detour_count =
-    reinterpret_cast<uint16_t*>(&smem_indices[knn_graph_degree * num_warps]);
+  assert(blockDim.x == 32);
 
-  const uint32_t wid     = threadIdx.x / raft::WarpSize;
-  const uint32_t lane_id = threadIdx.x % raft::WarpSize;
-  const uint64_t nid     = static_cast<uint64_t>(blockIdx.x) * num_warps +
-                       (static_cast<uint64_t>(batch_size) * batch_id * num_warps) + wid;
+  // Allocate shared memory for detour counts and their indices
+  extern __shared__ IdxT smem_indices[];
+  uint16_t* smem_detour_count = (uint16_t*)&smem_indices[knn_graph_degree];
 
-  const uint64_t nid_batch = nid % static_cast<uint64_t>(batch_size);
+  const uint64_t nid       = blockIdx.x + (batch_size * batch_id);
+  const uint64_t nid_batch = blockIdx.x;
 
-  if (nid >= graph_size) return;
+  if (nid >= graph_size) { return; }
 
-  for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) {
-    smem_detour_count[(knn_graph_degree * wid) + k] =
-      d_detour_count[nid_batch * knn_graph_degree + k];
-    smem_indices[(knn_graph_degree * wid) + k] = k;
+  // Each thread loads the detour count and neighbor id for its assigned k
+  for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) {
+    smem_detour_count[k] = d_detour_count[nid_batch * knn_graph_degree + k];
+    smem_indices[k]      = knn_graph[knn_graph_degree * nid + k];
   }
-  __syncwarp(warp_mask);
+  __syncwarp();
+
+  const unsigned warp_mask = 0xffffffff;
 
   for (uint32_t i = 0; i < output_graph_degree; i++) {
-    uint32_t local_min = 256;
+    uint32_t local_min = 255;
     uint32_t local_idx = 0xffffffff;
-    for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) {
-      uint32_t c = smem_detour_count[(knn_graph_degree * wid) + k];
-      if (c < local_min) {
-        local_min = c;
-        local_idx = smem_indices[(knn_graph_degree * wid) + k];
+    for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) {
+      if (smem_detour_count[k] < local_min) {
+        local_min = smem_detour_count[k];
+        local_idx = k;
       }
     }
-    uint32_t local_min_with_tag = (local_min << 16) | local_idx;
-    for (int offset = raft::WarpSize / 2; offset > 0; offset /= 2) {
-      uint32_t other     = __shfl_down_sync(warp_mask, local_min_with_tag, offset);
-      local_min_with_tag = (local_min_with_tag <= other) ? local_min_with_tag : other;
+    // Mask idx to 16 bits: an exhausted lane (idx 0xffffffff) must keep count 255 in the tag
+    uint32_t local_min_with_tag = (local_min << 16) | (local_idx & 0xffff);
+    uint32_t warp_min_with_tag  = __reduce_min_sync(warp_mask, local_min_with_tag);
+    uint32_t warp_min_count     = warp_min_with_tag >> 16;
+    uint32_t warp_local_idx     = warp_min_with_tag & 0xffff;
+
+    if (warp_min_count == 255) {
+      // No selectable neighbor left: set the error flag; remaining slots are left unwritten
+      if (threadIdx.x == 0) { atomicExch(d_invalid_neighbor_list, 1u); }
+      break;
     }
-    uint32_t warp_min_tag   = __shfl_sync(warp_mask, local_min_with_tag, 0);
-    uint32_t warp_local_idx = warp_min_tag & 0xffff;
 
-    if (local_idx == warp_local_idx) {
-      output_graph_ptr[nid_batch * output_graph_degree + i] =
-        knn_graph[knn_graph_degree * nid + warp_local_idx];
-      smem_detour_count[knn_graph_degree * wid + warp_local_idx] = 255;
+    IdxT selected_node = smem_indices[warp_local_idx];
+
+    for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) {
+      if (smem_indices[k] == selected_node) { smem_detour_count[k] = 255; }
     }
     __syncwarp(warp_mask);
+
+    if (threadIdx.x == 0) { output_graph_ptr[nid_batch * output_graph_degree + i] = selected_node; }
   }
 }
 
@@ -350,19 +353,18 @@ __global__ void kern_merge_graph(IdxT* output_graph,
   extern __shared__ unsigned char smem_buf[];
   IdxT* smem_sorted_output_graph = reinterpret_cast<IdxT*>(smem_buf);
 
-  const uint32_t wid       = threadIdx.x / 32;
-  const uint32_t lane_id   = threadIdx.x % 32;
-  const uint32_t num_warps = blockDim.x / 32;
-  const uint64_t nid       = blockIdx.x * num_warps + (batch_size * batch_id * num_warps) + wid;
+  assert(blockDim.x == 32);
+
+  const uint64_t nid = blockIdx.x + (batch_size * batch_id);
   if (nid >= graph_size) { return; }
 
-  if (lane_id == 0) check_num_protected_edges[0] = true;
+  if (threadIdx.x == 0) check_num_protected_edges[0] = true;
 
   const auto mst_graph_num_edges = mst_graph_num_edges_ptr[nid];
   // If guarantee_connectivity == true, use a temporal list to merge the
   // neighbor lists of the graphs.
   if (guarantee_connectivity) {
-    for (uint32_t i = lane_id; i < mst_graph_degree; i += 32) {
+    for (uint32_t i = threadIdx.x; i < mst_graph_degree; i += 32) {
       smem_sorted_output_graph[i] = mst_graph[nid * mst_graph_degree + i];
     }
     __syncwarp();
@@ -371,7 +373,7 @@ __global__ void kern_merge_graph(IdxT* output_graph,
          pruned_j++) {
       const auto v     = output_graph[output_graph_degree * nid + pruned_j];
       unsigned int dup = 0;
-      for (uint32_t m = lane_id; m < output_j; m += 32) {
+      for (uint32_t m = threadIdx.x; m < output_j; m += 32) {
         if (v == smem_sorted_output_graph[m]) {
           dup = 1;
           break;
@@ -380,7 +382,7 @@ __global__ void kern_merge_graph(IdxT* output_graph,
 
       unsigned int warp_dup = __ballot_sync(0xffffffff, dup);
       if (warp_dup == 0) {
-        if (lane_id == 0) smem_sorted_output_graph[output_j] = v;
+        if (threadIdx.x == 0) smem_sorted_output_graph[output_j] = v;
         output_j++;
       }
       __syncwarp();
@@ -388,7 +390,7 @@ __global__ void kern_merge_graph(IdxT* output_graph,
   }
 
   else {
-    for (uint32_t i = lane_id; i < output_graph_degree; i += 32) {
+    for (uint32_t i = threadIdx.x; i < output_graph_degree; i += 32) {
       smem_sorted_output_graph[i] = output_graph[output_graph_degree * nid + i];
     }
     __syncwarp();
@@ -409,7 +411,7 @@ __global__ void kern_merge_graph(IdxT* output_graph,
       if (pos < num_protected_edges) { continue; }
       uint64_t num_shift = pos - num_protected_edges;
       if (pos >= output_graph_degree) { num_shift = output_graph_degree - num_protected_edges - 1; }
-      if (lane_id == 0) {
+      if (threadIdx.x == 0) {
         thread_shift_array<IdxT>(smem_sorted_output_graph + num_protected_edges, num_shift);
         smem_sorted_output_graph[num_protected_edges] = rev_graph[kr + (output_graph_degree * nid)];
       }
@@ -417,7 +419,7 @@ __global__ void kern_merge_graph(IdxT* output_graph,
     }
   }
 
-  for (uint32_t i = lane_id; i < output_graph_degree; i += 32) {
+  for (uint32_t i = threadIdx.x; i < output_graph_degree; i += 32) {
     output_graph[(output_graph_degree * nid) + i] = smem_sorted_output_graph[i];
   }
 }
@@ -1477,6 +1479,7 @@ void optimize(
         res, large_tmp_mr, raft::make_extents<int64_t>(batch_size));
       auto d_output_graph = raft::make_device_mdarray<IdxT>(
         res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, output_graph_degree));
+      auto d_invalid_neighbor_list = raft::make_device_scalar<uint32_t>(res, 0u);
 
       for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
         // initialize the detour_count and num_no_detour_edges for the current batch
@@ -1507,8 +1510,7 @@ void optimize(
             dev_stats.data_handle());
 
         // select smallest-detour neighbors for the current batch
-        const size_t select_smem_size =
-          (knn_graph_degree * knn_graph_degree) * (sizeof(uint16_t) + sizeof(uint32_t));
+        const size_t select_smem_size = (knn_graph_degree) * (sizeof(uint16_t) + sizeof(IdxT));
         const dim3 threads_select(32, 1, 1);
         const dim3 blocks_select(batch_size, 1, 1);
         kern_select_smallest_detour_neighbors<IdxT>
@@ -1522,10 +1524,11 @@ void optimize(
                                                      d_detour_count.data_handle(),
                                                      d_output_graph.data_handle(),
                                                      batch_size,
-                                                     i_batch);
+                                                     i_batch,
+                                                     d_invalid_neighbor_list.data_handle());
 
-        raft::copy(output_graph_ptr,
-                   d_output_graph.data_handle() + i_batch * batch_size * output_graph_degree,
+        raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree,
+                   d_output_graph.data_handle(),
                    static_cast<size_t>(batch_size) * output_graph_degree,
                    raft::resource::get_cuda_stream(res));
 
@@ -1537,6 +1540,18 @@ void optimize(
       raft::resource::sync_stream(res);
       RAFT_LOG_DEBUG("\n");
 
+      uint32_t invalid_neighbor_list = 0;
+      raft::copy(&invalid_neighbor_list,
+                 d_invalid_neighbor_list.data_handle(),
+                 1,
+                 raft::resource::get_cuda_stream(res));
+      raft::resource::sync_stream(res);
+      RAFT_EXPECTS(
+        invalid_neighbor_list == 0,
+        "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too "
+        "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many "
+        "overflows occur during the norm computation between the dataset vectors.");
+
       raft::copy(
         host_stats.data_handle(), dev_stats.data_handle(), 2, raft::resource::get_cuda_stream(res));
       num_keep = host_stats.data_handle()[0];
@@ -1642,6 +1657,8 @@ void optimize(
         raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
       auto d_rev_graph = raft::make_device_mdarray<IdxT>(
         res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
+      auto d_output_graph = raft::make_device_mdarray<IdxT>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
     } catch (std::bad_alloc& e) {
       RAFT_LOG_DEBUG("Insufficient memory for reverse graph on GPU");
       _use_gpu_rev_graph = false;
@@ -1760,9 +1777,7 @@ void optimize(
                  d_check_num_protected_edges.data_handle(),
                  1,
                  raft::resource::get_cuda_stream(res));
-      raft::resource::sync_stream(res);
 
-      // TODO: is this required?
       if (d_output_graph.allocated_memory()) {
         raft::copy(output_graph_ptr,
                    d_output_graph.data_handle(),
@@ -1770,6 +1785,8 @@ void optimize(
                    raft::resource::get_cuda_stream(res));
       }
 
+      raft::resource::sync_stream(res);
+
       const auto merge_graph_end = cur_time();
       RAFT_EXPECTS(check_num_protected_edges,
                    "Failed to merge the MST, pruned, and reverse edge graphs. "

From 129ee4ff4b4762b9f15c2de6e63041aabb34d8db Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Wed, 25 Feb 2026 07:20:12 -0800
Subject: [PATCH 058/119] added the fix that also checks whether the kernel
 function pointer has changed

---
 cpp/src/neighbors/detail/smem_utils.cuh | 58 +++++++++++++++----------
 1 file changed, 36 insertions(+), 22 deletions(-)

diff --git a/cpp/src/neighbors/detail/smem_utils.cuh b/cpp/src/neighbors/detail/smem_utils.cuh
index 41c95c0ccd..978f751787 100644
--- a/cpp/src/neighbors/detail/smem_utils.cuh
+++ b/cpp/src/neighbors/detail/smem_utils.cuh
@@ -6,7 +6,6 @@
 
 #include <raft/core/error.hpp>
 
-#include <atomic>
 #include <cstdint>
 #include <mutex>
 
@@ -14,14 +13,18 @@ namespace cuvs::neighbors::detail {
 
 /**
  * @brief (Thread-)Safely invoke a kernel with a maximum dynamic shared memory size.
- * This is required because the sequence `cudaFuncSetAttribute` + kernel launch is not executed
- * atomically.
  *
- * Used this way, the cudaFuncAttributeMaxDynamicSharedMemorySize can only grow and thus
- * guarantees that the kernel is safe to launch.
+ * Maintains a monotonically growing high-water mark for `cudaFuncAttributeMaxDynamicSharedMemorySize`.
+ * When the kernel function pointer changes, the new kernel is brought up to the current high-water
+ * mark; when smem_size exceeds the high-water mark, it is grown for the current kernel.
+ * This guarantees every kernel's attribute is always >= smem_size at the time of launch.
+ *
+ * NB: cudaFuncSetAttribute is per kernel function pointer value, not per type. Multiple kernel
+ * template instantiations may share the same KernelT type (e.g. function pointers with the same
+ * signature), so we track the kernel identity alongside the smem high-water mark.
  *
  * @tparam KernelT The type of the kernel.
- * @tparam InvocationT The type of the invocation function.
+ * @tparam KernelLauncherT The type of the launch function/lambda.
  * @param kernel The kernel function address (for whom the smem-size is specified).
  * @param smem_size The size of the dynamic shared memory to be set.
  * @param launch The kernel launch function/lambda.
@@ -31,23 +34,33 @@ void safely_launch_kernel_with_smem_size(KernelT const& kernel,
                                          uint32_t smem_size,
                                          KernelLauncherT const& launch)
 {
-  // the last smem size is parameterized by the kernel thanks to the template parameter.
-  static std::atomic<uint32_t> current_smem_size{0};
-  auto last_smem_size = current_smem_size.load(std::memory_order_relaxed);
-  if (smem_size > last_smem_size) {
-    // We still need a mutex for the critical section: actualize last_smem_size and set the
-    // attribute.
-    static auto mutex = std::mutex{};
-    auto guard        = std::lock_guard<std::mutex>{mutex};
-    if (!current_smem_size.compare_exchange_strong(
-          last_smem_size, smem_size, std::memory_order_relaxed, std::memory_order_relaxed)) {
-      // The value has been updated by another thread between the load and the mutex acquisition.
-      if (smem_size > last_smem_size) {
-        current_smem_size.store(smem_size, std::memory_order_relaxed);
-      }
+  // current_smem_size is a monotonically growing high-water mark across all kernel pointers.
+  // current_kernel tracks which kernel pointer was last used.
+  static uint32_t current_smem_size{0};
+  static KernelT current_kernel{KernelT{}};
+  static std::mutex mutex;
+
+  {
+    std::lock_guard<std::mutex> guard(mutex);
+
+    auto last_kernel    = current_kernel;
+    auto last_smem_size = current_smem_size;
+
+    // When the kernel function pointer changes, bring the new kernel up to the global high-water
+    // mark. This is necessary because cudaFuncSetAttribute applies to a specific function pointer,
+    // not to the pointer type — different template instantiations may share the same KernelT.
+    if (kernel != last_kernel) {
+      current_kernel     = kernel;
+      auto launch_status =
+        cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, last_smem_size);
+      RAFT_EXPECTS(launch_status == cudaSuccess,
+                   "Failed to set max dynamic shared memory size to %u bytes",
+                   last_smem_size);
     }
-    // Only update if the last seen value is smaller than the new one.
+    // When smem_size exceeds the high-water mark, grow it for the current kernel.
+    // If the kernel also changed above, this handles the case where smem_size > last_smem_size.
     if (smem_size > last_smem_size) {
+      current_smem_size  = smem_size;
       auto launch_status =
         cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
       RAFT_EXPECTS(launch_status == cudaSuccess,
@@ -55,7 +68,8 @@ void safely_launch_kernel_with_smem_size(KernelT const& kernel,
                    smem_size);
     }
   }
-  // We don't need to guard the kernel launch because the smem_size can only grow.
+  // The kernel launch is outside the lock: any concurrent cudaFuncSetAttribute can only increase
+  // the limit, so the launch is always safe.
   return launch(kernel);
 }
 

From 5ec30278eb16a570889d9c08b37d75925274bcce Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Wed, 25 Feb 2026 07:36:35 -0800
Subject: [PATCH 059/119] merged main in

---
 cpp/src/neighbors/detail/smem_utils.cuh | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/cpp/src/neighbors/detail/smem_utils.cuh b/cpp/src/neighbors/detail/smem_utils.cuh
index 978f751787..74838f9be9 100644
--- a/cpp/src/neighbors/detail/smem_utils.cuh
+++ b/cpp/src/neighbors/detail/smem_utils.cuh
@@ -14,10 +14,11 @@ namespace cuvs::neighbors::detail {
 /**
  * @brief (Thread-)Safely invoke a kernel with a maximum dynamic shared memory size.
  *
- * Maintains a monotonically growing high-water mark for `cudaFuncAttributeMaxDynamicSharedMemorySize`.
- * When the kernel function pointer changes, the new kernel is brought up to the current high-water
- * mark; when smem_size exceeds the high-water mark, it is grown for the current kernel.
- * This guarantees every kernel's attribute is always >= smem_size at the time of launch.
+ * Maintains a monotonically growing high-water mark for
+ * `cudaFuncAttributeMaxDynamicSharedMemorySize`. When the kernel function pointer changes, the new
+ * kernel is brought up to the current high-water mark; when smem_size exceeds the high-water mark,
+ * it is grown for the current kernel. This guarantees every kernel's attribute is always >=
+ * smem_size at the time of launch.
  *
  * NB: cudaFuncSetAttribute is per kernel function pointer value, not per type. Multiple kernel
  * template instantiations may share the same KernelT type (e.g. function pointers with the same
@@ -50,7 +51,7 @@ void safely_launch_kernel_with_smem_size(KernelT const& kernel,
     // mark. This is necessary because cudaFuncSetAttribute applies to a specific function pointer,
     // not to the pointer type — different template instantiations may share the same KernelT.
     if (kernel != last_kernel) {
-      current_kernel     = kernel;
+      current_kernel = kernel;
       auto launch_status =
         cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, last_smem_size);
       RAFT_EXPECTS(launch_status == cudaSuccess,
@@ -60,7 +61,7 @@ void safely_launch_kernel_with_smem_size(KernelT const& kernel,
     // When smem_size exceeds the high-water mark, grow it for the current kernel.
     // If the kernel also changed above, this handles the case where smem_size > last_smem_size.
     if (smem_size > last_smem_size) {
-      current_smem_size  = smem_size;
+      current_smem_size = smem_size;
       auto launch_status =
         cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
       RAFT_EXPECTS(launch_status == cudaSuccess,

From 9b1f741ca39eb4624a08b51d54a2429fa6b08eff Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Wed, 25 Feb 2026 15:41:10 +0000
Subject: [PATCH 060/119] some fixes

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 26 ++++++++++++-------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 1b7e46e535..8705f555b6 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -268,21 +268,23 @@ __global__ void kern_select_smallest_detour_neighbors(
 
   const uint64_t nid       = blockIdx.x + (batch_size * batch_id);
   const uint64_t nid_batch = blockIdx.x;
+  const uint32_t maxval16  = 0x0000ffff;
 
   if (nid >= graph_size) { return; }
 
-  // Each uint64_t loads detour_count for its assigned k
+  // Load indices and detour counts for each neighbor; invalidate out-of-bounds entries
   for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) {
-    smem_detour_count[k] = d_detour_count[nid_batch * knn_graph_degree + k];
     smem_indices[k]      = knn_graph[knn_graph_degree * nid + k];
+    smem_detour_count[k] = (smem_indices[k] >= graph_size)
+                             ? maxval16
+                             : (uint16_t)d_detour_count[nid_batch * knn_graph_degree + k];
   }
   __syncwarp();
 
   const unsigned warp_mask = 0xffffffff;
-
   for (uint32_t i = 0; i < output_graph_degree; i++) {
-    uint32_t local_min = 255;
-    uint32_t local_idx = 0xffffffff;
+    uint32_t local_min = maxval16;
+    uint32_t local_idx = maxval16;
     for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) {
       if (smem_detour_count[k] < local_min) {
         local_min = smem_detour_count[k];
@@ -295,8 +297,7 @@ __global__ void kern_select_smallest_detour_neighbors(
     uint32_t warp_min_count     = warp_min_with_tag >> 16;
     uint32_t warp_local_idx     = warp_min_with_tag & 0xffff;
 
-    if (warp_min_count == 255) {
-      // No valid position left; set error flag and fill remaining slots with sentinel
+    if (warp_min_count == maxval16 || warp_local_idx == maxval16) {
       if (threadIdx.x == 0) { atomicExch(d_invalid_neighbor_list, 1u); }
       break;
     }
@@ -304,7 +305,7 @@ __global__ void kern_select_smallest_detour_neighbors(
     IdxT selected_node = smem_indices[warp_local_idx];
 
     for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) {
-      if (smem_indices[k] == selected_node) { smem_detour_count[k] = 255; }
+      if (smem_indices[k] == selected_node) { smem_detour_count[k] = maxval16; }
     }
     __syncwarp(warp_mask);
 
@@ -1355,7 +1356,11 @@ void optimize(
 {
   RAFT_LOG_DEBUG(
     "# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));
+
+  // large temporary memory for large arrays, e.g. everything >= O(graph_size)
   auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);
+  // temporary memory for small arrays, e.g. everything <= O(batchsize * graph_degree)
+  // auto tmp_mr = raft::resource::get_tmp_workspace_resource(res);
 
   RAFT_EXPECTS(knn_graph.extent(0) == new_graph.extent(0),
                "Each input array is expected to have the same number of rows");
@@ -1527,9 +1532,12 @@ void optimize(
                                                      i_batch,
                                                      d_invalid_neighbor_list.data_handle());
 
+        size_t copy_size =
+          std::min(static_cast<size_t>(batch_size), graph_size - i_batch * batch_size) *
+          output_graph_degree;
         raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree,
                    d_output_graph.data_handle(),
-                   static_cast<size_t>(batch_size) * output_graph_degree,
+                   copy_size,
                    raft::resource::get_cuda_stream(res));
 
         raft::resource::sync_stream(res);

From 18647117ca3e9fa4242f258d4c65f59302451041 Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Wed, 25 Feb 2026 07:48:49 -0800
Subject: [PATCH 061/119] style fixes

---
 cpp/bench/ann/src/common/benchmark.hpp              | 1 -
 cpp/src/neighbors/detail/cagra/compute_distance.hpp | 8 +++++---
 cpp/src/neighbors/detail/cagra/device_common.hpp    | 4 ++--
 cpp/tests/neighbors/ann_cagra.cuh                   | 9 ++++-----
 python/cuvs_bench/cuvs_bench/run/run.py             | 2 --
 5 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index 1d5239ec9b..22859e9ab8 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -647,7 +647,6 @@ inline auto run_main(int argc, char** argv) -> int
   char* conf_path = argv[--argc];
   std::ifstream conf_stream(conf_path);
 
-
   for (int i = 1; i < argc; i++) {
     if (parse_bool_flag(argv[i], "--force", force_overwrite) ||
         parse_bool_flag(argv[i], "--build", build_mode) ||
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
index c314d4d0a3..19b6c1db71 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
@@ -239,13 +239,15 @@ struct dataset_descriptor_host {
     template <typename InitF>
     state(InitF init, size_t size) : ready{false}, value{std::make_tuple(init, size)}
     {
-      // RAFT_LOG_INFO("trying to create a descriptor state %p", reinterpret_cast<std::uintptr_t>(this));
+      // RAFT_LOG_INFO("trying to create a descriptor state %p",
+      // reinterpret_cast<std::uintptr_t>(this));
     }
 
     ~state() noexcept
     {
       if (std::holds_alternative<ready_t>(value)) {
-        // RAFT_LOG_INFO("trying to free descriptor state %p", reinterpret_cast<std::uintptr_t>(this));
+        // RAFT_LOG_INFO("trying to free descriptor state %p",
+        // reinterpret_cast<std::uintptr_t>(this));
         auto& [ptr, stream] = std::get<ready_t>(value);
         RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(ptr, stream));
       }
diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp
index 3d3a25c8eb..0b75de6bab 100644
--- a/cpp/src/neighbors/detail/cagra/device_common.hpp
+++ b/cpp/src/neighbors/detail/cagra/device_common.hpp
@@ -103,11 +103,11 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes(
   const uint32_t traversed_hash_bitlen,
   const uint32_t block_id   = 0,
   const uint32_t num_blocks = 1,
-  const IndexT graph_size = 0)
+  const IndexT graph_size   = 0)
 {
   const auto team_size_bits = dataset_desc.team_size_bitshift_from_smem();
   const auto max_i = raft::round_up_safe<uint32_t>(num_pickup, warp_size >> team_size_bits);
-  const auto compute_distance = dataset_desc.compute_distance_impl;
+  const auto compute_distance   = dataset_desc.compute_distance_impl;
   const IndexT seed_index_limit = graph_size > 0 ? graph_size : dataset_desc.size;
 
   for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += (blockDim.x >> team_size_bits)) {
diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh
index 77dbcb683c..80d033c3b4 100644
--- a/cpp/tests/neighbors/ann_cagra.cuh
+++ b/cpp/tests/neighbors/ann_cagra.cuh
@@ -1508,10 +1508,9 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {100},
     {1000000},
     {768},  // dim
-    {16},                                                    // k
-    {
-      //graph_build_algo::IVF_PQ,
-     //graph_build_algo::NN_DESCENT,
+    {16},   // k
+    {       // graph_build_algo::IVF_PQ,
+     // graph_build_algo::NN_DESCENT,
      graph_build_algo::ITERATIVE_CAGRA_SEARCH},
     {search_algo::AUTO},
     {10},
@@ -1632,7 +1631,7 @@ inline std::vector<AnnCagraInputs> generate_inputs()
   //   {std::optional<bool>{std::nullopt}},
   //   {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL,
   //    cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL});  // don't demand high recall
-                                                                // without refinement
+  // without refinement
   for (uint32_t pq_len : {2}) {  // for now, only pq_len = 2 is supported, more options coming  soon
     for (uint32_t vq_n_centers : {100, 1000}) {
       for (auto input : inputs2) {
diff --git a/python/cuvs_bench/cuvs_bench/run/run.py b/python/cuvs_bench/cuvs_bench/run/run.py
index bc3fd15028..6ec7c04847 100644
--- a/python/cuvs_bench/cuvs_bench/run/run.py
+++ b/python/cuvs_bench/cuvs_bench/run/run.py
@@ -644,8 +644,6 @@ def run_benchmark(
     conf_file = prepare_conf_file(dataset_conf, subset_size, count, batch_size)
     algos_conf_fs = gather_algorithm_configs(scripts_path, configuration)
 
-    
-
     allowed_algos = algorithms.split(",") if algorithms else None
     allowed_groups = groups.split(",") if groups else None
     allowed_algo_groups = (

From a92aa64caefbb649b7404efc17bd9188eafa9e6d Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Thu, 26 Feb 2026 01:16:31 -0800
Subject: [PATCH 062/119] optimisation attempt: skip optimisation except last
 step

---
 cpp/include/cuvs/neighbors/cagra.hpp          |   8 ++
 .../neighbors/detail/cagra/cagra_build.cuh    | 135 ++++++++++++++----
 2 files changed, 113 insertions(+), 30 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index 6fd734064c..247d29649d 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -119,6 +119,14 @@ struct index_params : cuvs::neighbors::index_params {
    */
   bool guarantee_connectivity = false;
 
+  /**
+   * Whether to skip graph optimization (pruning, reverse edges, MST) during non-final iterations
+   * of iterative graph building. When true, search results are copied directly into the device
+   * graph without host round-trips. Only applies to iterative_search_params graph builds; the
+   * final iteration always runs full optimization.
+   */
+  bool skip_graph_optimization = false;
+
   /**
    * Whether to add the dataset content to the index, i.e.:
    *
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 0ba98a4447..9b41f70f64 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -1971,6 +1971,53 @@ struct mmap_owner {
   size_t size_;
 };
 
+template <typename T, typename IdxT>
+void search_to_device_graph(raft::resources const& res,
+                            const cuvs::neighbors::cagra::search_params& search_params,
+                            const index<T, IdxT>& idx,
+                            raft::device_matrix_view<const T, int64_t> dev_query_view,
+                            raft::device_matrix_view<IdxT, int64_t> dev_neighbors,
+                            raft::device_matrix_view<float, int64_t> dev_distances,
+                            raft::device_matrix_view<IdxT, int64_t> dev_graph_output,
+                            size_t curr_query_size,
+                            size_t next_graph_degree,
+                            size_t curr_topk,
+                            uint64_t max_chunk_size)
+{
+  cuvs::spatial::knn::detail::utils::batch_load_iterator<T> query_batch(
+    dev_query_view.data_handle(),
+    curr_query_size,
+    dev_query_view.extent(1),
+    max_chunk_size,
+    raft::resource::get_cuda_stream(res),
+    raft::resource::get_workspace_resource(res));
+
+  for (const auto& batch : query_batch) {
+    auto batch_dev_query_view = raft::make_device_matrix_view<const T, int64_t>(
+      batch.data(), batch.size(), dev_query_view.extent(1));
+    auto batch_dev_neighbors_view = raft::make_device_matrix_view<IdxT, int64_t>(
+      dev_neighbors.data_handle(), batch.size(), curr_topk);
+    auto batch_dev_distances_view = raft::make_device_matrix_view<float, int64_t>(
+      dev_distances.data_handle(), batch.size(), curr_topk);
+
+    cuvs::neighbors::cagra::search(
+      res, search_params, idx, batch_dev_query_view, batch_dev_neighbors_view,
+      batch_dev_distances_view);
+
+    RAFT_CUDA_TRY(cudaMemcpy2DAsync(
+      dev_graph_output.data_handle() + batch.offset() * next_graph_degree,
+      next_graph_degree * sizeof(IdxT),
+      dev_neighbors.data_handle(),
+      curr_topk * sizeof(IdxT),
+      next_graph_degree * sizeof(IdxT),
+      batch.size(),
+      cudaMemcpyDeviceToDevice,
+      raft::resource::get_cuda_stream(res)));
+  }
+
+  raft::resource::sync_stream(res);
+}
+
 template <typename T, typename IdxT>
 void search_and_optimize(raft::resources const& res,
                          const cuvs::neighbors::cagra::search_params& search_params,
@@ -2120,6 +2167,10 @@ auto iterative_build_graph(
   bool flag_last       = false;
   auto curr_graph_size = initial_graph_size;
 
+  // Device graph for skip_graph_optimization: keeps the graph on device between iterations.
+  auto dev_graph        = raft::make_device_matrix<IdxT, int64_t>(res, 0, 0);
+  bool use_device_graph = false;
+
   // Generate the compressed index once if compression is enabled
   std::optional<index<T, IdxT>> idx_opt;
   if (params.compression.has_value()) {
@@ -2154,13 +2205,7 @@ auto iterative_build_graph(
       curr_itopk_size = curr_topk + 32;
     }
 
-    // RAFT_LOG_INFO(
-    //   "# graph_size = %lu (%.3lf), graph_degree = %lu, query_size = %lu, itopk = %lu, topk =
-    //   %lu", (uint64_t)cagra_graph.extent(0), (double)cagra_graph.extent(0) / final_graph_size,
-    //   (uint64_t)cagra_graph.extent(1),
-    //   (uint64_t)curr_query_size,
-    //   (uint64_t)curr_itopk_size,
-    //   (uint64_t)curr_topk);
+    bool do_skip = false;//params.skip_graph_optimization && !flag_last;
 
     cuvs::neighbors::cagra::search_params search_params;
     search_params.algo           = cuvs::neighbors::cagra::search_algo::AUTO;
@@ -2168,42 +2213,72 @@ auto iterative_build_graph(
     search_params.itopk_size     = curr_itopk_size;
     search_params.max_iterations = 8;
     search_params.search_width   = 1;
-    // This fails. Why?
-    // search_params.persistent = true;
 
     // Create an index (idx), a query view (dev_query_view), and a mdarray for
     // search results (neighbors).
     auto dev_dataset_view = raft::make_device_matrix_view<const T, int64_t>(
       dev_dataset.data_handle(), (int64_t)curr_graph_size, dev_dataset.extent(1));
-    // No compression, create mdspan index
+
     if (!params.compression.has_value()) {
-      idx_opt.emplace(
-        res, params.metric, dev_dataset_view, raft::make_const_mdspan(cagra_graph.view()));
+      if (use_device_graph) {
+        idx_opt.emplace(
+          res, params.metric, dev_dataset_view, raft::make_const_mdspan(dev_graph.view()));
+      } else {
+        idx_opt.emplace(
+          res, params.metric, dev_dataset_view, raft::make_const_mdspan(cagra_graph.view()));
+      }
     } else {
-      idx_opt->update_graph(res, raft::make_const_mdspan(cagra_graph.view()));
+      if (use_device_graph) {
+        idx_opt->update_graph(res, raft::make_const_mdspan(dev_graph.view()));
+      } else {
+        idx_opt->update_graph(res, raft::make_const_mdspan(cagra_graph.view()));
+      }
     }
     const auto& idx = *idx_opt;
 
     auto dev_query_view = raft::make_device_matrix_view<const T, int64_t>(
       dev_dataset.data_handle(), (int64_t)curr_query_size, dev_dataset.extent(1));
 
-    auto neighbors_view =
-      raft::make_host_matrix_view<IdxT, int64_t>(neighbors_ptr, curr_query_size, curr_topk);
-
-    search_and_optimize(res,
-                        search_params,
-                        idx,
-                        dev_query_view,
-                        dev_neighbors.view(),
-                        dev_distances.view(),
-                        neighbors_view,
-                        cagra_graph,
-                        curr_query_size,
-                        next_graph_degree,
-                        curr_topk,
-                        max_chunk_size,
-                        flag_last,
-                        params);
+    if (do_skip) {
+      auto dev_graph_next =
+        raft::make_device_matrix<IdxT, int64_t>(res, curr_query_size, next_graph_degree);
+
+      search_to_device_graph(res,
+                             search_params,
+                             idx,
+                             dev_query_view,
+                             dev_neighbors.view(),
+                             dev_distances.view(),
+                             dev_graph_next.view(),
+                             curr_query_size,
+                             next_graph_degree,
+                             curr_topk,
+                             max_chunk_size);
+
+      dev_graph        = std::move(dev_graph_next);
+      use_device_graph = true;
+    } else {
+      auto neighbors_view =
+        raft::make_host_matrix_view<IdxT, int64_t>(neighbors_ptr, curr_query_size, curr_topk);
+
+      search_and_optimize(res,
+                          search_params,
+                          idx,
+                          dev_query_view,
+                          dev_neighbors.view(),
+                          dev_distances.view(),
+                          neighbors_view,
+                          cagra_graph,
+                          curr_query_size,
+                          next_graph_degree,
+                          curr_topk,
+                          max_chunk_size,
+                          flag_last,
+                          params);
+
+      dev_graph        = raft::make_device_matrix<IdxT, int64_t>(res, 0, 0);
+      use_device_graph = false;
+    }
 
     auto end        = std::chrono::high_resolution_clock::now();
     auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

From 6e42fb12418cd7f71777f9b642544b45fd796f5d Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Thu, 26 Feb 2026 02:13:51 -0800
Subject: [PATCH 063/119] make optimize() accept device graph

---
 .../neighbors/detail/cagra/cagra_build.cuh    | 52 +++++++++++++------
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 23 +++++---
 2 files changed, 53 insertions(+), 22 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 9b41f70f64..f1f55c90a0 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -1904,7 +1904,8 @@ void optimize(
   raft::resources const& res,
   raft::mdspan<IdxT, raft::matrix_extent<int64_t>, raft::row_major, g_accessor> knn_graph,
   raft::host_matrix_view<IdxT, int64_t, raft::row_major> new_graph,
-  const bool guarantee_connectivity = false)
+  const bool guarantee_connectivity = false,
+  const IdxT* d_knn_graph_ptr       = nullptr)
 {
   using internal_IdxT = typename std::make_unsigned<IdxT>::type;
 
@@ -1922,7 +1923,12 @@ void optimize(
       knn_graph.extent(1));
 
   cagra::detail::graph::optimize(
-    res, knn_graph_internal, new_graph_internal, guarantee_connectivity);
+    res,
+    knn_graph_internal,
+    new_graph_internal,
+    guarantee_connectivity,
+    true,
+    reinterpret_cast<const internal_IdxT*>(d_knn_graph_ptr));
 }
 
 // RAII wrapper for allocating memory with Transparent HugePage
@@ -2034,14 +2040,20 @@ void search_and_optimize(raft::resources const& res,
                          bool flag_last,
                          const index_params& params)
 {
-  // Search.
-  // Since there are many queries, divide them into batches and search them.
+  auto stream = raft::resource::get_cuda_stream(res);
+
+  // Accumulate search results on device to avoid D-to-H + H-to-D round-trip.
+  auto dev_knn_graph =
+    raft::make_device_matrix<IdxT, int64_t>(res, curr_query_size, curr_topk);
+
+  // Search in batches, accumulate results on both device and host.
+  // Host copy is needed by optimize Phase 3 (edge selection) which currently runs on CPU.
   cuvs::spatial::knn::detail::utils::batch_load_iterator<T> query_batch(
     dev_query_view.data_handle(),
     curr_query_size,
     dev_query_view.extent(1),
     max_chunk_size,
-    raft::resource::get_cuda_stream(res),
+    stream,
     raft::resource::get_workspace_resource(res));
   for (const auto& batch : query_batch) {
     auto batch_dev_query_view = raft::make_device_matrix_view<const T, int64_t>(
@@ -2058,20 +2070,28 @@ void search_and_optimize(raft::resources const& res,
                                    batch_dev_neighbors_view,
                                    batch_dev_distances_view);
 
-    auto batch_neighbors_view = raft::make_host_matrix_view<IdxT, int64_t>(
-      neighbors_view.data_handle() + batch.offset() * curr_topk, batch.size(), curr_topk);
-    raft::copy(batch_neighbors_view.data_handle(),
+    // D-to-D: accumulate into device knn_graph
+    raft::copy(dev_knn_graph.data_handle() + batch.offset() * curr_topk,
+               batch_dev_neighbors_view.data_handle(),
+               batch.size() * curr_topk,
+               stream);
+
+    // D-to-H: still needed for optimize Phase 3 (host edge selection)
+    raft::copy(neighbors_view.data_handle() + batch.offset() * curr_topk,
                batch_dev_neighbors_view.data_handle(),
-               batch_neighbors_view.size(),
-               raft::resource::get_cuda_stream(res));
+               batch.size() * curr_topk,
+               stream);
   }
 
-  // Optimize graph
+  // Optimize graph, passing device knn_graph to skip H-to-D copy inside optimize Phase 2.
   auto next_graph_size = curr_query_size;
-  cagra_graph          = raft::make_host_matrix<IdxT, int64_t>(0, 0);  // delete existing grahp
+  cagra_graph          = raft::make_host_matrix<IdxT, int64_t>(0, 0);
   cagra_graph          = raft::make_host_matrix<IdxT, int64_t>(next_graph_size, next_graph_degree);
-  optimize<IdxT>(
-    res, neighbors_view, cagra_graph.view(), flag_last ? params.guarantee_connectivity : 0);
+  optimize<IdxT>(res,
+                 neighbors_view,
+                 cagra_graph.view(),
+                 flag_last ? params.guarantee_connectivity : 0,
+                 dev_knn_graph.data_handle());
 }
 
 template <typename T,
@@ -2188,6 +2208,7 @@ auto iterative_build_graph(
     auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
     RAFT_LOG_INFO("# VPQ compression time: %.3lf sec", (double)elapsed_ms / 1000);
   }
+  bool do_skip = true;
   while (true) {
     auto start           = std::chrono::high_resolution_clock::now();
     auto curr_query_size = std::min(2 * curr_graph_size, final_graph_size);
@@ -2205,7 +2226,8 @@ auto iterative_build_graph(
       curr_itopk_size = curr_topk + 32;
     }
 
-    bool do_skip = false;//params.skip_graph_optimization && !flag_last;
+    do_skip = false;//params.skip_graph_optimization && !flag_last;
+    RAFT_LOG_INFO("# do_skip = %s", do_skip ? "true" : "false");
 
     cuvs::neighbors::cagra::search_params search_params;
     search_params.algo           = cuvs::neighbors::cagra::search_algo::AUTO;
diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 69175f152f..89f9db082f 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -31,6 +31,7 @@
 #include <climits>
 #include <iostream>
 #include <memory>
+#include <optional>
 #include <random>
 
 namespace cuvs::neighbors::cagra::detail::graph {
@@ -1143,7 +1144,8 @@ void optimize(
   raft::mdspan<IdxT, raft::matrix_extent<int64_t>, raft::row_major, g_accessor> knn_graph,
   raft::host_matrix_view<IdxT, int64_t, raft::row_major> new_graph,
   const bool guarantee_connectivity = true,
-  const bool use_gpu                = true)
+  const bool use_gpu                = true,
+  const IdxT* d_knn_graph_ptr       = nullptr)
 {
   RAFT_LOG_DEBUG(
     "# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));
@@ -1248,11 +1250,18 @@ void optimize(
 
       RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");
 
-      // Copy knn_graph over to device if necessary
-      device_matrix_view_from_host d_input_graph(
-        res,
-        raft::make_host_matrix_view<IdxT, int64_t>(
-          knn_graph.data_handle(), graph_size, knn_graph_degree));
+      // Use device knn_graph directly if provided; otherwise copy from host.
+      std::optional<device_matrix_view_from_host<IdxT, int64_t>> d_input_graph_copy;
+      const IdxT* d_input_graph_handle;
+      if (d_knn_graph_ptr != nullptr) {
+        d_input_graph_handle = d_knn_graph_ptr;
+      } else {
+        d_input_graph_copy.emplace(
+          res,
+          raft::make_host_matrix_view<IdxT, int64_t>(
+            knn_graph.data_handle(), graph_size, knn_graph_degree));
+        d_input_graph_handle = d_input_graph_copy->data_handle();
+      }
 
       constexpr int MAX_DEGREE = 1024;
       if (knn_graph_degree > MAX_DEGREE) {
@@ -1273,7 +1282,7 @@ void optimize(
       for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
         kern_prune<MAX_DEGREE, IdxT>
           <<<blocks_prune, threads_prune, 0, raft::resource::get_cuda_stream(res)>>>(
-            d_input_graph.data_handle(),
+            d_input_graph_handle,
             graph_size,
             knn_graph_degree,
             output_graph_degree,

From c540cbc5ba7a7522568851e5e24695d6981d134b Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Thu, 26 Feb 2026 03:37:40 -0800
Subject: [PATCH 064/119] use reconstructed queries and free the original
 dataset in cagraq iterative graph construction

---
 .../neighbors/detail/cagra/cagra_build.cuh    | 88 +++++++++++++++++--
 1 file changed, 80 insertions(+), 8 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index f1f55c90a0..eb452fa445 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -1977,6 +1977,58 @@ struct mmap_owner {
   size_t size_;
 };
 
+template <typename T, typename MathT>
+__global__ void kern_reconstruct_vpq_queries(const uint8_t* encoded_data,
+                                             uint32_t encoded_row_len,  // bytes per encoded row
+                                             const MathT* vq_codebook,  // dim values per VQ code
+                                             const MathT* pq_codebook,  // pq_len values per code
+                                             uint32_t dim,
+                                             uint32_t pq_len,
+                                             uint64_t offset,  // first encoded row to decode
+                                             uint32_t batch_size,  // number of rows to decode
+                                             T* output)  // row-major [batch_size, dim]
+{  // Decodes encoded rows [offset, offset + batch_size) into output; one block per row.
+  const uint64_t batch_idx = blockIdx.x;
+  if (batch_idx >= batch_size) return;
+  const uint64_t vec_idx       = offset + batch_idx;
+  const uint8_t* vec_data      = encoded_data + vec_idx * encoded_row_len;
+  const uint32_t vq_code       = *reinterpret_cast<const uint32_t*>(vec_data);
+  const uint8_t* pq_codes      = vec_data + sizeof(uint32_t);  // PQ codes follow the VQ code
+  const MathT* vq_centroid_ptr = vq_codebook + static_cast<uint64_t>(vq_code) * dim;
+
+  for (uint32_t d = threadIdx.x; d < dim; d += blockDim.x) {  // threads share the dims
+    uint32_t j   = d / pq_len;  // PQ code slot covering dimension d (one byte per slot)
+    uint32_t k   = d % pq_len;  // position of d within that slot's codeword
+    float val    = static_cast<float>(vq_centroid_ptr[d]) +  // sum is formed in float
+                static_cast<float>(pq_codebook[static_cast<uint32_t>(pq_codes[j]) * pq_len + k]);
+    output[batch_idx * dim + d] = static_cast<T>(val);
+  }
+}
+
+template <typename T, typename MathT, typename IdxT>
+void reconstruct_vpq_queries(raft::resources const& res,
+                             const vpq_dataset<MathT, IdxT>& vpq_dset,
+                             uint64_t offset,  // first dataset row to decode
+                             uint32_t batch_size,  // rows to decode; also used as the grid size
+                             raft::device_matrix_view<T, int64_t> output)
+{  // Launches the decode kernel on the CUDA stream of res; no synchronization is done here.
+  const uint32_t dim     = vpq_dset.dim();
+  const uint32_t pq_len  = vpq_dset.pq_len();
+  const uint32_t threads = std::min(dim, 256u);  // at most 256 threads per block
+
+  kern_reconstruct_vpq_queries<T, MathT>
+    <<<batch_size, threads, 0, raft::resource::get_cuda_stream(res)>>>(
+      vpq_dset.data.data_handle(),
+      vpq_dset.encoded_row_length(),
+      vpq_dset.vq_code_book.data_handle(),
+      vpq_dset.pq_code_book.data_handle(),
+      dim,
+      pq_len,
+      offset,
+      batch_size,
+      output.data_handle());  // NOTE(review): launch status is not checked here
+}
+
 template <typename T, typename IdxT>
 void search_to_device_graph(raft::resources const& res,
                             const cuvs::neighbors::cagra::search_params& search_params,
@@ -2192,13 +2244,13 @@ auto iterative_build_graph(
   bool use_device_graph = false;
 
   // Generate the compressed index once if compression is enabled
+  const uint64_t dataset_dim = dev_dataset.extent(1);
   std::optional<index<T, IdxT>> idx_opt;
   if (params.compression.has_value()) {
     auto start = std::chrono::high_resolution_clock::now();
     RAFT_EXPECTS(params.metric == cuvs::distance::DistanceType::L2Expanded,
                  "VPQ compression is only supported with L2Expanded distance mertric");
     idx_opt.emplace(res, params.metric);
-    // idx_opt->update_graph(res, raft::make_const_mdspan(cagra_graph.view()));
     idx_opt->update_dataset(
       res,
       // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later
@@ -2207,6 +2259,12 @@ auto iterative_build_graph(
     auto end        = std::chrono::high_resolution_clock::now();
     auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
     RAFT_LOG_INFO("# VPQ compression time: %.3lf sec", (double)elapsed_ms / 1000);
+
+    // Free the original dataset -- queries will be reconstructed from VPQ codes.
+    dev_aligned_dataset.reset();
+    RAFT_LOG_INFO(
+      "# Freed original dataset from device (%.1f MiB); queries will use VPQ reconstruction",
+      to_mib(final_graph_size * dataset_dim * sizeof(T)));
   }
   bool do_skip = true;
   while (true) {
@@ -2236,12 +2294,10 @@ auto iterative_build_graph(
     search_params.max_iterations = 8;
     search_params.search_width   = 1;
 
-    // Create an index (idx), a query view (dev_query_view), and a mdarray for
-    // search results (neighbors).
-    auto dev_dataset_view = raft::make_device_matrix_view<const T, int64_t>(
-      dev_dataset.data_handle(), (int64_t)curr_graph_size, dev_dataset.extent(1));
-
+    // Create index and query views.
     if (!params.compression.has_value()) {
+      auto dev_dataset_view = raft::make_device_matrix_view<const T, int64_t>(
+        dev_dataset.data_handle(), (int64_t)curr_graph_size, dev_dataset.extent(1));
       if (use_device_graph) {
         idx_opt.emplace(
           res, params.metric, dev_dataset_view, raft::make_const_mdspan(dev_graph.view()));
@@ -2258,8 +2314,24 @@ auto iterative_build_graph(
     }
     const auto& idx = *idx_opt;
 
-    auto dev_query_view = raft::make_device_matrix_view<const T, int64_t>(
-      dev_dataset.data_handle(), (int64_t)curr_query_size, dev_dataset.extent(1));
+    // When compression is enabled, reconstruct queries from VPQ codes instead of
+    // reading from the (freed) original dataset.
+    auto dev_reconstructed_queries =
+      params.compression.has_value()
+        ? raft::make_device_matrix<T, int64_t>(res, curr_query_size, dataset_dim)
+        : raft::make_device_matrix<T, int64_t>(res, 0, 0);
+    if (params.compression.has_value()) {
+      auto* vpq_dset =
+        dynamic_cast<const vpq_dataset<half, int64_t>*>(&idx.data());
+      RAFT_EXPECTS(vpq_dset != nullptr, "Expected VPQ dataset in compressed index");
+      reconstruct_vpq_queries<T, half, int64_t>(
+        res, *vpq_dset, 0, curr_query_size, dev_reconstructed_queries.view());
+    }
+    auto dev_query_view = params.compression.has_value()
+      ? raft::make_device_matrix_view<const T, int64_t>(
+          dev_reconstructed_queries.data_handle(), (int64_t)curr_query_size, dataset_dim)
+      : raft::make_device_matrix_view<const T, int64_t>(
+          dev_dataset.data_handle(), (int64_t)curr_query_size, dev_dataset.extent(1));
 
     if (do_skip) {
       auto dev_graph_next =

From 099a8d71537134c16f1d1f7530d4e00c5fb1198a Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Thu, 26 Feb 2026 05:40:46 -0800
Subject: [PATCH 065/119] moved edge selection to gpu

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 188 +++++++++++++-----
 1 file changed, 136 insertions(+), 52 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 89f9db082f..a9b033c855 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -242,6 +242,53 @@ __global__ void kern_make_rev_graph(const IdxT* const dest_nodes,     // [graph_
   }
 }
 
+template <class IdxT>
+__global__ void kern_select_edges(const uint8_t* const d_detour_count,
+                                  const IdxT* const d_knn_graph,
+                                  IdxT* const d_output_graph,
+                                  const uint32_t graph_size,
+                                  const uint32_t knn_graph_degree,
+                                  const uint32_t output_graph_degree,
+                                  uint32_t* const d_invalid_count)  // counts rows left short
+{  // Per row: copy up to output_graph_degree neighbors, lowest detour count first.
+  const uint64_t i = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (i >= graph_size) return;  // one thread per row; surplus threads exit
+
+  const uint8_t* my_detour = d_detour_count + i * knn_graph_degree;
+  const IdxT* my_knn       = d_knn_graph + i * knn_graph_degree;
+  IdxT* my_output          = d_output_graph + i * output_graph_degree;
+
+  uint32_t pk         = 0;  // entries written to my_output so far
+  uint32_t num_detour = 0;  // detour count accepted in the current pass
+  for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) {
+    uint32_t next_num_detour = 0xFFFFFFFFu;  // smallest count above num_detour seen so far
+    for (uint32_t k = 0; k < knn_graph_degree; k++) {
+      const uint32_t d = my_detour[k];
+      if (d > num_detour) { next_num_detour = min(next_num_detour, d); }
+      if (d != num_detour) { continue; }
+
+      const IdxT candidate = my_knn[k];
+      bool dup              = false;
+      for (uint32_t dk = 0; dk < pk; dk++) {  // linear scan of the entries already kept
+        if (candidate == my_output[dk]) {
+          dup = true;
+          break;
+        }
+      }
+      if (!dup && candidate < static_cast<IdxT>(graph_size)) {  // drop repeats and ids >= size
+        my_output[pk] = candidate;
+        pk++;
+      }
+      if (pk >= output_graph_degree) break;
+    }
+    if (pk >= output_graph_degree) break;
+    if (next_num_detour == 0xFFFFFFFFu) break;  // no higher count remains; row stays short
+    num_detour = next_num_detour;
+  }
+
+  if (pk != output_graph_degree) { atomicAdd(d_invalid_count, 1); }
+}
+
 template <class IdxT, class LabelT>
 __device__ __host__ LabelT get_root_label(IdxT i, const LabelT* label)
 {
@@ -1299,8 +1346,6 @@ void optimize(
       raft::resource::sync_stream(res);
       RAFT_LOG_DEBUG("\n");
 
-      raft::copy(res, detour_count.view(), raft::make_const_mdspan(d_detour_count.view()));
-
       raft::copy(res, host_stats.view(), raft::make_const_mdspan(dev_stats.view()));
       num_keep = host_stats.data_handle()[0];
       num_full = host_stats.data_handle()[1];
@@ -1314,6 +1359,45 @@ void optimize(
         (double)num_keep / graph_size,
         output_graph_degree,
         (double)num_full / graph_size * 100);
+
+      // GPU edge selection: pick output_graph_degree edges per node with lowest detour counts.
+      {
+        raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> select_scope(
+          "cagra::graph::optimize/prune/edge-selection-by-GPU");
+        auto d_output_graph = raft::make_device_mdarray<IdxT>(
+          res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
+        auto d_invalid_count = raft::make_device_mdarray<uint32_t>(
+          res, large_tmp_mr, raft::make_extents<int64_t>(1));
+        raft::matrix::fill(res, d_invalid_count.view(), uint32_t(0));
+
+        const uint32_t select_threads = 256;
+        const uint32_t select_blocks  = (graph_size + select_threads - 1) / select_threads;
+        kern_select_edges<IdxT>
+          <<<select_blocks, select_threads, 0, raft::resource::get_cuda_stream(res)>>>(
+            d_detour_count.data_handle(),
+            d_input_graph_handle,
+            d_output_graph.data_handle(),
+            graph_size,
+            knn_graph_degree,
+            output_graph_degree,
+            d_invalid_count.data_handle());
+        raft::resource::sync_stream(res);
+
+        auto h_invalid_count = raft::make_host_vector<uint32_t, int64_t>(1);
+        raft::copy(res, h_invalid_count.view(), raft::make_const_mdspan(d_invalid_count.view()));
+        raft::resource::sync_stream(res);
+        RAFT_EXPECTS(
+          h_invalid_count.data_handle()[0] == 0,
+          "Could not generate an intermediate CAGRA graph because the initial kNN graph "
+          "contains too many invalid or duplicated neighbor nodes. (%u nodes failed)",
+          h_invalid_count.data_handle()[0]);
+
+        raft::copy(output_graph_ptr,
+                   d_output_graph.data_handle(),
+                   graph_size * output_graph_degree,
+                   raft::resource::get_cuda_stream(res));
+        raft::resource::sync_stream(res);
+      }
     } else {
       // Count 2-hop detours on CPU
       raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
@@ -1325,66 +1409,66 @@ void optimize(
       const double time_2hop_count_end = cur_time();
       RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec",
                      time_2hop_count_end - time_2hop_count_start);
-    }
 
-    // Create pruned kNN graph
-    bool invalid_neighbor_list = false;
+      // Create pruned kNN graph
+      bool invalid_neighbor_list = false;
 #pragma omp parallel for
-    for (uint64_t i = 0; i < graph_size; i++) {
-      // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable
-      // count of the neighbors while increasing the target detourable count from zero.
-      uint64_t pk         = 0;
-      uint32_t num_detour = 0;
-      for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) {
-        uint32_t next_num_detour = std::numeric_limits<uint32_t>::max();
-        for (uint64_t k = 0; k < knn_graph_degree; k++) {
-          const auto num_detour_k = detour_count(i, k);
-          // Find the detourable count to check in the next iteration
-          if (num_detour_k > num_detour) {
-            next_num_detour = std::min(static_cast<uint32_t>(num_detour_k), next_num_detour);
-          }
-
-          // Store the neighbor index if its detourable count is equal to `num_detour`.
-          if (num_detour_k != num_detour) { continue; }
+      for (uint64_t i = 0; i < graph_size; i++) {
+        // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable
+        // count of the neighbors while increasing the target detourable count from zero.
+        uint64_t pk         = 0;
+        uint32_t num_detour = 0;
+        for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) {
+          uint32_t next_num_detour = std::numeric_limits<uint32_t>::max();
+          for (uint64_t k = 0; k < knn_graph_degree; k++) {
+            const auto num_detour_k = detour_count(i, k);
+            // Find the detourable count to check in the next iteration
+            if (num_detour_k > num_detour) {
+              next_num_detour = std::min(static_cast<uint32_t>(num_detour_k), next_num_detour);
+            }
 
-          // Check duplication and append
-          const auto candidate_node = knn_graph(i, k);
-          bool dup                  = false;
-          for (uint32_t dk = 0; dk < pk; dk++) {
-            if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) {
-              dup = true;
-              break;
+            // Store the neighbor index if its detourable count is equal to `num_detour`.
+            if (num_detour_k != num_detour) { continue; }
+
+            // Check duplication and append
+            const auto candidate_node = knn_graph(i, k);
+            bool dup                  = false;
+            for (uint32_t dk = 0; dk < pk; dk++) {
+              if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) {
+                dup = true;
+                break;
+              }
             }
-          }
-          if (!dup && candidate_node < graph_size) {
-            output_graph_ptr[i * output_graph_degree + pk] = candidate_node;
-            pk += 1;
+            if (!dup && candidate_node < graph_size) {
+              output_graph_ptr[i * output_graph_degree + pk] = candidate_node;
+              pk += 1;
+            }
+            if (pk >= output_graph_degree) break;
           }
           if (pk >= output_graph_degree) break;
-        }
-        if (pk >= output_graph_degree) break;
 
-        if (next_num_detour == std::numeric_limits<uint32_t>::max()) {
-          // There are no valid edges enough in the initial kNN graph. Break the loop here and catch
-          // the error at the next validation (pk != output_graph_degree).
-          break;
+          if (next_num_detour == std::numeric_limits<uint32_t>::max()) {
+            // There are no valid edges enough in the initial kNN graph. Break the loop here and
+            // catch the error at the next validation (pk != output_graph_degree).
+            break;
+          }
+          num_detour = next_num_detour;
+        }
+        if (pk != output_graph_degree) {
+          RAFT_LOG_DEBUG(
+            "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for "
+            "node %lu in the rank-based node reranking process",
+            output_graph_degree,
+            i);
+          invalid_neighbor_list = true;
         }
-        num_detour = next_num_detour;
-      }
-      if (pk != output_graph_degree) {
-        RAFT_LOG_DEBUG(
-          "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for "
-          "node %lu in the rank-based node reranking process",
-          output_graph_degree,
-          i);
-        invalid_neighbor_list = true;
       }
+      RAFT_EXPECTS(
+        !invalid_neighbor_list,
+        "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too "
+        "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many "
+        "overflows occur during the norm computation between the dataset vectors.");
     }
-    RAFT_EXPECTS(
-      !invalid_neighbor_list,
-      "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too "
-      "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many "
-      "overflows occur during the norm computation between the dataset vectors.");
 
     const double time_prune_end = cur_time();
     RAFT_LOG_DEBUG("# Pruning time: %.1lf ms", (time_prune_end - time_prune_start) * 1000.0);

From 764aa64ce5dd411b1abe3907574f31010fb214ec Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Thu, 26 Feb 2026 07:37:09 -0800
Subject: [PATCH 066/119] Moved reverse graph construction to GPU

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 138 +++++++++++++-----
 1 file changed, 101 insertions(+), 37 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index a9b033c855..e8a73b46cb 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -242,6 +242,20 @@ __global__ void kern_make_rev_graph(const IdxT* const dest_nodes,     // [graph_
   }
 }
 
+template <class IdxT>
+__global__ void kern_extract_column(const IdxT* const d_matrix,  // row-major, n_cols per row
+                                    IdxT* const d_column,  // output, n_rows entries
+                                    const uint32_t n_rows,
+                                    const uint32_t n_cols,
+                                    const uint32_t col_idx)  // column of d_matrix to copy
+{  // Copies column col_idx of d_matrix into the contiguous array d_column.
+  const uint32_t tid  = threadIdx.x + (blockDim.x * blockIdx.x);
+  const uint32_t tnum = blockDim.x * gridDim.x;  // total number of threads in the grid
+  for (uint32_t i = tid; i < n_rows; i += tnum) {  // grid-stride loop over rows
+    d_column[i] = d_matrix[col_idx + (static_cast<uint64_t>(n_cols) * i)];
+  }
+}
+
 template <class IdxT>
 __global__ void kern_select_edges(const uint8_t* const d_detour_count,
                                   const IdxT* const d_knn_graph,
@@ -1234,6 +1248,10 @@ void optimize(
     }
   }
 
+  // Device pruned graph: populated by GPU edge selection, reused by GPU reverse graph.
+  auto d_pruned_graph = raft::make_device_mdarray<IdxT>(
+    res, large_tmp_mr, raft::make_extents<int64_t>(0, 0));
+
   {
     raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
       "cagra::graph::optimize/prune");
@@ -1361,11 +1379,11 @@ void optimize(
         (double)num_full / graph_size * 100);
 
       // GPU edge selection: pick output_graph_degree edges per node with lowest detour counts.
+      d_pruned_graph = raft::make_device_mdarray<IdxT>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
       {
         raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> select_scope(
           "cagra::graph::optimize/prune/edge-selection-by-GPU");
-        auto d_output_graph = raft::make_device_mdarray<IdxT>(
-          res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
         auto d_invalid_count = raft::make_device_mdarray<uint32_t>(
           res, large_tmp_mr, raft::make_extents<int64_t>(1));
         raft::matrix::fill(res, d_invalid_count.view(), uint32_t(0));
@@ -1376,7 +1394,7 @@ void optimize(
           <<<select_blocks, select_threads, 0, raft::resource::get_cuda_stream(res)>>>(
             d_detour_count.data_handle(),
             d_input_graph_handle,
-            d_output_graph.data_handle(),
+            d_pruned_graph.data_handle(),
             graph_size,
             knn_graph_degree,
             output_graph_degree,
@@ -1393,7 +1411,7 @@ void optimize(
           h_invalid_count.data_handle()[0]);
 
         raft::copy(output_graph_ptr,
-                   d_output_graph.data_handle(),
+                   d_pruned_graph.data_handle(),
                    graph_size * output_graph_degree,
                    raft::resource::get_cuda_stream(res));
         raft::resource::sync_stream(res);
@@ -1485,48 +1503,94 @@ void optimize(
     //
     const double time_make_start = cur_time();
 
-    device_matrix_view_from_host<IdxT, int64_t> d_rev_graph(res, rev_graph.view());
-    raft::matrix::fill(res,
-                       raft::make_device_vector_view<IdxT, int64_t>(
-                         d_rev_graph.data_handle(), graph_size * output_graph_degree),
-                       IdxT(-1));
-
-    auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
-      res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
-    raft::matrix::fill(res, d_rev_graph_count.view(), uint32_t(0));
+    if (d_pruned_graph.extent(0) > 0) {
+      // GPU path: d_pruned_graph is on device; extract columns on device to preserve
+      // column-priority ordering (earlier columns get priority in the reverse graph).
+      auto d_rev_graph = raft::make_device_mdarray<IdxT>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
+      raft::matrix::fill(res,
+                         raft::make_device_vector_view<IdxT, int64_t>(
+                           d_rev_graph.data_handle(), graph_size * output_graph_degree),
+                         IdxT(-1));
+
+      auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
+      raft::matrix::fill(res, d_rev_graph_count.view(), uint32_t(0));
 
-    auto dest_nodes = raft::make_host_vector<IdxT, int64_t>(graph_size);
-    auto d_dest_nodes =
-      raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
+      auto d_dest_nodes =
+        raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
 
-    for (uint64_t k = 0; k < output_graph_degree; k++) {
-#pragma omp parallel for
-      for (uint64_t i = 0; i < graph_size; i++) {
-        // dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)];
-        dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)];
+      for (uint64_t k = 0; k < output_graph_degree; k++) {
+        dim3 ext_threads(256, 1, 1);
+        dim3 ext_blocks(std::min(static_cast<uint32_t>((graph_size + 255) / 256), 65535u), 1, 1);
+        kern_extract_column<IdxT>
+          <<<ext_blocks, ext_threads, 0, raft::resource::get_cuda_stream(res)>>>(
+            d_pruned_graph.data_handle(),
+            d_dest_nodes.data_handle(),
+            graph_size,
+            output_graph_degree,
+            k);
+
+        dim3 threads(256, 1, 1);
+        dim3 blocks(1024, 1, 1);
+        kern_make_rev_graph<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
+          d_dest_nodes.data_handle(),
+          d_rev_graph.data_handle(),
+          d_rev_graph_count.data_handle(),
+          graph_size,
+          output_graph_degree);
       }
       raft::resource::sync_stream(res);
 
-      raft::copy(res, d_dest_nodes.view(), raft::make_const_mdspan(dest_nodes.view()));
+      d_pruned_graph = raft::make_device_mdarray<IdxT>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(0, 0));
 
-      dim3 threads(256, 1, 1);
-      dim3 blocks(1024, 1, 1);
-      kern_make_rev_graph<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
-        d_dest_nodes.data_handle(),
-        d_rev_graph.data_handle(),
-        d_rev_graph_count.data_handle(),
-        graph_size,
-        output_graph_degree);
-      RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u    \r", k, output_graph_degree);
-    }
+      raft::copy(res, rev_graph.view(), raft::make_const_mdspan(d_rev_graph.view()));
+      raft::copy(res, rev_graph_count.view(), raft::make_const_mdspan(d_rev_graph_count.view()));
+    } else {
+      // CPU fallback: per-column H-to-D copy approach.
+      device_matrix_view_from_host<IdxT, int64_t> d_rev_graph(res, rev_graph.view());
+      raft::matrix::fill(res,
+                         raft::make_device_vector_view<IdxT, int64_t>(
+                           d_rev_graph.data_handle(), graph_size * output_graph_degree),
+                         IdxT(-1));
+
+      auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
+      raft::matrix::fill(res, d_rev_graph_count.view(), uint32_t(0));
 
-    raft::resource::sync_stream(res);
-    RAFT_LOG_DEBUG("\n");
+      auto dest_nodes = raft::make_host_vector<IdxT, int64_t>(graph_size);
+      auto d_dest_nodes =
+        raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
 
-    if (d_rev_graph.allocated_memory()) {
-      raft::copy(res, rev_graph.view(), raft::make_const_mdspan(d_rev_graph.view()));
+      for (uint64_t k = 0; k < output_graph_degree; k++) {
+#pragma omp parallel for
+        for (uint64_t i = 0; i < graph_size; i++) {
+          dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)];
+        }
+        raft::resource::sync_stream(res);
+
+        raft::copy(res, d_dest_nodes.view(), raft::make_const_mdspan(dest_nodes.view()));
+
+        dim3 threads(256, 1, 1);
+        dim3 blocks(1024, 1, 1);
+        kern_make_rev_graph<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
+          d_dest_nodes.data_handle(),
+          d_rev_graph.data_handle(),
+          d_rev_graph_count.data_handle(),
+          graph_size,
+          output_graph_degree);
+        RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u    \r", k, output_graph_degree);
+      }
+
+      raft::resource::sync_stream(res);
+      RAFT_LOG_DEBUG("\n");
+
+      if (d_rev_graph.allocated_memory()) {
+        raft::copy(res, rev_graph.view(), raft::make_const_mdspan(d_rev_graph.view()));
+      }
+      raft::copy(res, rev_graph_count.view(), raft::make_const_mdspan(d_rev_graph_count.view()));
     }
-    raft::copy(res, rev_graph_count.view(), raft::make_const_mdspan(d_rev_graph_count.view()));
 
     const double time_make_end = cur_time();
     RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms",

From ecf3b1db009a78adaeea268ff51d1c2d79763eb9 Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Fri, 27 Feb 2026 15:39:01 +0000
Subject: [PATCH 067/119] extract prune into separate function

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 497 +++++++++---------
 1 file changed, 245 insertions(+), 252 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 8705f555b6..70cd29aa4a 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -1343,6 +1343,246 @@ void count_2hop_detours(raft::host_matrix_view<IdxT, int64_t, raft::row_major> k
   }
 }
 
+// Prune unimportant edges based on 2-hop detour counts: each row of `output_graph` receives the
+// `output_graph.extent(1)` neighbors of the matching `knn_graph` row with the fewest detours.
+// With `use_gpu` the GPU path is used unless its trial allocations throw; otherwise the CPU path.
+// RAFT_EXPECTS fails if the invalid-neighbor-list flag ends up set.
+//
+// The edges to retain are determined without explicitly considering distance or angle.
+// Suppose the edge is the k-th edge of some node-A to node-B (A->B). Among the edges
+// originating at node-A, there are k-1 edges shorter than the edge A->B, each of which
+// leads to a different node. Among these k-1 nodes, count the number of nodes with edges
+// to node-B, which is the number of 2-hop detours for the edge A->B. Once the number of
+// 2-hop detours has been counted for all edges, the specified number of edges is picked
+// for each node, starting with the edges with the lowest number of 2-hop detours.
+template <typename IdxT, typename InputMatrixView, typename OutputMatrixView>
+void prune_graph(raft::resources const& res,
+                 InputMatrixView knn_graph,
+                 OutputMatrixView output_graph,
+                 bool use_gpu)
+{
+  const uint64_t graph_size          = output_graph.extent(0);
+  const uint64_t knn_graph_degree    = knn_graph.extent(1);
+  const uint64_t output_graph_degree = output_graph.extent(1);
+  auto output_graph_ptr              = output_graph.data_handle();
+
+  auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);
+
+  uint32_t batch_size =
+    std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
+  const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
+  // If the available device memory is insufficient, prune on the CPU instead of the GPU.
+  bool use_gpu_prune = use_gpu;
+  if (use_gpu_prune) {
+    try {
+      auto d_detour_count = raft::make_device_mdarray<uint8_t>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, knn_graph_degree));
+      auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size));
+      auto d_output_graph = raft::make_device_mdarray<IdxT>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, output_graph_degree));
+      auto d_input_graph = raft::make_device_mdarray<IdxT>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, knn_graph_degree));
+    } catch (const std::bad_alloc&) {
+      RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU");
+      use_gpu_prune = false;
+    } catch (const raft::logic_error&) {
+      RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU (logic error)");
+      use_gpu_prune = false;
+    }
+  }
+
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+    "cagra::graph::optimize/prune");
+  const double time_prune_start = cur_time();
+
+  if (use_gpu_prune) {
+    RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");
+
+    constexpr int MAX_DEGREE = 1024;
+    if (knn_graph_degree > MAX_DEGREE) {
+      RAFT_FAIL(
+        "The degree of input knn graph is too large (%zu). "
+        "It must be equal to or smaller than %d.",
+        knn_graph_degree,
+        MAX_DEGREE);
+    }
+
+    const double prune_start = cur_time();
+
+    uint64_t num_keep __attribute__((unused)) = 0;
+    uint64_t num_full __attribute__((unused)) = 0;
+    auto dev_stats                            = raft::make_device_vector<uint64_t>(res, 2);
+    auto host_stats                           = raft::make_host_vector<uint64_t>(2);
+    RAFT_CUDA_TRY(cudaMemsetAsync(
+      dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, raft::resource::get_cuda_stream(res)));
+    // Copy knn_graph over to device if necessary
+    device_matrix_view_from_host d_input_graph(
+      res,
+      raft::make_host_matrix_view<IdxT, int64_t>(
+        knn_graph.data_handle(), graph_size, knn_graph_degree));
+    // data structures per batch
+    auto d_detour_count = raft::make_device_mdarray<uint8_t>(
+      res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, knn_graph_degree));
+    auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
+      res, large_tmp_mr, raft::make_extents<int64_t>(batch_size));
+    auto d_output_graph = raft::make_device_mdarray<IdxT>(
+      res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, output_graph_degree));
+    auto d_invalid_neighbor_list = raft::make_device_scalar<uint32_t>(res, 0u);
+
+    for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
+      RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
+                                    0xff,
+                                    batch_size * knn_graph_degree * sizeof(uint8_t),
+                                    raft::resource::get_cuda_stream(res)));
+
+      RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(),
+                                    0x00,
+                                    batch_size * sizeof(uint32_t),
+                                    raft::resource::get_cuda_stream(res)));
+      // count 2-hop detours for the current batch
+      const dim3 threads_prune(32, 1, 1);
+      const dim3 blocks_prune(batch_size, 1, 1);
+      const size_t prune_smem_size = knn_graph_degree * sizeof(IdxT);
+      kern_prune<MAX_DEGREE, IdxT>
+        <<<blocks_prune, threads_prune, prune_smem_size, raft::resource::get_cuda_stream(res)>>>(
+          d_input_graph.data_handle(),
+          graph_size,
+          knn_graph_degree,
+          output_graph_degree,
+          batch_size,
+          i_batch,
+          d_detour_count.data_handle(),
+          d_num_no_detour_edges.data_handle(),
+          dev_stats.data_handle());
+      // select smallest-detour neighbors for the current batch
+      const size_t select_smem_size = (knn_graph_degree) * (sizeof(uint16_t) + sizeof(IdxT));
+      const dim3 threads_select(32, 1, 1);
+      const dim3 blocks_select(batch_size, 1, 1);
+      kern_select_smallest_detour_neighbors<IdxT>
+        <<<blocks_select, threads_select, select_smem_size, raft::resource::get_cuda_stream(res)>>>(
+          d_input_graph.data_handle(),
+          graph_size,
+          knn_graph_degree,
+          output_graph_degree,
+          d_detour_count.data_handle(),
+          d_output_graph.data_handle(),
+          batch_size,
+          i_batch,
+          d_invalid_neighbor_list.data_handle());
+      // copy the rows of this batch into output_graph (the last batch may be partial)
+      size_t copy_size =
+        std::min(static_cast<size_t>(batch_size), graph_size - i_batch * batch_size) *
+        output_graph_degree;
+      raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree,
+                 d_output_graph.data_handle(),
+                 copy_size,
+                 raft::resource::get_cuda_stream(res));
+
+      raft::resource::sync_stream(res);
+      RAFT_LOG_DEBUG(
+        "# Pruning kNN Graph on GPUs (%.1lf %%)\r",
+        (double)std::min<IdxT>((i_batch + 1) * batch_size, graph_size) / graph_size * 100);
+    }
+    raft::resource::sync_stream(res);
+    RAFT_LOG_DEBUG("\n");
+
+    uint32_t invalid_neighbor_list = 0;
+    raft::copy(&invalid_neighbor_list,
+               d_invalid_neighbor_list.data_handle(),
+               1,
+               raft::resource::get_cuda_stream(res));
+    raft::copy(
+      host_stats.data_handle(), dev_stats.data_handle(), 2, raft::resource::get_cuda_stream(res));
+    raft::resource::sync_stream(res);  // both copies above must finish before the host reads them
+    RAFT_EXPECTS(
+      invalid_neighbor_list == 0,
+      "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too "
+      "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many "
+      "overflows occur during the norm computation between the dataset vectors.");
+
+    num_keep = host_stats.data_handle()[0];
+    num_full = host_stats.data_handle()[1];
+
+    const double prune_end = cur_time();
+    RAFT_LOG_DEBUG(
+      "# Time for pruning on GPU: %.1lf sec, "
+      "avg_no_detour_edges_per_node: %.2lf/%lu, "
+      "nodes_with_no_detour_at_all_edges: %.1lf%%",
+      prune_end - prune_start,
+      (double)num_keep / graph_size,
+      output_graph_degree,
+      (double)num_full / graph_size * 100);
+  } else {
+    auto detour_count = raft::make_host_matrix<uint8_t, int64_t>(graph_size, knn_graph_degree);
+
+    {
+      raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+        "cagra::graph::optimize/prune/2-hop-counting-by-CPU");
+      const double time_2hop_count_start = cur_time();
+
+      auto knn_graph_view = raft::make_host_matrix_view<IdxT, int64_t>(
+        knn_graph.data_handle(), knn_graph.extent(0), knn_graph.extent(1));
+      count_2hop_detours(knn_graph_view, detour_count.view());
+
+      const double time_2hop_count_end = cur_time();
+      RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec",
+                     time_2hop_count_end - time_2hop_count_start);
+    }
+    bool invalid_neighbor_list = false;
+#pragma omp parallel for reduction(|| : invalid_neighbor_list)
+    for (uint64_t i = 0; i < graph_size; i++) {
+      uint64_t pk         = 0;  // number of neighbors stored for node i so far
+      uint32_t num_detour = 0;  // detour count being collected; raised pass by pass from zero
+      for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) {
+        uint32_t next_num_detour = std::numeric_limits<uint32_t>::max();
+        for (uint64_t k = 0; k < knn_graph_degree; k++) {
+          const auto num_detour_k = detour_count(i, k);
+          if (num_detour_k > num_detour) {
+            next_num_detour = std::min(static_cast<uint32_t>(num_detour_k), next_num_detour);
+          }
+          // Store the neighbor only if its detourable count is equal to `num_detour`.
+          if (num_detour_k != num_detour) { continue; }
+          // Check duplication and append
+          const auto candidate_node = knn_graph(i, k);
+          bool dup                  = false;
+          for (uint32_t dk = 0; dk < pk; dk++) {
+            if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) {
+              dup = true;
+              break;
+            }
+          }
+          if (!dup && candidate_node < graph_size) {
+            output_graph_ptr[i * output_graph_degree + pk] = candidate_node;
+            pk += 1;
+          }
+          if (pk >= output_graph_degree) break;
+        }
+        if (pk >= output_graph_degree) break;
+        // No larger detour count remains, i.e. not enough valid edges; caught by the check below.
+        if (next_num_detour == std::numeric_limits<uint32_t>::max()) { break; }
+        num_detour = next_num_detour;
+      }
+      if (pk != output_graph_degree) {
+        RAFT_LOG_DEBUG(
+          "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for "
+          "node %lu in the rank-based node reranking process",
+          output_graph_degree,
+          i);
+        invalid_neighbor_list = true;
+      }
+    }
+    RAFT_EXPECTS(
+      !invalid_neighbor_list,
+      "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too "
+      "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many "
+      "overflows occur during the norm computation between the dataset vectors.");
+  }
+
+  const double time_prune_end = cur_time();
+  RAFT_LOG_DEBUG("# Pruning time: %.1lf ms", (time_prune_end - time_prune_start) * 1000.0);
+}
+
 // TODO allow pinned input for both knn_graph and new_graph
 template <typename IdxT = uint32_t,
           typename g_accessor =
@@ -1399,258 +1639,7 @@ void optimize(
     }
   }
 
-  uint32_t batch_size =
-    std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
-  const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
-
-  //
-  // If the available device memory is insufficient, do not use the GPU to count
-  // the number of 2-hop detours, but use the CPU.
-  //
-  // TODO: we should decide on a global strategy for this in a single place
-  // it comes down to input memory type and available memory which data should be copied to GPU
-  bool _use_gpu_prune = use_gpu;
-  if (_use_gpu_prune) {
-    try {
-      auto d_detour_count = raft::make_device_mdarray<uint8_t>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, knn_graph_degree));
-      auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size));
-      auto d_output_graph = raft::make_device_mdarray<IdxT>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, output_graph_degree));
-      // TODO we also want to consider pinned memory in case we are short on memory
-      auto d_input_graph = raft::make_device_mdarray<IdxT>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, knn_graph_degree));
-    } catch (std::bad_alloc& e) {
-      RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU");
-      _use_gpu_prune = false;
-    } catch (raft::logic_error& e) {
-      RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU (logic error)");
-      _use_gpu_prune = false;
-    }
-  }
-
-  {
-    raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-      "cagra::graph::optimize/prune");
-    const double time_prune_start = cur_time();
-
-    //
-    // Prune unimportant edges.
-    //
-    // The edge to be retained is determined without explicitly considering
-    // distance or angle. Suppose the edge is the k-th edge of some node-A to
-    // node-B (A->B). Among the edges originating at node-A, there are k-1 edges
-    // shorter than the edge A->B. Each of these k-1 edges are connected to a
-    // different k-1 nodes. Among these k-1 nodes, count the number of nodes with
-    // edges to node-B, which is the number of 2-hop detours for the edge A->B.
-    // Once the number of 2-hop detours has been counted for all edges, the
-    // specified number of edges are picked up for each node, starting with the
-    // edge with the lowest number of 2-hop detours.
-    //
-    if (_use_gpu_prune) {
-      // Pruning on GPU
-      RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");
-
-      constexpr int MAX_DEGREE = 1024;
-      if (knn_graph_degree > MAX_DEGREE) {
-        RAFT_FAIL(
-          "The degree of input knn graph is too large (%zu). "
-          "It must be equal to or smaller than %d.",
-          knn_graph_degree,
-          MAX_DEGREE);
-      }
-
-      const double prune_start = cur_time();
-
-      uint64_t num_keep __attribute__((unused)) = 0;
-      uint64_t num_full __attribute__((unused)) = 0;
-      auto dev_stats                            = raft::make_device_vector<uint64_t>(res, 2);
-      auto host_stats                           = raft::make_host_vector<uint64_t>(2);
-      RAFT_CUDA_TRY(cudaMemsetAsync(
-        dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, raft::resource::get_cuda_stream(res)));
-
-      // Copy knn_graph over to device if necessary
-      // TODO: should we use pinned memory if we have issues fitting on GPU?
-      device_matrix_view_from_host d_input_graph(
-        res,
-        raft::make_host_matrix_view<IdxT, int64_t>(
-          knn_graph.data_handle(), graph_size, knn_graph_degree));
-
-      // data structures per batch
-      auto d_detour_count = raft::make_device_mdarray<uint8_t>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, knn_graph_degree));
-      auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size));
-      auto d_output_graph = raft::make_device_mdarray<IdxT>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, output_graph_degree));
-      auto d_invalid_neighbor_list = raft::make_device_scalar<uint32_t>(res, 0u);
-
-      for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
-        // initialize the detour_count and num_no_detour_edges for the current batch
-        RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
-                                      0xff,
-                                      batch_size * knn_graph_degree * sizeof(uint8_t),
-                                      raft::resource::get_cuda_stream(res)));
-
-        RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(),
-                                      0x00,
-                                      batch_size * sizeof(uint32_t),
-                                      raft::resource::get_cuda_stream(res)));
-
-        // count 2-hop detours for the current batch
-        const dim3 threads_prune(32, 1, 1);
-        const dim3 blocks_prune(batch_size, 1, 1);
-        const size_t prune_smem_size = knn_graph_degree * sizeof(IdxT);
-        kern_prune<MAX_DEGREE, IdxT>
-          <<<blocks_prune, threads_prune, prune_smem_size, raft::resource::get_cuda_stream(res)>>>(
-            d_input_graph.data_handle(),
-            graph_size,
-            knn_graph_degree,
-            output_graph_degree,
-            batch_size,
-            i_batch,
-            d_detour_count.data_handle(),
-            d_num_no_detour_edges.data_handle(),
-            dev_stats.data_handle());
-
-        // select smallest-detour neighbors for the current batch
-        const size_t select_smem_size = (knn_graph_degree) * (sizeof(uint16_t) + sizeof(IdxT));
-        const dim3 threads_select(32, 1, 1);
-        const dim3 blocks_select(batch_size, 1, 1);
-        kern_select_smallest_detour_neighbors<IdxT>
-          <<<blocks_select,
-             threads_select,
-             select_smem_size,
-             raft::resource::get_cuda_stream(res)>>>(d_input_graph.data_handle(),
-                                                     graph_size,
-                                                     knn_graph_degree,
-                                                     output_graph_degree,
-                                                     d_detour_count.data_handle(),
-                                                     d_output_graph.data_handle(),
-                                                     batch_size,
-                                                     i_batch,
-                                                     d_invalid_neighbor_list.data_handle());
-
-        size_t copy_size =
-          std::min(static_cast<size_t>(batch_size), graph_size - i_batch * batch_size) *
-          output_graph_degree;
-        raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree,
-                   d_output_graph.data_handle(),
-                   copy_size,
-                   raft::resource::get_cuda_stream(res));
-
-        raft::resource::sync_stream(res);
-        RAFT_LOG_DEBUG(
-          "# Pruning kNN Graph on GPUs (%.1lf %%)\r",
-          (double)std::min<IdxT>((i_batch + 1) * batch_size, graph_size) / graph_size * 100);
-      }
-      raft::resource::sync_stream(res);
-      RAFT_LOG_DEBUG("\n");
-
-      uint32_t invalid_neighbor_list = 0;
-      raft::copy(&invalid_neighbor_list,
-                 d_invalid_neighbor_list.data_handle(),
-                 1,
-                 raft::resource::get_cuda_stream(res));
-      raft::resource::sync_stream(res);
-      RAFT_EXPECTS(
-        invalid_neighbor_list == 0,
-        "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too "
-        "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many "
-        "overflows occur during the norm computation between the dataset vectors.");
-
-      raft::copy(
-        host_stats.data_handle(), dev_stats.data_handle(), 2, raft::resource::get_cuda_stream(res));
-      num_keep = host_stats.data_handle()[0];
-      num_full = host_stats.data_handle()[1];
-
-      const double prune_end = cur_time();
-      RAFT_LOG_DEBUG(
-        "# Time for pruning on GPU: %.1lf sec, "
-        "avg_no_detour_edges_per_node: %.2lf/%u, "
-        "nodes_with_no_detour_at_all_edges: %.1lf%%",
-        prune_end - prune_start,
-        (double)num_keep / graph_size,
-        output_graph_degree,
-        (double)num_full / graph_size * 100);
-    } else {
-      // Pruning on CPU
-      auto detour_count = raft::make_host_matrix<uint8_t, int64_t>(graph_size, knn_graph_degree);
-
-      {
-        raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-          "cagra::graph::optimize/prune/2-hop-counting-by-CPU");
-        const double time_2hop_count_start = cur_time();
-
-        count_2hop_detours(knn_graph, detour_count.view());
-
-        const double time_2hop_count_end = cur_time();
-        RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec",
-                       time_2hop_count_end - time_2hop_count_start);
-      }
-      bool invalid_neighbor_list = false;
-#pragma omp parallel for
-      for (uint64_t i = 0; i < graph_size; i++) {
-        // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable
-        // count of the neighbors while increasing the target detourable count from zero.
-        uint64_t pk         = 0;
-        uint32_t num_detour = 0;
-        for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) {
-          uint32_t next_num_detour = std::numeric_limits<uint32_t>::max();
-          for (uint64_t k = 0; k < knn_graph_degree; k++) {
-            const auto num_detour_k = detour_count(i, k);
-            // Find the detourable count to check in the next iteration
-            if (num_detour_k > num_detour) {
-              next_num_detour = std::min(static_cast<uint32_t>(num_detour_k), next_num_detour);
-            }
-
-            // Store the neighbor index if its detourable count is equal to `num_detour`.
-            if (num_detour_k != num_detour) { continue; }
-
-            // Check duplication and append
-            const auto candidate_node = knn_graph(i, k);
-            bool dup                  = false;
-            for (uint32_t dk = 0; dk < pk; dk++) {
-              if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) {
-                dup = true;
-                break;
-              }
-            }
-            if (!dup && candidate_node < graph_size) {
-              output_graph_ptr[i * output_graph_degree + pk] = candidate_node;
-              pk += 1;
-            }
-            if (pk >= output_graph_degree) break;
-          }
-          if (pk >= output_graph_degree) break;
-
-          if (next_num_detour == std::numeric_limits<uint32_t>::max()) {
-            // There are no valid edges enough in the initial kNN graph. Break the loop here and
-            // catch the error at the next validation (pk != output_graph_degree).
-            break;
-          }
-          num_detour = next_num_detour;
-        }
-        if (pk != output_graph_degree) {
-          RAFT_LOG_DEBUG(
-            "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for "
-            "node %lu in the rank-based node reranking process",
-            output_graph_degree,
-            i);
-          invalid_neighbor_list = true;
-        }
-      }
-      RAFT_EXPECTS(
-        !invalid_neighbor_list,
-        "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too "
-        "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many "
-        "overflows occur during the norm computation between the dataset vectors.");
-    }
-
-    const double time_prune_end = cur_time();
-    RAFT_LOG_DEBUG("# Pruning time: %.1lf ms", (time_prune_end - time_prune_start) * 1000.0);
-  }
+  prune_graph<IdxT>(res, knn_graph, new_graph, use_gpu);
 
   auto rev_graph       = raft::make_host_matrix<IdxT, int64_t>(graph_size, output_graph_degree);
   auto rev_graph_count = raft::make_host_vector<uint32_t, int64_t>(graph_size);
@@ -1760,6 +1749,10 @@ void optimize(
       // Create a boolean variable on the GPU using RAFT device allocator
       auto d_check_num_protected_edges = raft::make_device_scalar<bool>(res, true);
 
+      uint32_t batch_size =
+        std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
+      const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
+
       const dim3 threads_merge(32, 1, 1);
       const dim3 blocks_merge(batch_size, 1, 1);
       const size_t merge_smem_size = (output_graph_degree + output_graph_degree) * sizeof(IdxT);

From 972d278c77c05add60e4518150e00b0f0f7898cf Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Mon, 2 Mar 2026 14:41:22 +0000
Subject: [PATCH 068/119] extract optimize components

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 1174 +++++++++--------
 1 file changed, 616 insertions(+), 558 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 70cd29aa4a..713b03ca20 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -674,6 +674,316 @@ void shift_array(T* array, uint64_t num)
     array[i] = array[i - 1];
   }
 }
+
+// Debug-logs the average number of 'replaced' entries per node: pos_in_array(...) == row length.
+template <typename IdxT>
+void log_replaced_edges_stats(const IdxT* output_graph_ptr,
+                              uint64_t graph_size,
+                              uint64_t output_graph_degree)
+{
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+    "cagra::graph::optimize/stats");
+  uint64_t replaced_total = 0;
+#pragma omp parallel for reduction(+ : replaced_total)
+  for (uint64_t node = 0; node < graph_size; node++) {
+    const IdxT* row = output_graph_ptr + (output_graph_degree * node);
+    for (uint64_t slot = 0; slot < output_graph_degree; slot++) {
+      const uint64_t where = pos_in_array<IdxT>(row[slot], row, output_graph_degree);
+      if (where == output_graph_degree) { ++replaced_total; }
+    }
+  }
+  RAFT_LOG_DEBUG("# Average number of replaced edges per node: %.2f",
+                 (double)replaced_total / graph_size);
+}
+
+// Debug-logs a histogram (with running totals) of the number of incoming edges per node.
+template <typename IdxT>
+void log_incoming_edges_histogram(const IdxT* output_graph_ptr,
+                                  uint64_t graph_size,
+                                  uint64_t output_graph_degree)
+{
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+    "cagra::graph::optimize/check_edges");
+  auto incoming     = raft::make_host_vector<uint32_t, int64_t>(graph_size);
+  auto incoming_ptr = incoming.data_handle();
+#pragma omp parallel for
+  for (uint64_t node = 0; node < graph_size; node++) { incoming_ptr[node] = 0; }
+#pragma omp parallel for
+  for (uint64_t src = 0; src < graph_size; src++) {
+    for (uint64_t slot = 0; slot < output_graph_degree; slot++) {
+      const uint64_t dst = output_graph_ptr[slot + (output_graph_degree * src)];
+      if (dst < graph_size) {
+#pragma omp atomic
+        incoming_ptr[dst] += 1;
+      }
+    }
+  }
+  // Nodes with `output_graph_degree` or more incoming edges are not part of the histogram.
+  auto buckets     = raft::make_host_vector<uint32_t, int64_t>(output_graph_degree);
+  auto buckets_ptr = buckets.data_handle();
+  for (uint64_t b = 0; b < output_graph_degree; b++) { buckets_ptr[b] = 0; }
+#pragma omp parallel for
+  for (uint64_t node = 0; node < graph_size; node++) {
+    const uint32_t in_degree = incoming_ptr[node];
+    if (in_degree < output_graph_degree) {
+#pragma omp atomic
+      buckets_ptr[in_degree] += 1;
+    }
+  }
+  RAFT_LOG_DEBUG("# Histogram for number of incoming edges\n");
+  uint32_t running_total = 0;
+  for (uint64_t b = 0; b < output_graph_degree; b++) {
+    running_total += buckets_ptr[b];
+    RAFT_LOG_DEBUG("# %3lu, %8u, %lf, (%8u, %lf)\n",
+                   b,
+                   buckets_ptr[b],
+                   (double)buckets_ptr[b] / graph_size,
+                   running_total,
+                   (double)running_total / graph_size);
+  }
+}
+
+// Checks every row for duplicated or out-of-range neighbors; RAFT_EXPECTS fails if any is found.
+template <typename IdxT>
+void check_duplicates_and_out_of_range(const IdxT* output_graph_ptr,
+                                       uint64_t graph_size,
+                                       uint64_t output_graph_degree)
+{
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+    "cagra::graph::optimize/check_duplicates");
+  uint64_t num_dup = 0;
+  uint64_t num_oor = 0;
+#pragma omp parallel for reduction(+ : num_dup) reduction(+ : num_oor)
+  for (uint64_t i = 0; i < graph_size; i++) {
+    auto my_out_graph = output_graph_ptr + (output_graph_degree * i);
+    for (uint32_t j = 0; j < output_graph_degree; j++) {
+      const auto neighbor_a = my_out_graph[j];
+      // Valid node ids are 0 .. graph_size - 1, so an id equal to graph_size is out of range too.
+      if (neighbor_a >= graph_size) {
+        num_oor++;
+        continue;
+      }
+
+      for (uint32_t k = j + 1; k < output_graph_degree; k++) {
+        if (neighbor_a == my_out_graph[k]) { num_dup++; }
+      }
+    }
+  }
+  RAFT_EXPECTS(
+    num_dup == 0, "%lu duplicated node(s) are found in the generated CAGRA graph", num_dup);
+  RAFT_EXPECTS(
+    num_oor == 0, "%lu out-of-range index node(s) are found in the generated CAGRA graph", num_oor);
+}
+
+// GPU merge step: runs kern_merge_graph batch by batch on a device view of `output_graph_ptr`,
+// copies the result back if needed, and RAFT_EXPECTS that the device check flag is still true.
+template <typename IdxT>
+void merge_graph_gpu(raft::resources const& res,
+                     IdxT* output_graph_ptr,
+                     const IdxT* d_rev_graph,
+                     uint32_t* d_rev_graph_count,
+                     const IdxT* mst_graph_ptr,
+                     const uint32_t* mst_graph_num_edges_ptr,
+                     uint64_t graph_size,
+                     uint64_t output_graph_degree,
+                     bool guarantee_connectivity)
+{
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+    "cagra::graph::optimize/combine");
+
+  const double t_begin = cur_time();
+  auto stream          = raft::resource::get_cuda_stream(res);
+
+  // Device-side view of the output graph; copied back below only if allocated_memory() is true.
+  device_matrix_view_from_host<IdxT, int64_t> d_output_graph(
+    res,
+    raft::make_host_matrix_view<IdxT, int64_t>(output_graph_ptr, graph_size, output_graph_degree));
+
+  // Device flag, initialized to true, passed to the kernel as its last argument.
+  auto d_protected_ok = raft::make_device_scalar<bool>(res, true);
+
+  const uint32_t batch_size =
+    std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
+  const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
+
+  const dim3 block_dim(32, 1, 1);
+  const dim3 grid_dim(batch_size, 1, 1);
+  const size_t smem_bytes = 2 * output_graph_degree * sizeof(IdxT);
+  // Launch kern_merge_graph once per batch with batch_size blocks of 32 threads each.
+  for (uint32_t batch = 0; batch < num_batch; ++batch) {
+    kern_merge_graph<IdxT><<<grid_dim, block_dim, smem_bytes, stream>>>(
+      d_output_graph.data_handle(),
+      d_rev_graph,
+      d_rev_graph_count,
+      static_cast<uint32_t>(graph_size),
+      static_cast<uint32_t>(output_graph_degree),
+      mst_graph_ptr,
+      static_cast<uint32_t>(output_graph_degree),
+      mst_graph_num_edges_ptr,
+      batch_size,
+      batch,
+      guarantee_connectivity,
+      d_protected_ok.data_handle());
+  }
+
+  bool protected_ok = true;
+  raft::copy(&protected_ok, d_protected_ok.data_handle(), 1, stream);
+
+  if (d_output_graph.allocated_memory()) {
+    raft::copy(
+      output_graph_ptr, d_output_graph.data_handle(), graph_size * output_graph_degree, stream);
+  }
+
+  // Wait for the copies above before protected_ok is read on the host.
+  raft::resource::sync_stream(res);
+
+  const auto t_end = cur_time();
+  RAFT_EXPECTS(protected_ok,
+               "Failed to merge the MST, pruned, and reverse edge graphs. "
+               "Some nodes have too "
+               "many MST optimization edges.");
+
+  RAFT_LOG_DEBUG("# Time for merging graphs: %.1lf ms", (t_end - t_begin) * 1000.0);
+}
+
+template <typename IdxT>
+void merge_graph_cpu(IdxT* output_graph_ptr,  // in/out: pruned rows, merged in place
+                     const IdxT* rev_graph_ptr,  // reverse-edge rows, same row width
+                     const uint32_t* rev_graph_count_ptr,  // reverse-edge count per node
+                     const IdxT* mst_graph_ptr,  // MST rows; read only if guarantee_connectivity
+                     const uint32_t* mst_graph_num_edges_ptr,  // MST edge count per node
+                     uint64_t graph_size,
+                     uint64_t output_graph_degree,
+                     bool guarantee_connectivity)
+{
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+    "cagra::graph::optimize/combine");
+
+  const double time_replace_start = cur_time();
+
+  bool check_num_protected_edges = true;  // AND-reduced over the OpenMP threads below
+#pragma omp parallel for reduction(&& : check_num_protected_edges)
+  for (uint64_t i = 0; i < graph_size; i++) {
+    auto my_rev_graph = rev_graph_ptr + (output_graph_degree * i);
+    auto my_out_graph = output_graph_ptr + (output_graph_degree * i);
+
+    std::vector<IdxT> temp_output_neighbor_list;  // staging row: MST edges first, then pruned
+    if (guarantee_connectivity) {
+      temp_output_neighbor_list.resize(output_graph_degree);
+      my_out_graph                   = temp_output_neighbor_list.data();
+      const auto mst_graph_num_edges = mst_graph_num_edges_ptr[i];
+
+      for (uint32_t j = 0; j < mst_graph_num_edges && j < output_graph_degree; j++) {
+        my_out_graph[j] = mst_graph_ptr[i * output_graph_degree + j];  // j stays inside the row
+      }
+
+      for (uint32_t pruned_j = 0, output_j = mst_graph_num_edges;
+           (pruned_j < output_graph_degree) && (output_j < output_graph_degree);
+           pruned_j++) {
+        const auto v = output_graph_ptr[output_graph_degree * i + pruned_j];
+
+        bool dup = false;
+        for (uint32_t m = 0; m < output_j; m++) {
+          if (v == my_out_graph[m]) {
+            dup = true;
+            break;
+          }
+        }
+
+        if (!dup) {
+          my_out_graph[output_j] = v;
+          output_j++;
+        }
+      }
+    }
+
+    const auto num_protected_edges =
+      std::max<uint64_t>(mst_graph_num_edges_ptr[i], output_graph_degree / 2);
+    if (num_protected_edges > output_graph_degree) { check_num_protected_edges = false; }
+    // A fully protected row takes no reverse edges but must still reach the write-back below.
+
+    auto kr = std::min<uint32_t>(rev_graph_count_ptr[i], output_graph_degree);
+    while (kr && num_protected_edges < output_graph_degree) {
+      kr -= 1;
+      if (my_rev_graph[kr] < graph_size) {  // ignore ids outside the graph
+        uint64_t pos = pos_in_array<IdxT>(my_rev_graph[kr], my_out_graph, output_graph_degree);
+        if (pos < num_protected_edges) { continue; }
+        uint64_t num_shift = pos - num_protected_edges;
+        if (pos >= output_graph_degree) {
+          num_shift = output_graph_degree - num_protected_edges - 1;
+        }
+        shift_array<IdxT>(my_out_graph + num_protected_edges, num_shift);
+        my_out_graph[num_protected_edges] = my_rev_graph[kr];
+      }
+    }
+
+    if (guarantee_connectivity) {
+      for (uint32_t j = 0; j < output_graph_degree; j++) {
+        output_graph_ptr[(output_graph_degree * i) + j] = my_out_graph[j];
+      }
+    }
+  }
+  RAFT_EXPECTS(check_num_protected_edges,
+               "Failed to merge the MST, pruned, and reverse edge graphs. Some nodes have too "
+               "many MST optimization edges.");
+
+  const double time_replace_end = cur_time();
+  RAFT_LOG_DEBUG("# Replacing edges time: %.1lf ms",
+                 (time_replace_end - time_replace_start) * 1000.0);
+}
+
+template <typename IdxT>
+void make_reverse_graph_gpu(raft::resources const& res,
+                            IdxT* d_rev_graph,  // memset to 0xff bytes, then handed to the kernel
+                            uint32_t* d_rev_graph_count,  // zeroed, then handed to the kernel
+                            raft::host_matrix_view<IdxT, int64_t, raft::row_major> new_graph)
+{
+  const uint64_t graph_size          = new_graph.extent(0);
+  const uint64_t output_graph_degree = new_graph.extent(1);
+  const IdxT* output_graph_ptr       = new_graph.data_handle();
+
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+    "cagra::graph::optimize/reverse");
+
+  auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);
+  auto dest_nodes   = raft::make_host_vector<IdxT, int64_t>(graph_size);  // host staging column
+  auto d_dest_nodes =
+    raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
+
+  RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph,
+                                0xff,
+                                graph_size * output_graph_degree * sizeof(IdxT),
+                                raft::resource::get_cuda_stream(res)));
+
+  RAFT_CUDA_TRY(cudaMemsetAsync(
+    d_rev_graph_count, 0x00, graph_size * sizeof(uint32_t), raft::resource::get_cuda_stream(res)));
+
+  for (uint64_t k = 0; k < output_graph_degree; k++) {  // one launch per neighbor column k
+#pragma omp parallel for
+    for (uint64_t i = 0; i < graph_size; i++) {
+      dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)];  // k-th neighbor of node i
+    }
+    raft::resource::sync_stream(res);  // NOTE(review): presumably drains the previous pass - confirm
+
+    raft::copy(d_dest_nodes.data_handle(),
+               dest_nodes.data_handle(),
+               graph_size,
+               raft::resource::get_cuda_stream(res));
+
+    dim3 threads(256, 1, 1);
+    dim3 blocks(1024, 1, 1);  // fixed grid size, not derived from graph_size
+    kern_make_rev_graph<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
+      d_dest_nodes.data_handle(),
+      d_rev_graph,
+      d_rev_graph_count,
+      static_cast<uint32_t>(graph_size),  // the kernel is given 32-bit sizes
+      static_cast<uint32_t>(output_graph_degree));
+    RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %lu    \r", k, output_graph_degree);
+  }
+
+  raft::resource::sync_stream(res);  // wait for the last launch before returning
+  RAFT_LOG_DEBUG("\n");
+}
 }  // namespace
 
 template <typename DataT,
@@ -1355,232 +1665,215 @@ void count_2hop_detours(raft::host_matrix_view<IdxT, int64_t, raft::row_major> k
 // specified number of edges are picked up for each node, starting with the edge with
 // the lowest number of 2-hop detours.
 //
-template <typename IdxT, typename InputMatrixView, typename OutputMatrixView>
-void prune_graph(raft::resources const& res,
-                 InputMatrixView knn_graph,
-                 OutputMatrixView output_graph,
-                 bool use_gpu)
+template <typename IdxT>
+void prune_graph_gpu(raft::resources const& res,
+                     IdxT* knn_graph_ptr,
+                     uint64_t graph_size,
+                     uint64_t knn_graph_degree,
+                     IdxT* output_graph_ptr,
+                     uint64_t output_graph_degree)
 {
-  const uint64_t graph_size          = output_graph.extent(0);
-  const uint64_t knn_graph_degree    = knn_graph.extent(1);
-  const uint64_t output_graph_degree = output_graph.extent(1);
-  auto output_graph_ptr              = output_graph.data_handle();
-
-  auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);
+  auto default_ws_mr = raft::resource::get_workspace_resource(res);
 
   uint32_t batch_size =
     std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
   const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
 
-  bool use_gpu_prune = use_gpu;
-  if (use_gpu_prune) {
-    try {
-      auto d_detour_count = raft::make_device_mdarray<uint8_t>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, knn_graph_degree));
-      auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size));
-      auto d_output_graph = raft::make_device_mdarray<IdxT>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, output_graph_degree));
-      auto d_input_graph = raft::make_device_mdarray<IdxT>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, knn_graph_degree));
-    } catch (std::bad_alloc& e) {
-      RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU");
-      use_gpu_prune = false;
-    } catch (raft::logic_error& e) {
-      RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU (logic error)");
-      use_gpu_prune = false;
-    }
-  }
-
-  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-    "cagra::graph::optimize/prune");
-  const double time_prune_start = cur_time();
+  RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");
 
-  if (use_gpu_prune) {
-    RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");
+  constexpr int MAX_DEGREE = 1024;
+  if (knn_graph_degree > MAX_DEGREE) {
+    RAFT_FAIL(
+      "The degree of input knn graph is too large (%zu). "
+      "It must be equal to or smaller than %d.",
+      knn_graph_degree,
+      MAX_DEGREE);
+  }
 
-    constexpr int MAX_DEGREE = 1024;
-    if (knn_graph_degree > MAX_DEGREE) {
-      RAFT_FAIL(
-        "The degree of input knn graph is too large (%zu). "
-        "It must be equal to or smaller than %d.",
+  const double prune_start = cur_time();
+
+  uint64_t num_keep __attribute__((unused)) = 0;
+  uint64_t num_full __attribute__((unused)) = 0;
+  auto dev_stats                            = raft::make_device_vector<uint64_t>(res, 2);
+  auto host_stats                           = raft::make_host_vector<uint64_t>(2);
+  RAFT_CUDA_TRY(cudaMemsetAsync(
+    dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, raft::resource::get_cuda_stream(res)));
+
+  device_matrix_view_from_host<IdxT, int64_t> d_input_graph(
+    res, raft::make_host_matrix_view<IdxT, int64_t>(knn_graph_ptr, graph_size, knn_graph_degree));
+
+  auto d_detour_count = raft::make_device_mdarray<uint8_t>(
+    res, default_ws_mr, raft::make_extents<int64_t>(batch_size, knn_graph_degree));
+  auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
+    res, default_ws_mr, raft::make_extents<int64_t>(batch_size));
+  auto d_output_graph = raft::make_device_mdarray<IdxT>(
+    res, default_ws_mr, raft::make_extents<int64_t>(batch_size, output_graph_degree));
+  auto d_invalid_neighbor_list = raft::make_device_scalar<uint32_t>(res, 0u);
+
+  for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
+    RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
+                                  0xff,
+                                  batch_size * knn_graph_degree * sizeof(uint8_t),
+                                  raft::resource::get_cuda_stream(res)));
+
+    RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(),
+                                  0x00,
+                                  batch_size * sizeof(uint32_t),
+                                  raft::resource::get_cuda_stream(res)));
+
+    const dim3 threads_prune(32, 1, 1);
+    const dim3 blocks_prune(batch_size, 1, 1);
+    const size_t prune_smem_size = knn_graph_degree * sizeof(IdxT);
+    kern_prune<MAX_DEGREE, IdxT>
+      <<<blocks_prune, threads_prune, prune_smem_size, raft::resource::get_cuda_stream(res)>>>(
+        d_input_graph.data_handle(),
+        graph_size,
         knn_graph_degree,
-        MAX_DEGREE);
-    }
+        output_graph_degree,
+        batch_size,
+        i_batch,
+        d_detour_count.data_handle(),
+        d_num_no_detour_edges.data_handle(),
+        dev_stats.data_handle());
+
+    const size_t select_smem_size = (knn_graph_degree) * (sizeof(uint16_t) + sizeof(IdxT));
+    const dim3 threads_select(32, 1, 1);
+    const dim3 blocks_select(batch_size, 1, 1);
+    kern_select_smallest_detour_neighbors<IdxT>
+      <<<blocks_select, threads_select, select_smem_size, raft::resource::get_cuda_stream(res)>>>(
+        d_input_graph.data_handle(),
+        graph_size,
+        knn_graph_degree,
+        output_graph_degree,
+        d_detour_count.data_handle(),
+        d_output_graph.data_handle(),
+        batch_size,
+        i_batch,
+        d_invalid_neighbor_list.data_handle());
+
+    size_t copy_size =
+      std::min(static_cast<size_t>(batch_size), graph_size - i_batch * batch_size) *
+      output_graph_degree;
+    raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree,
+               d_output_graph.data_handle(),
+               copy_size,
+               raft::resource::get_cuda_stream(res));
 
-    const double prune_start = cur_time();
+    raft::resource::sync_stream(res);
+    RAFT_LOG_DEBUG(
+      "# Pruning kNN Graph on GPUs (%.1lf %%)\r",
+      (double)std::min<IdxT>((i_batch + 1) * batch_size, graph_size) / graph_size * 100);
+  }
+  raft::resource::sync_stream(res);
+  RAFT_LOG_DEBUG("\n");
 
-    uint64_t num_keep __attribute__((unused)) = 0;
-    uint64_t num_full __attribute__((unused)) = 0;
-    auto dev_stats                            = raft::make_device_vector<uint64_t>(res, 2);
-    auto host_stats                           = raft::make_host_vector<uint64_t>(2);
-    RAFT_CUDA_TRY(cudaMemsetAsync(
-      dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, raft::resource::get_cuda_stream(res)));
+  uint32_t invalid_neighbor_list = 0;
+  raft::copy(&invalid_neighbor_list,
+             d_invalid_neighbor_list.data_handle(),
+             1,
+             raft::resource::get_cuda_stream(res));
+  raft::copy(
+    host_stats.data_handle(), dev_stats.data_handle(), 2, raft::resource::get_cuda_stream(res));
+  raft::resource::sync_stream(res);  // both host copies above must land before they are read
+  RAFT_EXPECTS(
+    invalid_neighbor_list == 0,
+    "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too "
+    "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many "
+    "overflows occur during the norm computation between the dataset vectors.");
 
-    device_matrix_view_from_host d_input_graph(
-      res,
-      raft::make_host_matrix_view<IdxT, int64_t>(
-        knn_graph.data_handle(), graph_size, knn_graph_degree));
+  num_keep = host_stats.data_handle()[0];
+  num_full = host_stats.data_handle()[1];
 
-    auto d_detour_count = raft::make_device_mdarray<uint8_t>(
-      res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, knn_graph_degree));
-    auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
-      res, large_tmp_mr, raft::make_extents<int64_t>(batch_size));
-    auto d_output_graph = raft::make_device_mdarray<IdxT>(
-      res, large_tmp_mr, raft::make_extents<int64_t>(batch_size, output_graph_degree));
-    auto d_invalid_neighbor_list = raft::make_device_scalar<uint32_t>(res, 0u);
-
-    for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
-      RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
-                                    0xff,
-                                    batch_size * knn_graph_degree * sizeof(uint8_t),
-                                    raft::resource::get_cuda_stream(res)));
-
-      RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(),
-                                    0x00,
-                                    batch_size * sizeof(uint32_t),
-                                    raft::resource::get_cuda_stream(res)));
-
-      const dim3 threads_prune(32, 1, 1);
-      const dim3 blocks_prune(batch_size, 1, 1);
-      const size_t prune_smem_size = knn_graph_degree * sizeof(IdxT);
-      kern_prune<MAX_DEGREE, IdxT>
-        <<<blocks_prune, threads_prune, prune_smem_size, raft::resource::get_cuda_stream(res)>>>(
-          d_input_graph.data_handle(),
-          graph_size,
-          knn_graph_degree,
-          output_graph_degree,
-          batch_size,
-          i_batch,
-          d_detour_count.data_handle(),
-          d_num_no_detour_edges.data_handle(),
-          dev_stats.data_handle());
-
-      const size_t select_smem_size = (knn_graph_degree) * (sizeof(uint16_t) + sizeof(IdxT));
-      const dim3 threads_select(32, 1, 1);
-      const dim3 blocks_select(batch_size, 1, 1);
-      kern_select_smallest_detour_neighbors<IdxT>
-        <<<blocks_select, threads_select, select_smem_size, raft::resource::get_cuda_stream(res)>>>(
-          d_input_graph.data_handle(),
-          graph_size,
-          knn_graph_degree,
-          output_graph_degree,
-          d_detour_count.data_handle(),
-          d_output_graph.data_handle(),
-          batch_size,
-          i_batch,
-          d_invalid_neighbor_list.data_handle());
-
-      size_t copy_size =
-        std::min(static_cast<size_t>(batch_size), graph_size - i_batch * batch_size) *
-        output_graph_degree;
-      raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree,
-                 d_output_graph.data_handle(),
-                 copy_size,
-                 raft::resource::get_cuda_stream(res));
+  const double prune_end = cur_time();
+  RAFT_LOG_DEBUG(
+    "# Time for pruning on GPU: %.1lf sec, "
+    "avg_no_detour_edges_per_node: %.2lf/%lu, "
+    "nodes_with_no_detour_at_all_edges: %.1lf%%",
+    prune_end - prune_start,
+    (double)num_keep / graph_size,
+    output_graph_degree,  // uint64_t, so it is printed with %lu
+    (double)num_full / graph_size * 100);
+}
 
-      raft::resource::sync_stream(res);
-      RAFT_LOG_DEBUG(
-        "# Pruning kNN Graph on GPUs (%.1lf %%)\r",
-        (double)std::min<IdxT>((i_batch + 1) * batch_size, graph_size) / graph_size * 100);
-    }
-    raft::resource::sync_stream(res);
-    RAFT_LOG_DEBUG("\n");
+template <typename IdxT>
+void prune_graph_cpu(IdxT* knn_graph_ptr,
+                     uint64_t graph_size,
+                     uint64_t knn_graph_degree,
+                     IdxT* output_graph_ptr,
+                     uint64_t output_graph_degree)
+{
+  auto detour_count = raft::make_host_matrix<uint8_t, int64_t>(graph_size, knn_graph_degree);
 
-    uint32_t invalid_neighbor_list = 0;
-    raft::copy(&invalid_neighbor_list,
-               d_invalid_neighbor_list.data_handle(),
-               1,
-               raft::resource::get_cuda_stream(res));
-    raft::resource::sync_stream(res);
-    RAFT_EXPECTS(
-      invalid_neighbor_list == 0,
-      "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too "
-      "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many "
-      "overflows occur during the norm computation between the dataset vectors.");
-
-    raft::copy(
-      host_stats.data_handle(), dev_stats.data_handle(), 2, raft::resource::get_cuda_stream(res));
-    num_keep = host_stats.data_handle()[0];
-    num_full = host_stats.data_handle()[1];
-
-    const double prune_end = cur_time();
-    RAFT_LOG_DEBUG(
-      "# Time for pruning on GPU: %.1lf sec, "
-      "avg_no_detour_edges_per_node: %.2lf/%u, "
-      "nodes_with_no_detour_at_all_edges: %.1lf%%",
-      prune_end - prune_start,
-      (double)num_keep / graph_size,
-      output_graph_degree,
-      (double)num_full / graph_size * 100);
-  } else {
-    auto detour_count = raft::make_host_matrix<uint8_t, int64_t>(graph_size, knn_graph_degree);
+  auto knn_graph_view =
+    raft::make_host_matrix_view<IdxT, int64_t>(knn_graph_ptr, graph_size, knn_graph_degree);
 
-    {
-      raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-        "cagra::graph::optimize/prune/2-hop-counting-by-CPU");
-      const double time_2hop_count_start = cur_time();
+  {
+    raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+      "cagra::graph::optimize/prune/2-hop-counting-by-CPU");
+    const double time_2hop_count_start = cur_time();
 
-      auto knn_graph_view = raft::make_host_matrix_view<IdxT, int64_t>(
-        knn_graph.data_handle(), knn_graph.extent(0), knn_graph.extent(1));
-      count_2hop_detours(knn_graph_view, detour_count.view());
+    count_2hop_detours(knn_graph_view, detour_count.view());
 
-      const double time_2hop_count_end = cur_time();
-      RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec",
-                     time_2hop_count_end - time_2hop_count_start);
-    }
-    bool invalid_neighbor_list = false;
+    const double time_2hop_count_end = cur_time();
+    RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec",
+                   time_2hop_count_end - time_2hop_count_start);
+  }
+  bool invalid_neighbor_list = false;  // OR-reduced over the OpenMP threads below
-#pragma omp parallel for
+#pragma omp parallel for reduction(|| : invalid_neighbor_list)
-    for (uint64_t i = 0; i < graph_size; i++) {
-      uint64_t pk         = 0;
-      uint32_t num_detour = 0;
-      for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) {
-        uint32_t next_num_detour = std::numeric_limits<uint32_t>::max();
-        for (uint64_t k = 0; k < knn_graph_degree; k++) {
-          const auto num_detour_k = detour_count(i, k);
-          if (num_detour_k > num_detour) {
-            next_num_detour = std::min(static_cast<uint32_t>(num_detour_k), next_num_detour);
-          }
+  for (uint64_t i = 0; i < graph_size; i++) {
+    uint64_t pk         = 0;
+    uint32_t num_detour = 0;
+    for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) {
+      uint32_t next_num_detour = std::numeric_limits<uint32_t>::max();
+      for (uint64_t k = 0; k < knn_graph_degree; k++) {
+        const auto num_detour_k = detour_count(i, k);
+        if (num_detour_k > num_detour) {
+          next_num_detour = std::min(static_cast<uint32_t>(num_detour_k), next_num_detour);
+        }
 
-          if (num_detour_k != num_detour) { continue; }
+        if (num_detour_k != num_detour) { continue; }
 
-          const auto candidate_node = knn_graph(i, k);
-          bool dup                  = false;
-          for (uint32_t dk = 0; dk < pk; dk++) {
-            if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) {
-              dup = true;
-              break;
-            }
-          }
-          if (!dup && candidate_node < graph_size) {
-            output_graph_ptr[i * output_graph_degree + pk] = candidate_node;
-            pk += 1;
+        const auto candidate_node = knn_graph_view(i, k);
+        bool dup                  = false;
+        for (uint32_t dk = 0; dk < pk; dk++) {
+          if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) {
+            dup = true;
+            break;
           }
-          if (pk >= output_graph_degree) break;
+        }
+        if (!dup && candidate_node < graph_size) {
+          output_graph_ptr[i * output_graph_degree + pk] = candidate_node;
+          pk += 1;
         }
         if (pk >= output_graph_degree) break;
-
-        if (next_num_detour == std::numeric_limits<uint32_t>::max()) { break; }
-        num_detour = next_num_detour;
-      }
-      if (pk != output_graph_degree) {
-        RAFT_LOG_DEBUG(
-          "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for "
-          "node %lu in the rank-based node reranking process",
-          output_graph_degree,
-          i);
-        invalid_neighbor_list = true;
       }
+      if (pk >= output_graph_degree) break;
+
+      if (next_num_detour == std::numeric_limits<uint32_t>::max()) { break; }
+      num_detour = next_num_detour;
+    }
+    if (pk != output_graph_degree) {
+      RAFT_LOG_DEBUG(
+        "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for "
+        "node %lu in the rank-based node reranking process",
+        output_graph_degree,
+        i);
+      invalid_neighbor_list = true;
     }
-    RAFT_EXPECTS(
-      !invalid_neighbor_list,
-      "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too "
-      "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many "
-      "overflows occur during the norm computation between the dataset vectors.");
   }
+  RAFT_EXPECTS(
+    !invalid_neighbor_list,
+    "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too "
+    "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many "
+    "overflows occur during the norm computation between the dataset vectors.");
+}
 
-  const double time_prune_end = cur_time();
-  RAFT_LOG_DEBUG("# Pruning time: %.1lf ms", (time_prune_end - time_prune_start) * 1000.0);
+template <typename T>
+bool is_gpu_accessible(T* ptr)  // true when CUDA reports a non-null device pointer for ptr
+{
+  cudaPointerAttributes ptr_attrs;
+  RAFT_CUDA_TRY(cudaPointerGetAttributes(&ptr_attrs, ptr));
+  return ptr_attrs.devicePointer != nullptr;
 }
 
 // TODO allow pinned input for both knn_graph and new_graph
@@ -1600,7 +1893,7 @@ void optimize(
   // large temporary memory for large arrays, e.g. everything >= O(graph_size)
   auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);
   // temporary memory for small arrays, e.g. everything <= O(batchsize * graph_degree)
-  // auto tmp_mr = raft::resource::get_tmp_workspace_resource(res);
+  auto default_ws_mr = raft::resource::get_workspace_resource(res);
 
   RAFT_EXPECTS(knn_graph.extent(0) == new_graph.extent(0),
                "Each input array is expected to have the same number of rows");
@@ -1611,409 +1904,174 @@ void optimize(
   const uint64_t output_graph_degree = new_graph.extent(1);
   const uint64_t graph_size          = new_graph.extent(0);
   // auto input_graph_ptr               = knn_graph.data_handle();
-  auto output_graph_ptr = new_graph.data_handle();
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "cagra::graph::optimize(%zu, %zu, %u)", graph_size, knn_graph_degree, output_graph_degree);
 
-  // MST optimization
-  auto mst_graph           = raft::make_pinned_matrix<IdxT, int64_t, raft::row_major>(res, 0, 0);
-  auto mst_graph_num_edges = raft::make_pinned_vector<uint32_t, int64_t>(res, graph_size);
-  auto mst_graph_num_edges_ptr = mst_graph_num_edges.data_handle();
+  // check if input and output are both device accessible
+  // in this case we assume data to be ONLY device accessible and not host accessible
+  // furthermore we ensure all large allocations to go to the large workspace resource
+  // and all small allocations to go to the default workspace resource
+  bool inout_device_accessible = false;
+  {
+    bool input_device_accessible  = is_gpu_accessible(knn_graph.data_handle());
+    bool output_device_accessible = is_gpu_accessible(new_graph.data_handle());
+    RAFT_EXPECTS(input_device_accessible == output_device_accessible,
+                 "Input and output must be either both device accessible or both host accessible");
+    inout_device_accessible = input_device_accessible && output_device_accessible;
+  }
 
+  // MST optimization
+  // currently, only using GPU path for MST optimization
+  auto p_mst_graph           = raft::make_pinned_matrix<IdxT, int64_t, raft::row_major>(res, 0, 0);
+  auto p_mst_graph_num_edges = raft::make_pinned_vector<uint32_t>(res, graph_size);
+  auto p_mst_graph_num_edges_ptr = p_mst_graph_num_edges.data_handle();
 #pragma omp parallel for
   for (uint64_t i = 0; i < graph_size; i++) {
-    mst_graph_num_edges_ptr[i] = 0;
+    p_mst_graph_num_edges_ptr[i] = 0;
   }
   if (guarantee_connectivity) {
     raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
       "cagra::graph::optimize/check_connectivity");
-    mst_graph = raft::make_pinned_matrix<IdxT, int64_t, raft::row_major>(
+    p_mst_graph = raft::make_pinned_matrix<IdxT, int64_t, raft::row_major>(
       res, graph_size, output_graph_degree);
     RAFT_LOG_INFO("MST optimization is used to guarantee graph connectivity.");
-    mst_optimization<IdxT>(res, knn_graph, mst_graph.view(), mst_graph_num_edges.view(), use_gpu);
+    mst_optimization<IdxT>(
+      res, knn_graph, p_mst_graph.view(), p_mst_graph_num_edges.view(), use_gpu);
 
     for (uint64_t i = 0; i < graph_size; i++) {
       if (i < 8 || i >= graph_size - 8) {
-        RAFT_LOG_DEBUG("# mst_graph_num_edges_ptr[%lu]: %u\n", i, mst_graph_num_edges_ptr[i]);
+        RAFT_LOG_DEBUG("# p_mst_graph_num_edges_ptr[%lu]: %u\n", i, p_mst_graph_num_edges_ptr[i]);
       }
     }
   }
 
-  prune_graph<IdxT>(res, knn_graph, new_graph, use_gpu);
-
-  auto rev_graph       = raft::make_host_matrix<IdxT, int64_t>(graph_size, output_graph_degree);
-  auto rev_graph_count = raft::make_host_vector<uint32_t, int64_t>(graph_size);
-
-  bool _use_gpu_rev_graph = use_gpu;
-  // TODO: should we use pinned memory if we have issues fitting on GPU?
-  if (_use_gpu_rev_graph) {
+  // prune graph -- will use GPU path if possible, otherwise CPU path
+  // we only need to check in case input is not already device accessible
+  bool use_gpu_prune = use_gpu;
+  if (!inout_device_accessible) {
     try {
-      auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
-      auto d_dest_nodes =
-        raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
-      auto d_rev_graph = raft::make_device_mdarray<IdxT>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
-      auto d_output_graph = raft::make_device_mdarray<IdxT>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
+      auto d_input_graph = raft::make_device_mdarray<IdxT>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, knn_graph_degree));
     } catch (std::bad_alloc& e) {
-      RAFT_LOG_DEBUG("Insufficient memory for reverse graph on GPU");
-      _use_gpu_rev_graph = false;
+      RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU");
+      use_gpu_prune = false;
     } catch (raft::logic_error& e) {
-      RAFT_LOG_DEBUG("Insufficient memory for reverse graph on GPU (logic error)");
-      _use_gpu_rev_graph = false;
+      RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU (logic error)");
+      use_gpu_prune = false;
     }
   }
-
-  const double time_make_start = cur_time();
-  if (_use_gpu_rev_graph) {
-    //
-    // Make reverse graph on GPU
-    //
-    auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
-      res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
-
-    device_matrix_view_from_host<IdxT, int64_t> d_rev_graph(res, rev_graph.view());
-    device_matrix_view_from_host<IdxT, int64_t> d_output_graph(
+  if (use_gpu_prune) {
+    // should be noop in case input is already device accessible
+    device_matrix_view_from_host<IdxT, int64_t> d_input_graph(
       res,
       raft::make_host_matrix_view<IdxT, int64_t>(
-        output_graph_ptr, graph_size, output_graph_degree));
-
-    {
-      raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-        "cagra::graph::optimize/reverse");
-      auto dest_nodes = raft::make_host_vector<IdxT, int64_t>(graph_size);
-      auto d_dest_nodes =
-        raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
-
-      RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph.data_handle(),
-                                    0xff,
-                                    graph_size * output_graph_degree * sizeof(IdxT),
-                                    raft::resource::get_cuda_stream(res)));
-
-      RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph_count.data_handle(),
-                                    0x00,
-                                    graph_size * sizeof(uint32_t),
-                                    raft::resource::get_cuda_stream(res)));
-
-      for (uint64_t k = 0; k < output_graph_degree; k++) {
-#pragma omp parallel for
-        for (uint64_t i = 0; i < graph_size; i++) {
-          // dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)];
-          dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)];
-        }
-        raft::resource::sync_stream(res);
-
-        raft::copy(d_dest_nodes.data_handle(),
-                   dest_nodes.data_handle(),
-                   graph_size,
-                   raft::resource::get_cuda_stream(res));
-
-        dim3 threads(256, 1, 1);
-        dim3 blocks(1024, 1, 1);
-        kern_make_rev_graph<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
-          d_dest_nodes.data_handle(),
-          d_rev_graph.data_handle(),
-          d_rev_graph_count.data_handle(),
-          graph_size,
-          output_graph_degree);
-        RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u    \r", k, output_graph_degree);
-      }
-
-      raft::resource::sync_stream(res);
-      RAFT_LOG_DEBUG("\n");
-
-      if (d_rev_graph.allocated_memory()) {
-        raft::copy(rev_graph.data_handle(),
-                   d_rev_graph.data_handle(),
-                   graph_size * output_graph_degree,
-                   raft::resource::get_cuda_stream(res));
-      }
-      raft::copy(rev_graph_count.data_handle(),
-                 d_rev_graph_count.data_handle(),
-                 graph_size,
-                 raft::resource::get_cuda_stream(res));
-
-      const double time_make_end = cur_time();
-      RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms",
-                     (time_make_end - time_make_start) * 1000.0);
-    }
-
-    {
-      raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-        "cagra::graph::optimize/combine");
-
-      // Merging the prunned graph and the reverse graph
-      const double merge_graph_start = cur_time();
-
-      // Create a boolean variable on the GPU using RAFT device allocator
-      auto d_check_num_protected_edges = raft::make_device_scalar<bool>(res, true);
-
-      uint32_t batch_size =
-        std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
-      const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
-
-      const dim3 threads_merge(32, 1, 1);
-      const dim3 blocks_merge(batch_size, 1, 1);
-      const size_t merge_smem_size = (output_graph_degree + output_graph_degree) * sizeof(IdxT);
-      for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
-        kern_merge_graph<IdxT>
-          <<<blocks_merge, threads_merge, merge_smem_size, raft::resource::get_cuda_stream(res)>>>(
-            d_output_graph.data_handle(),
-            d_rev_graph.data_handle(),
-            d_rev_graph_count.data_handle(),
-            graph_size,
-            output_graph_degree,
-            mst_graph.data_handle(),
-            output_graph_degree,
-            mst_graph_num_edges_ptr,
-            batch_size,
-            i_batch,
-            guarantee_connectivity,
-            d_check_num_protected_edges.data_handle());
-      }
-
-      bool check_num_protected_edges = true;
-      raft::copy(&check_num_protected_edges,
-                 d_check_num_protected_edges.data_handle(),
-                 1,
-                 raft::resource::get_cuda_stream(res));
-
-      if (d_output_graph.allocated_memory()) {
-        raft::copy(output_graph_ptr,
-                   d_output_graph.data_handle(),
-                   graph_size * output_graph_degree,
-                   raft::resource::get_cuda_stream(res));
-      }
-
-      raft::resource::sync_stream(res);
+        knn_graph.data_handle(), graph_size, knn_graph_degree));
 
-      const auto merge_graph_end = cur_time();
-      RAFT_EXPECTS(check_num_protected_edges,
-                   "Failed to merge the MST, pruned, and reverse edge graphs. "
-                   "Some nodes have too "
-                   "many MST optimization edges.");
+    prune_graph_gpu<IdxT>(res,
+                          d_input_graph.data_handle(),
+                          graph_size,
+                          knn_graph_degree,
+                          new_graph.data_handle(),
+                          output_graph_degree);
 
-      RAFT_LOG_DEBUG("# Time for merging graphs: %.1lf ms",
-                     (merge_graph_end - merge_graph_start) * 1000.0);
-    }
   } else {
-    {
-      // Make reverse graph on CPU
-      raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-        "cagra::graph::optimize/reverse");
-
-      auto rev_graph_ptr       = rev_graph.data_handle();
-      auto rev_graph_count_ptr = rev_graph_count.data_handle();
-
-#pragma omp parallel for
-      for (uint64_t i = 0; i < graph_size; i++) {
-        rev_graph_count_ptr[i] = 0;
-      }
-
-      for (uint32_t k = 0; k < output_graph_degree; k++) {
-#pragma omp parallel for
-        for (uint64_t src_id = 0; src_id < graph_size; src_id++) {
-          const IdxT dest_id =
-            output_graph_ptr[k + (static_cast<uint64_t>(output_graph_degree) * src_id)];
-          if (dest_id >= graph_size) continue;
-          uint32_t pos;
-#pragma omp atomic capture
-          pos = rev_graph_count_ptr[dest_id]++;
-          if (pos < output_graph_degree) {
-            rev_graph_ptr[(static_cast<uint64_t>(output_graph_degree) * dest_id) + pos] =
-              static_cast<IdxT>(src_id);
-          }
-        }
-      }
+    prune_graph_cpu<IdxT>(knn_graph.data_handle(),
+                          graph_size,
+                          knn_graph_degree,
+                          new_graph.data_handle(),
+                          output_graph_degree);
+  }
 
-      const double time_make_end = cur_time();
-      RAFT_LOG_DEBUG("# Making reverse graph time (CPU): %.1lf ms",
-                     (time_make_end - time_make_start) * 1000.0);
-    }
+  // reverse graph creation will always use the GPU
+  auto d_rev_graph = raft::make_device_mdarray<IdxT>(
+    res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
 
-    {
-      raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-        "cagra::graph::optimize/combine");
-      //
-      // Create search graphs from MST and pruned and reverse graphs
-      //
-      const double time_replace_start = cur_time();
+  // This should use the default workspace resource for random access / atomics
+  auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
+    res, default_ws_mr, raft::make_extents<int64_t>(graph_size));
 
-      bool check_num_protected_edges = true;
-#pragma omp parallel for
-      for (uint64_t i = 0; i < graph_size; i++) {
-        auto my_rev_graph = rev_graph.data_handle() + (output_graph_degree * i);
-        auto my_out_graph = output_graph_ptr + (output_graph_degree * i);
-
-        // If guarantee_connectivity == true, use a temporal list to merge the neighbor lists of the
-        // graphs.
-        std::vector<IdxT> temp_output_neighbor_list;
-        if (guarantee_connectivity) {
-          temp_output_neighbor_list.resize(output_graph_degree);
-          my_out_graph                   = temp_output_neighbor_list.data();
-          const auto mst_graph_num_edges = mst_graph_num_edges_ptr[i];
-
-          // Set MST graph edges
-          for (uint32_t j = 0; j < mst_graph_num_edges; j++) {
-            my_out_graph[j] = mst_graph(i, j);
-          }
-
-          // Set pruned graph edges
-          for (uint32_t pruned_j = 0, output_j = mst_graph_num_edges;
-               (pruned_j < output_graph_degree) && (output_j < output_graph_degree);
-               pruned_j++) {
-            const auto v = output_graph_ptr[output_graph_degree * i + pruned_j];
-
-            // duplication check
-            bool dup = false;
-            for (uint32_t m = 0; m < output_j; m++) {
-              if (v == my_out_graph[m]) {
-                dup = true;
-                break;
-              }
-            }
+  const double time_make_start = cur_time();
 
-            if (!dup) {
-              my_out_graph[output_j] = v;
-              output_j++;
-            }
-          }
-        }
+  make_reverse_graph_gpu<IdxT>(
+    res, d_rev_graph.data_handle(), d_rev_graph_count.data_handle(), new_graph);
 
-        const auto num_protected_edges =
-          std::max<uint64_t>(mst_graph_num_edges_ptr[i], output_graph_degree / 2);
-        if (num_protected_edges > output_graph_degree) { check_num_protected_edges = false; }
-        if (num_protected_edges == output_graph_degree) continue;
-
-        // Replace some edges of the output graph with edges of the reverse graph.
-        auto kr = std::min<uint32_t>(rev_graph_count.data_handle()[i], output_graph_degree);
-        while (kr) {
-          kr -= 1;
-          if (my_rev_graph[kr] < graph_size) {
-            uint64_t pos = pos_in_array<IdxT>(my_rev_graph[kr], my_out_graph, output_graph_degree);
-            if (pos < num_protected_edges) { continue; }
-            uint64_t num_shift = pos - num_protected_edges;
-            if (pos >= output_graph_degree) {
-              num_shift = output_graph_degree - num_protected_edges - 1;
-            }
-            shift_array<IdxT>(my_out_graph + num_protected_edges, num_shift);
-            my_out_graph[num_protected_edges] = my_rev_graph[kr];
-          }
-        }
+  const double time_make_end = cur_time();
+  RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms",
+                 (time_make_end - time_make_start) * 1000.0);
 
-        // If guarantee_connectivity == true, move the output neighbor list from the temporal list
-        // to the output list. If false, the copy is not needed because my_out_graph is a pointer to
-        // the output buffer.
-        if (guarantee_connectivity) {
-          for (uint32_t j = 0; j < output_graph_degree; j++) {
-            output_graph_ptr[(output_graph_degree * i) + j] = my_out_graph[j];
-          }
-        }
-      }
-      RAFT_EXPECTS(check_num_protected_edges,
-                   "Failed to merge the MST, pruned, and reverse edge graphs. Some nodes have too "
-                   "many MST optimization edges.");
-
-      const double time_replace_end = cur_time();
-      RAFT_LOG_DEBUG("# Replacing edges time: %.1lf ms",
-                     (time_replace_end - time_replace_start) * 1000.0);
+  // merge graph -- will use GPU path if possible, otherwise CPU path
+  // we only need to check in case output is not already device accessible
+  bool use_gpu_merge = use_gpu;
+  if (!inout_device_accessible) {
+    try {
+      auto d_new_graph = raft::make_device_mdarray<IdxT>(
+        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
+    } catch (std::bad_alloc& e) {
+      RAFT_LOG_DEBUG("Insufficient memory for merging on GPU");
+      use_gpu_merge = false;
+    } catch (raft::logic_error& e) {
+      RAFT_LOG_DEBUG("Insufficient memory for merging on GPU (logic error)");
+      use_gpu_merge = false;
     }
   }
 
-  // Check stats
-  {
-    raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-      "cagra::graph::optimize/stats");
-    /* stats */
-    uint64_t num_replaced_edges = 0;
-#pragma omp parallel for reduction(+ : num_replaced_edges)
-    for (uint64_t i = 0; i < graph_size; i++) {
-      for (uint64_t k = 0; k < output_graph_degree; k++) {
-        const uint64_t j = output_graph_ptr[k + (output_graph_degree * i)];
-        const uint64_t pos =
-          pos_in_array<IdxT>(j, output_graph_ptr + (output_graph_degree * i), output_graph_degree);
-        if (pos == output_graph_degree) { num_replaced_edges += 1; }
-      }
+  if (use_gpu_merge) {
+    // should be noop in case output is already device accessible
+    device_matrix_view_from_host<IdxT, int64_t> d_new_graph(
+      res,
+      raft::make_host_matrix_view<IdxT, int64_t>(
+        new_graph.data_handle(), graph_size, output_graph_degree));
+
+    merge_graph_gpu<IdxT>(res,
+                          d_new_graph.data_handle(),
+                          d_rev_graph.data_handle(),
+                          d_rev_graph_count.data_handle(),
+                          p_mst_graph.data_handle(),
+                          p_mst_graph_num_edges.data_handle(),
+                          graph_size,
+                          output_graph_degree,
+                          guarantee_connectivity);
+
+    if (d_new_graph.allocated_memory()) {
+      raft::copy(new_graph.data_handle(),
+                 d_new_graph.data_handle(),
+                 graph_size * output_graph_degree,
+                 raft::resource::get_cuda_stream(res));
     }
-    RAFT_LOG_DEBUG("# Average number of replaced edges per node: %.2f",
-                   (double)num_replaced_edges / graph_size);
-  }
+  } else {
+    auto rev_graph       = raft::make_host_matrix<IdxT, int64_t>(graph_size, output_graph_degree);
+    auto rev_graph_count = raft::make_host_vector<uint32_t, int64_t>(graph_size);
+    auto mst_graph       = raft::make_host_matrix<IdxT, int64_t>(0, 0);
+    raft::copy(rev_graph.data_handle(),
+               d_rev_graph.data_handle(),
+               graph_size * output_graph_degree,
+               raft::resource::get_cuda_stream(res));
+    raft::copy(rev_graph_count.data_handle(),
+               d_rev_graph_count.data_handle(),
+               graph_size,
+               raft::resource::get_cuda_stream(res));
+    raft::resource::sync_stream(res);
 
-  // Check number of incoming edges
-  {
-    raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-      "cagra::graph::optimize/check_edges");
-    auto in_edge_count     = raft::make_host_vector<uint32_t, int64_t>(graph_size);
-    auto in_edge_count_ptr = in_edge_count.data_handle();
-#pragma omp parallel for
-    for (uint64_t i = 0; i < graph_size; i++) {
-      in_edge_count_ptr[i] = 0;
-    }
-#pragma omp parallel for
-    for (uint64_t i = 0; i < graph_size; i++) {
-      for (uint64_t k = 0; k < output_graph_degree; k++) {
-        const uint64_t j = output_graph_ptr[k + (output_graph_degree * i)];
-        if (j >= graph_size) continue;
-#pragma omp atomic
-        in_edge_count_ptr[j] += 1;
-      }
-    }
-    auto hist     = raft::make_host_vector<uint32_t, int64_t>(output_graph_degree);
-    auto hist_ptr = hist.data_handle();
-    for (uint64_t k = 0; k < output_graph_degree; k++) {
-      hist_ptr[k] = 0;
-    }
-#pragma omp parallel for
-    for (uint64_t i = 0; i < graph_size; i++) {
-      uint32_t count = in_edge_count_ptr[i];
-      if (count >= output_graph_degree) continue;
-#pragma omp atomic
-      hist_ptr[count] += 1;
-    }
-    RAFT_LOG_DEBUG("# Histogram for number of incoming edges\n");
-    uint32_t sum_hist = 0;
-    for (uint64_t k = 0; k < output_graph_degree; k++) {
-      sum_hist += hist_ptr[k];
-      RAFT_LOG_DEBUG("# %3lu, %8u, %lf, (%8u, %lf)\n",
-                     k,
-                     hist_ptr[k],
-                     (double)hist_ptr[k] / graph_size,
-                     sum_hist,
-                     (double)sum_hist / graph_size);
-    }
+    merge_graph_cpu<IdxT>(new_graph.data_handle(),
+                          rev_graph.data_handle(),
+                          rev_graph_count.data_handle(),
+                          p_mst_graph.data_handle(),
+                          p_mst_graph_num_edges_ptr,
+                          graph_size,
+                          output_graph_degree,
+                          guarantee_connectivity);
   }
 
-  // Check duplication and out-of-range indices
-  {
-    raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-      "cagra::graph::optimize/check_duplicates");
-    uint64_t num_dup = 0;
-    uint64_t num_oor = 0;
-#pragma omp parallel for reduction(+ : num_dup) reduction(+ : num_oor)
-    for (uint64_t i = 0; i < graph_size; i++) {
-      auto my_out_graph = output_graph_ptr + (output_graph_degree * i);
-      for (uint32_t j = 0; j < output_graph_degree; j++) {
-        const auto neighbor_a = my_out_graph[j];
+  if (!inout_device_accessible) {
+    // following checks require host access
+    log_replaced_edges_stats<IdxT>(new_graph.data_handle(), graph_size, output_graph_degree);
 
-        // Check oor
-        if (neighbor_a > graph_size) {
-          num_oor++;
-          continue;
-        }
+    log_incoming_edges_histogram<IdxT>(new_graph.data_handle(), graph_size, output_graph_degree);
 
-        // Check duplication
-        for (uint32_t k = j + 1; k < output_graph_degree; k++) {
-          const auto neighbor_b = my_out_graph[k];
-          if (neighbor_a == neighbor_b) { num_dup++; }
-        }
-      }
-    }
-    RAFT_EXPECTS(
-      num_dup == 0, "%lu duplicated node(s) are found in the generated CAGRA graph", num_dup);
-    RAFT_EXPECTS(num_oor == 0,
-                 "%lu out-of-range index node(s) are found in the generated CAGRA graph",
-                 num_oor);
+    check_duplicates_and_out_of_range<IdxT>(
+      new_graph.data_handle(), graph_size, output_graph_degree);
+  } else {
+    RAFT_LOG_DEBUG("Output graph is on GPU, skipping checks");
   }
 }
 

From 5e9ebc53950e472c8ee0035f280905dc5b1984b5 Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Mon, 2 Mar 2026 17:34:17 +0000
Subject: [PATCH 069/119] enable both host/device inout graphs for optimize

---
 .../neighbors/detail/cagra/cagra_build.cuh    | 23 ++---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 97 +++++++++++--------
 cpp/src/neighbors/detail/cagra/utils.hpp      | 18 +++-
 3 files changed, 86 insertions(+), 52 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index a1c16250c5..009362aa96 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -822,8 +822,6 @@ inline std::pair<size_t, size_t> optimize_workspace_size(size_t n_rows,
                                                          size_t index_size,
                                                          bool mst_optimize = false)
 {
-  // TODO: MODIFY!!
-
   // MST optimization memory (host only)
   size_t mst_host = n_rows * index_size;  // mst_graph_num_edges
   if (mst_optimize) {
@@ -835,27 +833,26 @@ inline std::pair<size_t, size_t> optimize_workspace_size(size_t n_rows,
 
   // Prune stage memory
   // We neglect 8 bytes (both on host and device) for stats
-  size_t prune_host = n_rows * intermediate_degree * sizeof(uint8_t);  // detour count
+  size_t batch_size = std::min(static_cast<size_t>(256 * 1024), n_rows);
 
-  size_t prune_dev = n_rows * intermediate_degree * 1;     // detour count (uint8_t)
-  prune_dev += n_rows * sizeof(uint32_t);                  // d_num_detour_edges
-  prune_dev += n_rows * intermediate_degree * index_size;  // d_input_graph
+  size_t prune_dev = batch_size * intermediate_degree * 1;  // detour count (uint8_t)
+  prune_dev += batch_size * sizeof(uint32_t);               // d_num_detour_edges
+  prune_dev += n_rows * intermediate_degree * index_size;   // d_input_graph
 
   // Reverse graph stage memory
-  size_t rev_host = n_rows * graph_degree * index_size;  // rev_graph
-  rev_host += n_rows * sizeof(uint32_t);                 // rev_graph_count
-  rev_host += n_rows * index_size;                       // dest_nodes
-
   size_t rev_dev = n_rows * graph_degree * index_size;  // d_rev_graph
   rev_dev += n_rows * sizeof(uint32_t);                 // d_rev_graph_count
   rev_dev += n_rows * sizeof(uint32_t);                 // d_dest_nodes
 
-  // Memory for merging graphs (host only)
+  // Memory for merging graphs (host only optional)
   size_t combine_host =
     n_rows * sizeof(uint32_t) + graph_degree * sizeof(uint32_t);  // in_edge_count + hist
 
-  size_t total_host = mst_host + std::max({prune_host, rev_host, combine_host});
-  size_t total_dev  = std::max(prune_dev, rev_dev);
+  // additional memory for combine stage on device
+  size_t combine_dev = n_rows * graph_degree * index_size;  // d_output_graph
+
+  size_t total_host = mst_host + combine_host;
+  size_t total_dev  = std::max(prune_dev, rev_dev + combine_dev);
 
   return std::make_pair(total_host, total_dev);
 }
diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 713b03ca20..96110c9613 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -246,6 +246,26 @@ __global__ void kern_make_rev_graph(const IdxT* const dest_nodes,     // [graph_
   }
 }
 
+// For neighbor column `k`, records src_id as a reverse edge of dest_nodes[k + degree * src_id].
+template <typename IdxT>
+__global__ void kern_make_rev_graph_k(const IdxT* const dest_nodes,     // [graph_size, degree]
+                                      IdxT* const rev_graph,            // [size, degree]
+                                      uint32_t* const rev_graph_count,  // [graph_size]
+                                      const uint32_t graph_size,
+                                      const uint32_t degree,
+                                      uint64_t k)
+{
+  const uint64_t tid  = threadIdx.x + (blockDim.x * blockIdx.x);
+  const uint64_t tnum = blockDim.x * gridDim.x;
+  for (uint64_t src_id = tid; src_id < graph_size; src_id += tnum) {
+    IdxT dest_id = dest_nodes[k + (degree * src_id)];
+    if (dest_id >= graph_size) continue;
+    const uint64_t row_offset = static_cast<uint64_t>(degree) * dest_id;  // avoid 32-bit wrap
+    const uint32_t pos        = atomicAdd(rev_graph_count + dest_id, 1);
+    if (pos < degree) { rev_graph[row_offset + pos] = static_cast<IdxT>(src_id); }
+  }
+}
+
 // Based on the detour count, select the smallest detour count and its index
 // (Pruning Update Kernel)
 template <typename IdxT>
@@ -932,11 +952,11 @@ void merge_graph_cpu(IdxT* output_graph_ptr,
                  (time_replace_end - time_replace_start) * 1000.0);
 }
 
-template <typename IdxT>
+template <typename IdxT, typename InOutMatrixView>
 void make_reverse_graph_gpu(raft::resources const& res,
                             IdxT* d_rev_graph,
                             uint32_t* d_rev_graph_count,
-                            raft::host_matrix_view<IdxT, int64_t, raft::row_major> new_graph)
+                            InOutMatrixView new_graph)
 {
   const uint64_t graph_size          = new_graph.extent(0);
   const uint64_t output_graph_degree = new_graph.extent(1);
@@ -958,26 +978,38 @@ void make_reverse_graph_gpu(raft::resources const& res,
   RAFT_CUDA_TRY(cudaMemsetAsync(
     d_rev_graph_count, 0x00, graph_size * sizeof(uint32_t), raft::resource::get_cuda_stream(res)));
 
+  bool output_graph_device_accessible = is_ptr_device_accessible(output_graph_ptr);
+  dim3 threads(256, 1, 1);
+  dim3 blocks(1024, 1, 1);
+
   for (uint64_t k = 0; k < output_graph_degree; k++) {
+    if (output_graph_device_accessible) {
+      kern_make_rev_graph_k<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
+        output_graph_ptr,
+        d_rev_graph,
+        d_rev_graph_count,
+        static_cast<uint32_t>(graph_size),
+        static_cast<uint32_t>(output_graph_degree),
+        k);
+    } else {
 #pragma omp parallel for
-    for (uint64_t i = 0; i < graph_size; i++) {
-      dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)];
-    }
-    raft::resource::sync_stream(res);
+      for (uint64_t i = 0; i < graph_size; i++) {
+        dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)];
+      }
+      raft::resource::sync_stream(res);
 
-    raft::copy(d_dest_nodes.data_handle(),
-               dest_nodes.data_handle(),
-               graph_size,
-               raft::resource::get_cuda_stream(res));
+      raft::copy(d_dest_nodes.data_handle(),
+                 dest_nodes.data_handle(),
+                 graph_size,
+                 raft::resource::get_cuda_stream(res));
 
-    dim3 threads(256, 1, 1);
-    dim3 blocks(1024, 1, 1);
-    kern_make_rev_graph<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
-      d_dest_nodes.data_handle(),
-      d_rev_graph,
-      d_rev_graph_count,
-      static_cast<uint32_t>(graph_size),
-      static_cast<uint32_t>(output_graph_degree));
+      kern_make_rev_graph<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
+        d_dest_nodes.data_handle(),
+        d_rev_graph,
+        d_rev_graph_count,
+        static_cast<uint32_t>(graph_size),
+        static_cast<uint32_t>(output_graph_degree));
+    }
     RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %lu    \r", k, output_graph_degree);
   }
 
@@ -1868,24 +1900,13 @@ void prune_graph_cpu(IdxT* knn_graph_ptr,
     "overflows occur during the norm computation between the dataset vectors.");
 }
 
-template <typename T>
-bool is_gpu_accessible(T* ptr)
-{
-  cudaPointerAttributes attr;
-  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, ptr));
-  return attr.devicePointer != nullptr;
-}
-
 // TODO allow pinned input for both knn_graph and new_graph
-template <typename IdxT = uint32_t,
-          typename g_accessor =
-            raft::host_device_accessor<cuda::std::default_accessor<IdxT>, raft::memory_type::host>>
-void optimize(
-  raft::resources const& res,
-  raft::mdspan<IdxT, raft::matrix_extent<int64_t>, raft::row_major, g_accessor> knn_graph,
-  raft::host_matrix_view<IdxT, int64_t, raft::row_major> new_graph,
-  const bool guarantee_connectivity = true,
-  const bool use_gpu                = true)
+template <typename IdxT = uint32_t, typename InOutMatrixView>
+void optimize(raft::resources const& res,
+              InOutMatrixView knn_graph,
+              InOutMatrixView new_graph,
+              const bool guarantee_connectivity = true,
+              const bool use_gpu                = true)
 {
   RAFT_LOG_DEBUG(
     "# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));
@@ -1913,8 +1934,8 @@ void optimize(
   // and all small allocations to go to the default workspace resource
   bool inout_device_accessible = false;
   {
-    bool input_device_accessible  = is_gpu_accessible(knn_graph.data_handle());
-    bool output_device_accessible = is_gpu_accessible(new_graph.data_handle());
+    bool input_device_accessible  = is_ptr_device_accessible(knn_graph.data_handle());
+    bool output_device_accessible = is_ptr_device_accessible(new_graph.data_handle());
     RAFT_EXPECTS(input_device_accessible == output_device_accessible,
                  "Input and output must be either both device accessible or both host accessible");
     inout_device_accessible = input_device_accessible && output_device_accessible;
@@ -2062,7 +2083,7 @@ void optimize(
                           guarantee_connectivity);
   }
 
-  if (!inout_device_accessible) {
+  if (is_ptr_host_accessible(new_graph.data_handle())) {
     // following checks require host access
     log_replaced_edges_stats<IdxT>(new_graph.data_handle(), graph_size, output_graph_degree);
 
diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp
index 30c7287430..7889d6d9a9 100644
--- a/cpp/src/neighbors/detail/cagra/utils.hpp
+++ b/cpp/src/neighbors/detail/cagra/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
@@ -152,6 +152,22 @@ struct gen_index_msb_1_mask {
 };
 }  // namespace utils
 
+template <typename T>
+bool is_ptr_device_accessible(T* ptr)
+{
+  cudaPointerAttributes attr;
+  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, ptr));
+  return attr.devicePointer != nullptr;
+}
+
+template <typename T>
+bool is_ptr_host_accessible(T* ptr)
+{
+  cudaPointerAttributes attr;
+  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, ptr));
+  return attr.hostPointer != nullptr;
+}
+
 /**
  * Utility to sync memory from a host_matrix_view to a device_matrix_view
  *

From 40977e2e456f2fd9ee32413be3590acfe2e7bdd4 Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Mon, 2 Mar 2026 23:35:32 +0000
Subject: [PATCH 070/119] smaller fixes

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 61 +++++++++++--------
 1 file changed, 35 insertions(+), 26 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 6c0d6c747c..f77f6367e5 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -801,8 +801,8 @@ void check_duplicates_and_out_of_range(const IdxT* output_graph_ptr,
 template <typename IdxT>
 void merge_graph_gpu(raft::resources const& res,
                      IdxT* output_graph_ptr,
-                     const IdxT* d_rev_graph,
-                     uint32_t* d_rev_graph_count,
+                     const IdxT* d_rev_graph_ptr,
+                     uint32_t* d_rev_graph_count_ptr,
                      const IdxT* mst_graph_ptr,
                      const uint32_t* mst_graph_num_edges_ptr,
                      uint64_t graph_size,
@@ -831,8 +831,8 @@ void merge_graph_gpu(raft::resources const& res,
     kern_merge_graph<IdxT>
       <<<blocks_merge, threads_merge, merge_smem_size, raft::resource::get_cuda_stream(res)>>>(
         d_output_graph.data_handle(),
-        d_rev_graph,
-        d_rev_graph_count,
+        d_rev_graph_ptr,
+        d_rev_graph_count_ptr,
         static_cast<uint32_t>(graph_size),
         static_cast<uint32_t>(output_graph_degree),
         mst_graph_ptr,
@@ -955,8 +955,8 @@ void merge_graph_cpu(IdxT* output_graph_ptr,
 
 template <typename IdxT, typename InOutMatrixView>
 void make_reverse_graph_gpu(raft::resources const& res,
-                            IdxT* d_rev_graph,
-                            uint32_t* d_rev_graph_count,
+                            IdxT* d_rev_graph_ptr,
+                            uint32_t* d_rev_graph_count_ptr,
                             InOutMatrixView new_graph)
 {
   const uint64_t graph_size          = new_graph.extent(0);
@@ -966,18 +966,19 @@ void make_reverse_graph_gpu(raft::resources const& res,
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
     "cagra::graph::optimize/reverse");
 
-  auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);
   auto dest_nodes   = raft::make_host_vector<IdxT, int64_t>(graph_size);
-  auto d_dest_nodes =
-    raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
+  auto d_dest_nodes = raft::make_device_mdarray<IdxT>(
+    res, raft::resource::get_workspace_resource(res), raft::make_extents<int64_t>(graph_size));
 
   raft::matrix::fill(
     res,
-    raft::make_device_vector_view<IdxT, int64_t>(d_rev_graph, graph_size * output_graph_degree),
+    raft::make_device_vector_view<IdxT, int64_t>(d_rev_graph_ptr, graph_size * output_graph_degree),
     IdxT(-1));
 
   raft::matrix::fill(
-    res, raft::make_device_vector_view<IdxT, int64_t>(d_rev_graph_count, graph_size), uint32_t(0));
+    res,  // the counter buffer is uint32_t regardless of IdxT
+    raft::make_device_vector_view<uint32_t, int64_t>(d_rev_graph_count_ptr, graph_size),
+    uint32_t(0));
 
   bool output_graph_device_accessible = is_ptr_device_accessible(output_graph_ptr);
   dim3 threads(256, 1, 1);
@@ -987,8 +988,8 @@ void make_reverse_graph_gpu(raft::resources const& res,
     if (output_graph_device_accessible) {
       kern_make_rev_graph_k<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
         output_graph_ptr,
-        d_rev_graph,
-        d_rev_graph_count,
+        d_rev_graph_ptr,
+        d_rev_graph_count_ptr,
         static_cast<uint32_t>(graph_size),
         static_cast<uint32_t>(output_graph_degree),
         k);
@@ -1003,8 +1004,8 @@ void make_reverse_graph_gpu(raft::resources const& res,
 
       kern_make_rev_graph<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
         d_dest_nodes.data_handle(),
-        d_rev_graph,
-        d_rev_graph_count,
+        d_rev_graph_ptr,
+        d_rev_graph_count_ptr,
         static_cast<uint32_t>(graph_size),
         static_cast<uint32_t>(output_graph_degree));
     }
@@ -1679,6 +1680,8 @@ void prune_graph_gpu(raft::resources const& res,
                      IdxT* output_graph_ptr,
                      uint64_t output_graph_degree)
 {
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+    "cagra::graph::optimize/prune");
   auto default_ws_mr = raft::resource::get_workspace_resource(res);
 
   uint32_t batch_size =
@@ -1715,6 +1718,8 @@ void prune_graph_gpu(raft::resources const& res,
     res, default_ws_mr, raft::make_extents<int64_t>(batch_size, output_graph_degree));
   auto d_invalid_neighbor_list = raft::make_device_scalar<uint32_t>(res, 0u);
 
+  bool output_device_accessible = is_ptr_device_accessible(output_graph_ptr);
+
   for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
     raft::matrix::fill(res, d_detour_count.view(), uint8_t(0xff));
     raft::matrix::fill(res, d_num_no_detour_edges.view(), uint32_t(0));
@@ -1744,18 +1749,21 @@ void prune_graph_gpu(raft::resources const& res,
         knn_graph_degree,
         output_graph_degree,
         d_detour_count.data_handle(),
-        d_output_graph.data_handle(),
+        output_device_accessible ? d_output_graph.data_handle()
+                                 : output_graph_ptr + i_batch * batch_size * output_graph_degree,
         batch_size,
         i_batch,
         d_invalid_neighbor_list.data_handle());
 
-    size_t copy_size =
-      std::min(static_cast<size_t>(batch_size), graph_size - i_batch * batch_size) *
-      output_graph_degree;
-    raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree,
-               d_output_graph.data_handle(),
-               copy_size,
-               raft::resource::get_cuda_stream(res));
+    if (!output_device_accessible) {
+      size_t copy_size =
+        std::min(static_cast<size_t>(batch_size), graph_size - i_batch * batch_size) *
+        output_graph_degree;
+      raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree,
+                 d_output_graph.data_handle(),
+                 copy_size,
+                 raft::resource::get_cuda_stream(res));
+    }
 
     raft::resource::sync_stream(res);
     RAFT_LOG_DEBUG(
@@ -1799,14 +1807,14 @@ void prune_graph_cpu(IdxT* knn_graph_ptr,
                      IdxT* output_graph_ptr,
                      uint64_t output_graph_degree)
 {
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
+    "cagra::graph::optimize/prune");
   auto detour_count = raft::make_host_matrix<uint8_t, int64_t>(graph_size, knn_graph_degree);
 
   auto knn_graph_view =
     raft::make_host_matrix_view<IdxT, int64_t>(knn_graph_ptr, graph_size, knn_graph_degree);
 
   {
-    raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-      "cagra::graph::optimize/prune/2-hop-counting-by-CPU");
     const double time_2hop_count_start = cur_time();
 
     count_2hop_detours(knn_graph_view, detour_count.view());
@@ -2022,7 +2030,6 @@ void optimize(raft::resources const& res,
   } else {
     auto rev_graph       = raft::make_host_matrix<IdxT, int64_t>(graph_size, output_graph_degree);
     auto rev_graph_count = raft::make_host_vector<uint32_t, int64_t>(graph_size);
-    auto mst_graph       = raft::make_host_matrix<IdxT, int64_t>(0, 0);
     raft::copy(res, rev_graph.view(), d_rev_graph.view());
     raft::copy(res, rev_graph_count.view(), d_rev_graph_count.view());
 
@@ -2036,6 +2043,8 @@ void optimize(raft::resources const& res,
                           guarantee_connectivity);
   }
 
+  raft::resource::sync_stream(res);
+
   if (is_ptr_host_accessible(new_graph.data_handle())) {
     // following checks require host access
     log_replaced_edges_stats<IdxT>(new_graph.data_handle(), graph_size, output_graph_degree);

From 14e9f3ebc94aec7031d5c8eb685dc9b6fb36595d Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Tue, 3 Mar 2026 12:41:33 +0000
Subject: [PATCH 071/119] bugfix

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index f77f6367e5..5b2893e77f 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -1749,8 +1749,8 @@ void prune_graph_gpu(raft::resources const& res,
         knn_graph_degree,
         output_graph_degree,
         d_detour_count.data_handle(),
-        output_device_accessible ? d_output_graph.data_handle()
-                                 : output_graph_ptr + i_batch * batch_size * output_graph_degree,
+        output_device_accessible ? output_graph_ptr + i_batch * batch_size * output_graph_degree
+                                 : d_output_graph.data_handle(),
         batch_size,
         i_batch,
         d_invalid_neighbor_list.data_handle());

From 416558d40b1207ca7f1b8aad0aeda68b24e68aea Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Thu, 5 Mar 2026 21:43:49 +0000
Subject: [PATCH 072/119] fuse and simplify pruning, remove CPU path

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 331 +++++-------------
 1 file changed, 92 insertions(+), 239 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 5b2893e77f..25be6ae393 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -157,79 +157,6 @@ __global__ void kern_sort(const DATA_T* const dataset,  // [dataset_chunk_size,
   }
 }
 
-template <int MAX_DEGREE, class IdxT>
-__global__ void kern_prune(const IdxT* const knn_graph,  // [graph_chunk_size, graph_degree]
-                           const uint32_t graph_size,
-                           const uint32_t graph_degree,
-                           const uint32_t degree,
-                           const uint32_t batch_size,
-                           const uint32_t batch_id,
-                           uint8_t* const detour_count,          // [batch_size, graph_degree]
-                           uint32_t* const num_no_detour_edges,  // [batch_size]
-                           uint64_t* const stats)
-{
-  __shared__ uint32_t smem_num_detour[MAX_DEGREE];
-  extern __shared__ unsigned char smem_buf[];
-  IdxT* const smem_knn_iA_neighbors = reinterpret_cast<IdxT*>(smem_buf);
-
-  uint64_t* const num_retain = stats;
-  uint64_t* const num_full   = stats + 1;
-
-  const uint64_t iA       = blockIdx.x + (batch_size * batch_id);
-  const uint64_t iA_batch = blockIdx.x;
-
-  if (iA >= graph_size) { return; }
-
-  // Load this node's neighbor row into shared memory to reduce global reads
-  for (uint32_t k = threadIdx.x; k < graph_degree; k += blockDim.x) {
-    smem_num_detour[k]       = 0;
-    smem_knn_iA_neighbors[k] = knn_graph[k + ((uint64_t)graph_degree * iA)];
-    if (smem_knn_iA_neighbors[k] == iA) {
-      // Lower the priority of self-edge
-      smem_num_detour[k] = graph_degree;
-    }
-  }
-  __syncthreads();
-
-  // count number of detours (A->D->B)
-  for (uint32_t kAD = 0; kAD < graph_degree - 1; kAD++) {
-    const uint64_t iD = smem_knn_iA_neighbors[kAD];
-    if (iD >= graph_size) { continue; }
-    for (uint32_t kDB = threadIdx.x; kDB < graph_degree; kDB += blockDim.x) {
-      const uint64_t iB_candidate = knn_graph[kDB + ((uint64_t)graph_degree * iD)];
-      for (uint32_t kAB = kAD + 1; kAB < graph_degree; kAB++) {
-        // if ( kDB < kAB )
-        {
-          const uint64_t iB = smem_knn_iA_neighbors[kAB];
-          if (iB == iB_candidate) {
-            atomicAdd(smem_num_detour + kAB, 1);
-            break;
-          }
-        }
-      }
-    }
-    __syncthreads();
-  }
-
-  uint32_t num_edges_no_detour = 0;
-  for (uint32_t k = threadIdx.x; k < graph_degree; k += blockDim.x) {
-    detour_count[k + (graph_degree * iA_batch)] = min(smem_num_detour[k], (uint32_t)255);
-    if (smem_num_detour[k] == 0) { num_edges_no_detour++; }
-  }
-  num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 1);
-  num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 2);
-  num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 4);
-  num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 8);
-  num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 16);
-  num_edges_no_detour = min(num_edges_no_detour, degree);
-
-  if (threadIdx.x == 0) {
-    num_no_detour_edges[iA_batch] = num_edges_no_detour;
-    atomicAdd((unsigned long long int*)num_retain, (unsigned long long int)num_edges_no_detour);
-    if (num_edges_no_detour >= degree) { atomicAdd((unsigned long long int*)num_full, 1); }
-  }
-}
-
 template <class IdxT>
 __global__ void kern_make_rev_graph(const IdxT* const dest_nodes,     // [graph_size]
                                     IdxT* const rev_graph,            // [size, degree]
@@ -269,48 +196,98 @@ __global__ void kern_make_rev_graph_k(const IdxT* const dest_nodes,     // [grap
   }
 }
 
-// Based on the detour count, select the smallest detour count and its index
-// (Pruning Update Kernel)
-template <typename IdxT>
-__global__ void kern_select_smallest_detour_neighbors(
-  const IdxT* const knn_graph,  // [graph_chunk_size, graph_degree]
-  uint64_t graph_size,
-  uint64_t knn_graph_degree,
-  uint64_t output_graph_degree,
-  uint8_t* const d_detour_count,  // [batch_size, graph_degree]
-  IdxT* output_graph_ptr,
-  const uint32_t batch_size,  // [batch_size, output_graph_degree]
-  const uint32_t batch_id,
-  uint32_t* const d_invalid_neighbor_list)
+template <class IdxT, uint32_t num_warps>
+__global__ void kern_fused_prune(const IdxT* const knn_graph,  // [graph_chunk_size, graph_degree]
+                                 IdxT* const output_graph_ptr,
+                                 const uint32_t graph_size,
+                                 const uint32_t knn_graph_degree,
+                                 const uint32_t output_graph_degree,
+                                 const uint32_t batch_size,
+                                 const uint32_t batch_id,
+                                 uint32_t* const d_invalid_neighbor_list,
+                                 uint64_t* const stats)
 {
-  assert(blockDim.x == 32);
+  extern __shared__ unsigned char smem_buf[];
 
-  // Allocate shared memory for detour counts and their indices
-  extern __shared__ IdxT smem_indices[];
-  uint16_t* smem_detour_count = (uint16_t*)&smem_indices[knn_graph_degree];
+  const uint32_t wid     = threadIdx.x / raft::WarpSize;
+  const uint32_t lane_id = threadIdx.x % raft::WarpSize;
 
-  const uint64_t nid       = blockIdx.x + (batch_size * batch_id);
-  const uint64_t nid_batch = blockIdx.x;
+  IdxT* const smem_indices =  // smem: num_warps*degree IdxT, then num_warps*degree uint32_t
+    reinterpret_cast<IdxT*>(smem_buf + wid * knn_graph_degree * sizeof(IdxT));
+  uint32_t* const smem_num_detour = reinterpret_cast<uint32_t*>(
+    smem_buf + knn_graph_degree * (num_warps * sizeof(IdxT) + wid * sizeof(uint32_t)));
+
+  uint64_t* const num_retain = stats;
+  uint64_t* const num_full   = stats + 1;
+
+  const unsigned warp_mask = 0xffffffff;
   const uint32_t maxval16  = 0x0000ffff;
 
+  const uint64_t nid_batch = blockIdx.x * num_warps + wid;
+  const uint64_t nid       = nid_batch + (batch_size * batch_id);
+
   if (nid >= graph_size) { return; }
 
-  // Load indices and detour counts for each neighbor; invalidate out-of-bounds entries
-  for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) {
-    smem_indices[k]      = knn_graph[knn_graph_degree * nid + k];
-    smem_detour_count[k] = (smem_indices[k] >= graph_size)
-                             ? maxval16
-                             : (uint16_t)d_detour_count[nid_batch * knn_graph_degree + k];
+  // Load this node's neighbor row into shared memory to reduce global reads
+  for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) {
+    smem_num_detour[k] = 0;
+    smem_indices[k]    = knn_graph[k + ((uint64_t)knn_graph_degree * nid)];
+    if (smem_indices[k] == nid) {
+      // Lower the priority of self-edge
+      smem_num_detour[k] = knn_graph_degree;
+    }
   }
   __syncwarp();
 
-  const unsigned warp_mask = 0xffffffff;
+  // count number of detours (A->D->B)
+  for (uint32_t kAD = 0; kAD < knn_graph_degree - 1; kAD++) {
+    const uint64_t iD = smem_indices[kAD];
+    if (iD >= graph_size) { continue; }
+    for (uint32_t kDB = lane_id; kDB < knn_graph_degree; kDB += raft::WarpSize) {
+      const uint64_t iB_candidate = knn_graph[kDB + ((uint64_t)knn_graph_degree * iD)];
+      for (uint32_t kAB = kAD + 1; kAB < knn_graph_degree; kAB++) {
+        // if ( kDB < kAB )
+        {
+          const uint64_t iB = smem_indices[kAB];
+          if (iB == iB_candidate) {
+            atomicAdd(smem_num_detour + kAB, 1);
+            break;
+          }
+        }
+      }
+    }
+    __syncwarp();
+  }
+
+  uint32_t num_edges_no_detour = 0;
+  for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) {
+    smem_num_detour[k] = min(smem_num_detour[k], maxval16);
+    if (smem_num_detour[k] == 0) { num_edges_no_detour++; }
+    if (smem_indices[k] >= graph_size) { smem_num_detour[k] = maxval16; }
+  }
+
+  __syncwarp();
+
+  num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 1);
+  num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 2);
+  num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 4);
+  num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 8);
+  num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 16);
+  num_edges_no_detour = min(num_edges_no_detour, output_graph_degree);
+
+  if (lane_id == 0) {
+    atomicAdd((unsigned long long int*)num_retain, (unsigned long long int)num_edges_no_detour);
+    if (num_edges_no_detour >= output_graph_degree) {
+      atomicAdd((unsigned long long int*)num_full, 1);
+    }
+  }
+
   for (uint32_t i = 0; i < output_graph_degree; i++) {
     uint32_t local_min = maxval16;
     uint32_t local_idx = maxval16;
-    for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) {
-      if (smem_detour_count[k] < local_min) {
-        local_min = smem_detour_count[k];
+    for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) {
+      if (smem_num_detour[k] < local_min) {
+        local_min = smem_num_detour[k];
         local_idx = k;
       }
     }
@@ -321,18 +298,18 @@ __global__ void kern_select_smallest_detour_neighbors(
     uint32_t warp_local_idx     = warp_min_with_tag & 0xffff;
 
     if (warp_min_count == maxval16 || warp_local_idx == maxval16) {
-      if (threadIdx.x == 0) { atomicExch(d_invalid_neighbor_list, 1u); }
+      if (lane_id == 0) { atomicExch(d_invalid_neighbor_list, 1u); }
       break;
     }
 
     IdxT selected_node = smem_indices[warp_local_idx];
 
-    for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) {
-      if (smem_indices[k] == selected_node) { smem_detour_count[k] = maxval16; }
+    for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) {
+      if (smem_indices[k] == selected_node) { smem_num_detour[k] = maxval16; }
     }
     __syncwarp(warp_mask);
 
-    if (threadIdx.x == 0) { output_graph_ptr[nid_batch * output_graph_degree + i] = selected_node; }
+    if (lane_id == 0) { output_graph_ptr[nid_batch * output_graph_degree + i] = selected_node; }
   }
 }
 
@@ -1690,15 +1667,6 @@ void prune_graph_gpu(raft::resources const& res,
 
   RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");
 
-  constexpr int MAX_DEGREE = 1024;
-  if (knn_graph_degree > MAX_DEGREE) {
-    RAFT_FAIL(
-      "The degree of input knn graph is too large (%zu). "
-      "It must be equal to or smaller than %d.",
-      knn_graph_degree,
-      MAX_DEGREE);
-  }
-
   const double prune_start = cur_time();
 
   uint64_t num_keep __attribute__((unused)) = 0;
@@ -1710,10 +1678,6 @@ void prune_graph_gpu(raft::resources const& res,
   device_matrix_view_from_host<IdxT, int64_t> d_input_graph(
     res, raft::make_host_matrix_view<IdxT, int64_t>(knn_graph_ptr, graph_size, knn_graph_degree));
 
-  auto d_detour_count = raft::make_device_mdarray<uint8_t>(
-    res, default_ws_mr, raft::make_extents<int64_t>(batch_size, knn_graph_degree));
-  auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
-    res, default_ws_mr, raft::make_extents<int64_t>(batch_size));
   auto d_output_graph = raft::make_device_mdarray<IdxT>(
     res, default_ws_mr, raft::make_extents<int64_t>(batch_size, output_graph_degree));
   auto d_invalid_neighbor_list = raft::make_device_scalar<uint32_t>(res, 0u);
@@ -1721,40 +1685,23 @@ void prune_graph_gpu(raft::resources const& res,
   bool output_device_accessible = is_ptr_device_accessible(output_graph_ptr);
 
   for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
-    raft::matrix::fill(res, d_detour_count.view(), uint8_t(0xff));
-    raft::matrix::fill(res, d_num_no_detour_edges.view(), uint32_t(0));
-
-    const dim3 threads_prune(32, 1, 1);
+    const uint32_t num_warps = 4;
+    const dim3 threads_prune(raft::WarpSize * num_warps, 1, 1);
     const dim3 blocks_prune(batch_size, 1, 1);
-    const size_t prune_smem_size = knn_graph_degree * sizeof(IdxT);
-    kern_prune<MAX_DEGREE, IdxT>
+    const size_t prune_smem_size = num_warps * knn_graph_degree * (sizeof(IdxT) + sizeof(uint32_t));
+    kern_fused_prune<IdxT, num_warps>
       <<<blocks_prune, threads_prune, prune_smem_size, raft::resource::get_cuda_stream(res)>>>(
         d_input_graph.data_handle(),
+        output_device_accessible ? output_graph_ptr + i_batch * batch_size * output_graph_degree
+                                 : d_output_graph.data_handle(),
         graph_size,
         knn_graph_degree,
         output_graph_degree,
         batch_size,
         i_batch,
-        d_detour_count.data_handle(),
-        d_num_no_detour_edges.data_handle(),
+        d_invalid_neighbor_list.data_handle(),
         dev_stats.data_handle());
 
-    const size_t select_smem_size = (knn_graph_degree) * (sizeof(uint16_t) + sizeof(IdxT));
-    const dim3 threads_select(32, 1, 1);
-    const dim3 blocks_select(batch_size, 1, 1);
-    kern_select_smallest_detour_neighbors<IdxT>
-      <<<blocks_select, threads_select, select_smem_size, raft::resource::get_cuda_stream(res)>>>(
-        d_input_graph.data_handle(),
-        graph_size,
-        knn_graph_degree,
-        output_graph_degree,
-        d_detour_count.data_handle(),
-        output_device_accessible ? output_graph_ptr + i_batch * batch_size * output_graph_degree
-                                 : d_output_graph.data_handle(),
-        batch_size,
-        i_batch,
-        d_invalid_neighbor_list.data_handle());
-
     if (!output_device_accessible) {
       size_t copy_size =
         std::min(static_cast<size_t>(batch_size), graph_size - i_batch * batch_size) *
@@ -1800,79 +1747,6 @@ void prune_graph_gpu(raft::resources const& res,
     (double)num_full / graph_size * 100);
 }
 
-template <typename IdxT>
-void prune_graph_cpu(IdxT* knn_graph_ptr,
-                     uint64_t graph_size,
-                     uint64_t knn_graph_degree,
-                     IdxT* output_graph_ptr,
-                     uint64_t output_graph_degree)
-{
-  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-    "cagra::graph::optimize/prune");
-  auto detour_count = raft::make_host_matrix<uint8_t, int64_t>(graph_size, knn_graph_degree);
-
-  auto knn_graph_view =
-    raft::make_host_matrix_view<IdxT, int64_t>(knn_graph_ptr, graph_size, knn_graph_degree);
-
-  {
-    const double time_2hop_count_start = cur_time();
-
-    count_2hop_detours(knn_graph_view, detour_count.view());
-
-    const double time_2hop_count_end = cur_time();
-    RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec",
-                   time_2hop_count_end - time_2hop_count_start);
-  }
-  bool invalid_neighbor_list = false;
-#pragma omp parallel for
-  for (uint64_t i = 0; i < graph_size; i++) {
-    uint64_t pk         = 0;
-    uint32_t num_detour = 0;
-    for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) {
-      uint32_t next_num_detour = std::numeric_limits<uint32_t>::max();
-      for (uint64_t k = 0; k < knn_graph_degree; k++) {
-        const auto num_detour_k = detour_count(i, k);
-        if (num_detour_k > num_detour) {
-          next_num_detour = std::min(static_cast<uint32_t>(num_detour_k), next_num_detour);
-        }
-
-        if (num_detour_k != num_detour) { continue; }
-
-        const auto candidate_node = knn_graph_view(i, k);
-        bool dup                  = false;
-        for (uint32_t dk = 0; dk < pk; dk++) {
-          if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) {
-            dup = true;
-            break;
-          }
-        }
-        if (!dup && candidate_node < graph_size) {
-          output_graph_ptr[i * output_graph_degree + pk] = candidate_node;
-          pk += 1;
-        }
-        if (pk >= output_graph_degree) break;
-      }
-      if (pk >= output_graph_degree) break;
-
-      if (next_num_detour == std::numeric_limits<uint32_t>::max()) { break; }
-      num_detour = next_num_detour;
-    }
-    if (pk != output_graph_degree) {
-      RAFT_LOG_DEBUG(
-        "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for "
-        "node %lu in the rank-based node reranking process",
-        output_graph_degree,
-        i);
-      invalid_neighbor_list = true;
-    }
-  }
-  RAFT_EXPECTS(
-    !invalid_neighbor_list,
-    "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too "
-    "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many "
-    "overflows occur during the norm computation between the dataset vectors.");
-}
-
 // TODO allow pinned input for both knn_graph and new_graph
 template <typename IdxT = uint32_t, typename InOutMatrixView>
 void optimize(raft::resources const& res,
@@ -1939,22 +1813,8 @@ void optimize(raft::resources const& res,
     }
   }
 
-  // prune graph -- will use GPU path if possible, otherwise CPU path
-  // we only need to check in case input is not alreadydevice accessible
-  bool use_gpu_prune = use_gpu;
-  if (!inout_device_accessible) {
-    try {
-      auto d_input_graph = raft::make_device_mdarray<IdxT>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, knn_graph_degree));
-    } catch (std::bad_alloc& e) {
-      RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU");
-      use_gpu_prune = false;
-    } catch (raft::logic_error& e) {
-      RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU (logic error)");
-      use_gpu_prune = false;
-    }
-  }
-  if (use_gpu_prune) {
+  // prune graph -- will always use GPU path
+  {
     // should be noop in case input is already device accessible
     device_matrix_view_from_host<IdxT, int64_t> d_input_graph(
       res,
@@ -1967,13 +1827,6 @@ void optimize(raft::resources const& res,
                           knn_graph_degree,
                           new_graph.data_handle(),
                           output_graph_degree);
-
-  } else {
-    prune_graph_cpu<IdxT>(knn_graph.data_handle(),
-                          graph_size,
-                          knn_graph_degree,
-                          new_graph.data_handle(),
-                          output_graph_degree);
   }
 
   // reverse graph creation will always use the GPU

From d8d8bd877db9596720efaf67bb1373084dbf17c8 Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Thu, 5 Mar 2026 22:49:16 +0000
Subject: [PATCH 073/119] cleanup merge, remove CPU path

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 339 +++++-------------
 1 file changed, 85 insertions(+), 254 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 25be6ae393..392edc97d9 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -197,8 +197,8 @@ __global__ void kern_make_rev_graph_k(const IdxT* const dest_nodes,     // [grap
 }
 
 template <class IdxT, uint32_t num_warps>
-__global__ void kern_fused_prune(const IdxT* const knn_graph,  // [graph_chunk_size, graph_degree]
-                                 IdxT* const output_graph_ptr,
+__global__ void kern_fused_prune(const IdxT* const knn_graph,   // [graph_chunk_size, graph_degree]
+                                 IdxT* const output_graph_ptr,  // [batch_size, output_graph_degree]
                                  const uint32_t graph_size,
                                  const uint32_t knn_graph_degree,
                                  const uint32_t output_graph_degree,
@@ -337,8 +337,8 @@ __device__ void thread_shift_array(T* array, uint64_t num)
   }
 }
 
-template <typename IdxT>
-__global__ void kern_merge_graph(IdxT* output_graph,
+template <typename IdxT, uint32_t num_warps>
+__global__ void kern_merge_graph(IdxT* output_graph,  // [batch_size, output_graph_degree]
                                  const IdxT* const rev_graph,
                                  uint32_t* const rev_graph_count,  // [graph_size]
                                  const uint32_t graph_size,
@@ -352,29 +352,32 @@ __global__ void kern_merge_graph(IdxT* output_graph,
                                  bool* check_num_protected_edges)
 {
   extern __shared__ unsigned char smem_buf[];
-  IdxT* smem_sorted_output_graph = reinterpret_cast<IdxT*>(smem_buf);
 
-  assert(blockDim.x == 32);
+  const uint32_t wid     = threadIdx.x / raft::WarpSize;
+  const uint32_t lane_id = threadIdx.x % raft::WarpSize;
 
-  const uint64_t nid = blockIdx.x + (batch_size * batch_id);
-  if (nid >= graph_size) { return; }
+  IdxT* smem_sorted_output_graph =
+    reinterpret_cast<IdxT*>(smem_buf + wid * output_graph_degree * sizeof(IdxT));
+
+  const uint64_t nid_batch = blockIdx.x * num_warps + wid;
+  const uint64_t nid       = nid_batch + (batch_size * batch_id);
 
-  if (threadIdx.x == 0) check_num_protected_edges[0] = true;
+  if (nid >= graph_size) { return; }
 
-  const auto mst_graph_num_edges = mst_graph_num_edges_ptr[nid];
+  const auto mst_graph_num_edges = guarantee_connectivity ? mst_graph_num_edges_ptr[nid] : 0;
   // If guarantee_connectivity == true, use a temporal list to merge the
   // neighbor lists of the graphs.
   if (guarantee_connectivity) {
-    for (uint32_t i = threadIdx.x; i < mst_graph_degree; i += 32) {
+    for (uint32_t i = lane_id; i < mst_graph_degree; i += raft::WarpSize) {
       smem_sorted_output_graph[i] = mst_graph[nid * mst_graph_degree + i];
     }
     __syncwarp();
     for (uint32_t pruned_j = 0, output_j = mst_graph_num_edges;
          (pruned_j < output_graph_degree) && (output_j < output_graph_degree);
          pruned_j++) {
-      const auto v     = output_graph[output_graph_degree * nid + pruned_j];
+      const auto v     = output_graph[output_graph_degree * nid_batch + pruned_j];
       unsigned int dup = 0;
-      for (uint32_t m = threadIdx.x; m < output_j; m += 32) {
+      for (uint32_t m = lane_id; m < output_j; m += raft::WarpSize) {
         if (v == smem_sorted_output_graph[m]) {
           dup = 1;
           break;
@@ -383,7 +386,7 @@ __global__ void kern_merge_graph(IdxT* output_graph,
 
       unsigned int warp_dup = __ballot_sync(0xffffffff, dup);
       if (warp_dup == 0) {
-        if (threadIdx.x == 0) smem_sorted_output_graph[output_j] = v;
+        if (lane_id == 0) smem_sorted_output_graph[output_j] = v;
         output_j++;
       }
       __syncwarp();
@@ -391,8 +394,8 @@ __global__ void kern_merge_graph(IdxT* output_graph,
   }
 
   else {
-    for (uint32_t i = threadIdx.x; i < output_graph_degree; i += 32) {
-      smem_sorted_output_graph[i] = output_graph[output_graph_degree * nid + i];
+    for (uint32_t i = lane_id; i < output_graph_degree; i += raft::WarpSize) {
+      smem_sorted_output_graph[i] = output_graph[output_graph_degree * nid_batch + i];
     }
     __syncwarp();
   }
@@ -412,7 +415,7 @@ __global__ void kern_merge_graph(IdxT* output_graph,
       if (pos < num_protected_edges) { continue; }
       uint64_t num_shift = pos - num_protected_edges;
       if (pos >= output_graph_degree) { num_shift = output_graph_degree - num_protected_edges - 1; }
-      if (threadIdx.x == 0) {
+      if (lane_id == 0) {
         thread_shift_array<IdxT>(smem_sorted_output_graph + num_protected_edges, num_shift);
         smem_sorted_output_graph[num_protected_edges] = rev_graph[kr + (output_graph_degree * nid)];
       }
@@ -420,8 +423,8 @@ __global__ void kern_merge_graph(IdxT* output_graph,
     }
   }
 
-  for (uint32_t i = threadIdx.x; i < output_graph_degree; i += 32) {
-    output_graph[(output_graph_degree * nid) + i] = smem_sorted_output_graph[i];
+  for (uint32_t i = lane_id; i < output_graph_degree; i += raft::WarpSize) {
+    output_graph[(output_graph_degree * nid_batch) + i] = smem_sorted_output_graph[i];
   }
 }
 
@@ -780,8 +783,8 @@ void merge_graph_gpu(raft::resources const& res,
                      IdxT* output_graph_ptr,
                      const IdxT* d_rev_graph_ptr,
                      uint32_t* d_rev_graph_count_ptr,
-                     const IdxT* mst_graph_ptr,
-                     const uint32_t* mst_graph_num_edges_ptr,
+                     IdxT* mst_graph_ptr,
+                     uint32_t* mst_graph_num_edges_ptr,
                      uint64_t graph_size,
                      uint64_t output_graph_degree,
                      bool guarantee_connectivity)
@@ -789,36 +792,62 @@ void merge_graph_gpu(raft::resources const& res,
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
     "cagra::graph::optimize/combine");
 
+  auto default_ws_mr             = raft::resource::get_workspace_resource(res);
   const double merge_graph_start = cur_time();
 
-  device_matrix_view_from_host<IdxT, int64_t> d_output_graph(
-    res,
-    raft::make_host_matrix_view<IdxT, int64_t>(output_graph_ptr, graph_size, output_graph_degree));
-
   auto d_check_num_protected_edges = raft::make_device_scalar<bool>(res, true);
+  auto d_invalid_neighbor_list     = raft::make_device_scalar<uint32_t>(res, 0u);
 
   uint32_t batch_size =
     std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
   const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
 
-  const dim3 threads_merge(32, 1, 1);
-  const dim3 blocks_merge(batch_size, 1, 1);
-  const size_t merge_smem_size = (output_graph_degree + output_graph_degree) * sizeof(IdxT);
+  bool output_device_accessible = is_ptr_device_accessible(output_graph_ptr);
+  auto d_output_graph           = raft::make_device_mdarray<IdxT>(
+    res,
+    default_ws_mr,
+    raft::make_extents<int64_t>(output_device_accessible ? 0 : batch_size, output_graph_degree));
+
+  device_matrix_view_from_host<IdxT, int64_t> d_mst_graph(
+    res,
+    raft::make_host_matrix_view<IdxT, int64_t>(
+      mst_graph_ptr, guarantee_connectivity ? graph_size : 0, output_graph_degree));
+
+  device_matrix_view_from_host<uint32_t, int64_t> d_mst_graph_num_edges(
+    res,
+    raft::make_host_matrix_view<uint32_t, int64_t>(
+      mst_graph_num_edges_ptr, guarantee_connectivity ? graph_size : 0, 1));
+
+  const uint32_t num_warps = 4;
+  const dim3 threads_merge(raft::WarpSize * num_warps, 1, 1);
+  const dim3 blocks_merge((batch_size + num_warps - 1) / num_warps, 1, 1);  // ceil-div
+  const size_t merge_smem_size = num_warps * output_graph_degree * sizeof(IdxT);
   for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
-    kern_merge_graph<IdxT>
+    kern_merge_graph<IdxT, num_warps>
       <<<blocks_merge, threads_merge, merge_smem_size, raft::resource::get_cuda_stream(res)>>>(
-        d_output_graph.data_handle(),
+        output_device_accessible ? output_graph_ptr + (i_batch * batch_size * output_graph_degree)
+                                 : d_output_graph.data_handle(),
         d_rev_graph_ptr,
         d_rev_graph_count_ptr,
         static_cast<uint32_t>(graph_size),
         static_cast<uint32_t>(output_graph_degree),
-        mst_graph_ptr,
+        d_mst_graph.data_handle(),
         static_cast<uint32_t>(output_graph_degree),
-        mst_graph_num_edges_ptr,
+        d_mst_graph_num_edges.data_handle(),
         batch_size,
         i_batch,
         guarantee_connectivity,
         d_check_num_protected_edges.data_handle());
+
+    if (!output_device_accessible) {
+      size_t copy_size =
+        std::min(static_cast<size_t>(batch_size), graph_size - i_batch * batch_size) *
+        output_graph_degree;
+      raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree,
+                 d_output_graph.data_handle(),
+                 copy_size,
+                 raft::resource::get_cuda_stream(res));
+    }
   }
 
   bool check_num_protected_edges = true;
@@ -827,13 +856,6 @@ void merge_graph_gpu(raft::resources const& res,
              1,
              raft::resource::get_cuda_stream(res));
 
-  if (d_output_graph.allocated_memory()) {
-    raft::copy(
-      res,
-      raft::make_host_matrix_view<IdxT, int64_t>(output_graph_ptr, graph_size, output_graph_degree),
-      d_output_graph.view());
-  }
-
   const auto merge_graph_end = cur_time();
   RAFT_EXPECTS(check_num_protected_edges,
                "Failed to merge the MST, pruned, and reverse edge graphs. "
@@ -844,92 +866,6 @@ void merge_graph_gpu(raft::resources const& res,
                  (merge_graph_end - merge_graph_start) * 1000.0);
 }
 
-template <typename IdxT>
-void merge_graph_cpu(IdxT* output_graph_ptr,
-                     const IdxT* rev_graph_ptr,
-                     const uint32_t* rev_graph_count_ptr,
-                     const IdxT* mst_graph_ptr,
-                     const uint32_t* mst_graph_num_edges_ptr,
-                     uint64_t graph_size,
-                     uint64_t output_graph_degree,
-                     bool guarantee_connectivity)
-{
-  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-    "cagra::graph::optimize/combine");
-
-  const double time_replace_start = cur_time();
-
-  bool check_num_protected_edges = true;
-#pragma omp parallel for
-  for (uint64_t i = 0; i < graph_size; i++) {
-    auto my_rev_graph = rev_graph_ptr + (output_graph_degree * i);
-    auto my_out_graph = output_graph_ptr + (output_graph_degree * i);
-
-    std::vector<IdxT> temp_output_neighbor_list;
-    if (guarantee_connectivity) {
-      temp_output_neighbor_list.resize(output_graph_degree);
-      my_out_graph                   = temp_output_neighbor_list.data();
-      const auto mst_graph_num_edges = mst_graph_num_edges_ptr[i];
-
-      for (uint32_t j = 0; j < mst_graph_num_edges; j++) {
-        my_out_graph[j] = mst_graph_ptr[i * output_graph_degree + j];
-      }
-
-      for (uint32_t pruned_j = 0, output_j = mst_graph_num_edges;
-           (pruned_j < output_graph_degree) && (output_j < output_graph_degree);
-           pruned_j++) {
-        const auto v = output_graph_ptr[output_graph_degree * i + pruned_j];
-
-        bool dup = false;
-        for (uint32_t m = 0; m < output_j; m++) {
-          if (v == my_out_graph[m]) {
-            dup = true;
-            break;
-          }
-        }
-
-        if (!dup) {
-          my_out_graph[output_j] = v;
-          output_j++;
-        }
-      }
-    }
-
-    const auto num_protected_edges =
-      std::max<uint64_t>(mst_graph_num_edges_ptr[i], output_graph_degree / 2);
-    if (num_protected_edges > output_graph_degree) { check_num_protected_edges = false; }
-    if (num_protected_edges == output_graph_degree) continue;
-
-    auto kr = std::min<uint32_t>(rev_graph_count_ptr[i], output_graph_degree);
-    while (kr) {
-      kr -= 1;
-      if (my_rev_graph[kr] < graph_size) {
-        uint64_t pos = pos_in_array<IdxT>(my_rev_graph[kr], my_out_graph, output_graph_degree);
-        if (pos < num_protected_edges) { continue; }
-        uint64_t num_shift = pos - num_protected_edges;
-        if (pos >= output_graph_degree) {
-          num_shift = output_graph_degree - num_protected_edges - 1;
-        }
-        shift_array<IdxT>(my_out_graph + num_protected_edges, num_shift);
-        my_out_graph[num_protected_edges] = my_rev_graph[kr];
-      }
-    }
-
-    if (guarantee_connectivity) {
-      for (uint32_t j = 0; j < output_graph_degree; j++) {
-        output_graph_ptr[(output_graph_degree * i) + j] = my_out_graph[j];
-      }
-    }
-  }
-  RAFT_EXPECTS(check_num_protected_edges,
-               "Failed to merge the MST, pruned, and reverse edge graphs. Some nodes have too "
-               "many MST optimization edges.");
-
-  const double time_replace_end = cur_time();
-  RAFT_LOG_DEBUG("# Replacing edges time: %.1lf ms",
-                 (time_replace_end - time_replace_start) * 1000.0);
-}
-
 template <typename IdxT, typename InOutMatrixView>
 void make_reverse_graph_gpu(raft::resources const& res,
                             IdxT* d_rev_graph_ptr,
@@ -1585,58 +1521,6 @@ void mst_optimization(raft::resources const& res,
   RAFT_LOG_DEBUG("# MST optimization time: %.1lf sec", time_mst_opt_end - time_mst_opt_start);
 }
 
-template <typename IdxT = uint32_t>
-void count_2hop_detours(raft::host_matrix_view<IdxT, int64_t, raft::row_major> knn_graph,
-                        raft::host_matrix_view<uint8_t, int64_t, raft::row_major> detour_count)
-{
-  RAFT_EXPECTS(knn_graph.extent(0) == detour_count.extent(0),
-               "knn_graph and detour_count are expected to have the same number of rows");
-  RAFT_EXPECTS(knn_graph.extent(1) == detour_count.extent(1),
-               "knn_graph and detour_count are expected to have the same number of cols");
-  const uint64_t graph_size   = knn_graph.extent(0);
-  const uint64_t graph_degree = knn_graph.extent(1);
-
-#pragma omp parallel for
-  for (IdxT iA = 0; iA < graph_size; iA++) {
-    // Create a list of nodes, iB_candidates, that can be reached in 2-hops from node A.
-    auto iB_candidates =
-      raft::make_host_vector<IdxT, int64_t>((graph_degree - 1) * (graph_degree - 1));
-    for (uint64_t kAC = 0; kAC < graph_degree - 1; kAC++) {
-      IdxT iC = knn_graph(iA, kAC);
-      for (uint64_t kCB = 0; kCB < graph_degree - 1; kCB++) {
-        IdxT iB_candidate;
-        if (iC == iA || iC >= graph_size) {
-          iB_candidate = graph_size;
-        } else {
-          iB_candidate = knn_graph(iC, kCB);
-          if (iB_candidate == iA || iB_candidate == iC) { iB_candidate = graph_size; }
-        }
-        uint64_t idx;
-        if (kAC < kCB) {
-          idx = (kCB * kCB) + kAC;
-        } else {
-          idx = (kAC * (kAC + 1)) + kCB;
-        }
-        iB_candidates(idx) = iB_candidate;
-      }
-    }
-    // Count how many 2-hop detours are on each edge of node A.
-    for (uint64_t kAB = 0; kAB < graph_degree; kAB++) {
-      constexpr uint32_t max_count = 255;
-      uint32_t count               = 0;
-      IdxT iB                      = knn_graph(iA, kAB);
-      if (iB == iA) {
-        count = max_count;
-      } else {
-        for (uint64_t idx = 0; idx < kAB * kAB; idx++) {
-          if (iB_candidates(idx) == iB) { count += 1; }
-        }
-      }
-      detour_count(iA, kAB) = std::min(count, max_count);
-    }
-  }
-}
-
 //
 // Prune unimportant edges based on 2-hop detour counts.
 //
@@ -1678,16 +1562,18 @@ void prune_graph_gpu(raft::resources const& res,
   device_matrix_view_from_host<IdxT, int64_t> d_input_graph(
     res, raft::make_host_matrix_view<IdxT, int64_t>(knn_graph_ptr, graph_size, knn_graph_degree));
 
-  auto d_output_graph = raft::make_device_mdarray<IdxT>(
-    res, default_ws_mr, raft::make_extents<int64_t>(batch_size, output_graph_degree));
   auto d_invalid_neighbor_list = raft::make_device_scalar<uint32_t>(res, 0u);
 
   bool output_device_accessible = is_ptr_device_accessible(output_graph_ptr);
+  auto d_output_graph           = raft::make_device_mdarray<IdxT>(
+    res,
+    default_ws_mr,
+    raft::make_extents<int64_t>(output_device_accessible ? 0 : batch_size, output_graph_degree));
 
   for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
     const uint32_t num_warps = 4;
     const dim3 threads_prune(raft::WarpSize * num_warps, 1, 1);
-    const dim3 blocks_prune(batch_size, 1, 1);
+    const dim3 blocks_prune(batch_size / num_warps, 1, 1);
     const size_t prune_smem_size = num_warps * knn_graph_degree * (sizeof(IdxT) + sizeof(uint32_t));
     kern_fused_prune<IdxT, num_warps>
       <<<blocks_prune, threads_prune, prune_smem_size, raft::resource::get_cuda_stream(res)>>>(
@@ -1775,54 +1661,36 @@ void optimize(raft::resources const& res,
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "cagra::graph::optimize(%zu, %zu, %u)", graph_size, knn_graph_degree, output_graph_degree);
 
-  // check if input and output are both device accessible
-  // in this case we assume data to be ONLY device accessible and not host accessible
-  // furthermore we ensure all large allocations to go to the large workspace resource
-  // and all small allocations to go to the default workspace resource
-  bool inout_device_accessible = false;
-  {
-    bool input_device_accessible  = is_ptr_device_accessible(knn_graph.data_handle());
-    bool output_device_accessible = is_ptr_device_accessible(new_graph.data_handle());
-    RAFT_EXPECTS(input_device_accessible == output_device_accessible,
-                 "Input and output must be either both device accessible or both host accessible");
-    inout_device_accessible = input_device_accessible && output_device_accessible;
-  }
-
   // MST optimization
   // currently, only using GPU path for MST optimization
-  auto p_mst_graph           = raft::make_pinned_matrix<IdxT, int64_t, raft::row_major>(res, 0, 0);
-  auto p_mst_graph_num_edges = raft::make_pinned_vector<uint32_t>(res, graph_size);
-  auto p_mst_graph_num_edges_ptr = p_mst_graph_num_edges.data_handle();
-#pragma omp parallel for
-  for (uint64_t i = 0; i < graph_size; i++) {
-    p_mst_graph_num_edges_ptr[i] = 0;
-  }
+  auto mst_graph           = raft::make_host_matrix<IdxT, int64_t, raft::row_major>(0, 0);
+  auto mst_graph_num_edges = raft::make_host_vector<uint32_t, int64_t>(0);
+  // Assign (do not re-declare) inside the branch: merge_graph_gpu reads these outer variables.
   if (guarantee_connectivity) {
+    mst_graph_num_edges          = raft::make_host_vector<uint32_t, int64_t>(graph_size);
+    auto mst_graph_num_edges_ptr = mst_graph_num_edges.data_handle();
+#pragma omp parallel for
+    for (uint64_t i = 0; i < graph_size; i++) {
+      mst_graph_num_edges_ptr[i] = 0;
+    }
     raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
       "cagra::graph::optimize/check_connectivity");
-    p_mst_graph = raft::make_pinned_matrix<IdxT, int64_t, raft::row_major>(
-      res, graph_size, output_graph_degree);
+    mst_graph =
+      raft::make_host_matrix<IdxT, int64_t, raft::row_major>(graph_size, output_graph_degree);
     RAFT_LOG_INFO("MST optimization is used to guarantee graph connectivity.");
-    mst_optimization<IdxT>(
-      res, knn_graph, p_mst_graph.view(), p_mst_graph_num_edges.view(), use_gpu);
+    mst_optimization<IdxT>(res, knn_graph, mst_graph.view(), mst_graph_num_edges.view(), use_gpu);
 
     for (uint64_t i = 0; i < graph_size; i++) {
       if (i < 8 || i >= graph_size - 8) {
-        RAFT_LOG_DEBUG("# p_mst_graph_num_edges_ptr[%lu]: %u\n", i, p_mst_graph_num_edges_ptr[i]);
+        RAFT_LOG_DEBUG("# mst_graph_num_edges_ptr[%lu]: %u\n", i, mst_graph_num_edges_ptr[i]);
       }
     }
   }
 
   // prune graph -- will always use GPU path
   {
-    // should be noop in case input is already device accessible
-    device_matrix_view_from_host<IdxT, int64_t> d_input_graph(
-      res,
-      raft::make_host_matrix_view<IdxT, int64_t>(
-        knn_graph.data_handle(), graph_size, knn_graph_degree));
-
     prune_graph_gpu<IdxT>(res,
-                          d_input_graph.data_handle(),
+                          knn_graph.data_handle(),
                           graph_size,
                           knn_graph_degree,
                           new_graph.data_handle(),
@@ -1846,51 +1714,14 @@ void optimize(raft::resources const& res,
   RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms",
                  (time_make_end - time_make_start) * 1000.0);
 
-  // merge graph -- will use GPU path if possible, otherwise CPU path
-  // we only need to check in case output is not already device accessible
-  bool use_gpu_merge = use_gpu;
-  if (!inout_device_accessible) {
-    try {
-      auto d_new_graph = raft::make_device_mdarray<IdxT>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
-    } catch (std::bad_alloc& e) {
-      RAFT_LOG_DEBUG("Insufficient memory for merging on GPU");
-      use_gpu_merge = false;
-    } catch (raft::logic_error& e) {
-      RAFT_LOG_DEBUG("Insufficient memory for merging on GPU (logic error)");
-      use_gpu_merge = false;
-    }
-  }
-
-  if (use_gpu_merge) {
-    // should be noop in case output is already device accessible
-    device_matrix_view_from_host<IdxT, int64_t> d_new_graph(
-      res,
-      raft::make_host_matrix_view<IdxT, int64_t>(
-        new_graph.data_handle(), graph_size, output_graph_degree));
-
+  // merge graph -- will always use GPU path
+  {
     merge_graph_gpu<IdxT>(res,
-                          d_new_graph.data_handle(),
+                          new_graph.data_handle(),
                           d_rev_graph.data_handle(),
                           d_rev_graph_count.data_handle(),
-                          p_mst_graph.data_handle(),
-                          p_mst_graph_num_edges.data_handle(),
-                          graph_size,
-                          output_graph_degree,
-                          guarantee_connectivity);
-
-    if (d_new_graph.allocated_memory()) { raft::copy(res, new_graph, d_new_graph.view()); }
-  } else {
-    auto rev_graph       = raft::make_host_matrix<IdxT, int64_t>(graph_size, output_graph_degree);
-    auto rev_graph_count = raft::make_host_vector<uint32_t, int64_t>(graph_size);
-    raft::copy(res, rev_graph.view(), d_rev_graph.view());
-    raft::copy(res, rev_graph_count.view(), d_rev_graph_count.view());
-
-    merge_graph_cpu<IdxT>(new_graph.data_handle(),
-                          rev_graph.data_handle(),
-                          rev_graph_count.data_handle(),
-                          p_mst_graph.data_handle(),
-                          p_mst_graph_num_edges_ptr,
+                          mst_graph.data_handle(),
+                          mst_graph_num_edges.data_handle(),
                           graph_size,
                           output_graph_degree,
                           guarantee_connectivity);

From 00c42045aa9f0f7d148865ec7e570078e5f16658 Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Fri, 6 Mar 2026 00:01:10 +0000
Subject: [PATCH 074/119] batch reverse creation

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 116 ++++++++----------
 1 file changed, 52 insertions(+), 64 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 392edc97d9..9f2bc09d86 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -157,38 +157,24 @@ __global__ void kern_sort(const DATA_T* const dataset,  // [dataset_chunk_size,
   }
 }
 
-template <class IdxT>
-__global__ void kern_make_rev_graph(const IdxT* const dest_nodes,     // [graph_size]
-                                    IdxT* const rev_graph,            // [size, degree]
-                                    uint32_t* const rev_graph_count,  // [graph_size]
-                                    const uint32_t graph_size,
-                                    const uint32_t degree)
-{
-  const uint32_t tid  = threadIdx.x + (blockDim.x * blockIdx.x);
-  const uint32_t tnum = blockDim.x * gridDim.x;
-
-  for (uint32_t src_id = tid; src_id < graph_size; src_id += tnum) {
-    const IdxT dest_id = dest_nodes[src_id];
-    if (dest_id >= graph_size) continue;
-
-    const uint32_t pos = atomicAdd(rev_graph_count + dest_id, 1);
-    if (pos < degree) { rev_graph[pos + ((uint64_t)degree * dest_id)] = src_id; }
-  }
-}
-
 template <typename IdxT>
-__global__ void kern_make_rev_graph_k(const IdxT* const dest_nodes,     // [graph_size]
-                                      IdxT* const rev_graph,            // [size, degree]
-                                      uint32_t* const rev_graph_count,  // [graph_size]
-                                      const uint32_t graph_size,
-                                      const uint32_t degree,
-                                      uint64_t k)
+__global__ void kern_rev_graph_batched(const IdxT* const dest_nodes,     // [batch_size, degree]
+                                       IdxT* const rev_graph,            // [graph_size, degree]
+                                       uint32_t* const rev_graph_count,  // [graph_size]
+                                       const uint32_t graph_size,
+                                       const uint32_t degree,
+                                       const uint32_t batch_size,
+                                       const uint32_t batch_id)
 {
   const uint64_t tid  = threadIdx.x + (blockDim.x * blockIdx.x);
   const uint64_t tnum = blockDim.x * gridDim.x;
 
-  for (uint64_t src_id = tid; src_id < graph_size; src_id += tnum) {
-    IdxT dest_id = dest_nodes[k + (degree * src_id)];
+  const uint64_t block_batch_size = min(batch_size, graph_size - batch_id * batch_size);
+  // dest_nodes holds only this batch, so local row indices are rebased to global node ids below.
+  for (uint64_t idx = tid; idx < block_batch_size * degree; idx += tnum) {
+    const IdxT dest_id    = dest_nodes[idx];
+    const uint32_t src_id = batch_id * batch_size + idx / degree;
+
     if (dest_id >= graph_size) continue;
 
     const uint32_t pos = atomicAdd(rev_graph_count + dest_id, 1);
@@ -866,22 +852,18 @@ void merge_graph_gpu(raft::resources const& res,
                  (merge_graph_end - merge_graph_start) * 1000.0);
 }
 
-template <typename IdxT, typename InOutMatrixView>
+template <typename IdxT>
 void make_reverse_graph_gpu(raft::resources const& res,
+                            IdxT* output_graph_ptr,
                             IdxT* d_rev_graph_ptr,
                             uint32_t* d_rev_graph_count_ptr,
-                            InOutMatrixView new_graph)
+                            uint64_t graph_size,
+                            uint64_t output_graph_degree)
 {
-  const uint64_t graph_size          = new_graph.extent(0);
-  const uint64_t output_graph_degree = new_graph.extent(1);
-  const IdxT* output_graph_ptr       = new_graph.data_handle();
-
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
     "cagra::graph::optimize/reverse");
 
-  auto dest_nodes   = raft::make_host_vector<IdxT, int64_t>(graph_size);
-  auto d_dest_nodes = raft::make_device_mdarray<IdxT>(
-    res, raft::resource::get_workspace_resource(res), raft::make_extents<int64_t>(graph_size));
+  auto default_ws_mr = raft::resource::get_workspace_resource(res);
 
   raft::matrix::fill(
     res,
@@ -893,36 +875,38 @@ void make_reverse_graph_gpu(raft::resources const& res,
     raft::make_device_vector_view<IdxT, int64_t>(d_rev_graph_count_ptr, graph_size),
     uint32_t(0));
 
-  bool output_graph_device_accessible = is_ptr_device_accessible(output_graph_ptr);
-  dim3 threads(256, 1, 1);
-  dim3 blocks(1024, 1, 1);
+  const uint32_t batch_size =
+    std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
+  const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
 
-  for (uint64_t k = 0; k < output_graph_degree; k++) {
-    if (output_graph_device_accessible) {
-      kern_make_rev_graph_k<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
-        output_graph_ptr,
-        d_rev_graph_ptr,
-        d_rev_graph_count_ptr,
-        static_cast<uint32_t>(graph_size),
-        static_cast<uint32_t>(output_graph_degree),
-        k);
-    } else {
-#pragma omp parallel for
-      for (uint64_t i = 0; i < graph_size; i++) {
-        dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)];
-      }
-      raft::resource::sync_stream(res);
+  bool output_device_accessible = is_ptr_device_accessible(output_graph_ptr);
+  auto d_output_graph           = raft::make_device_mdarray<IdxT>(
+    res,
+    default_ws_mr,
+    raft::make_extents<int64_t>(output_device_accessible ? 0 : batch_size, output_graph_degree));
 
-      raft::copy(res, d_dest_nodes.view(), dest_nodes.view());
+  for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
+    dim3 threads(256, 1, 1);
+    dim3 blocks(1024, 1, 1);
 
-      kern_make_rev_graph<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
-        d_dest_nodes.data_handle(),
-        d_rev_graph_ptr,
-        d_rev_graph_count_ptr,
-        static_cast<uint32_t>(graph_size),
-        static_cast<uint32_t>(output_graph_degree));
+    if (!output_device_accessible) {
+      size_t copy_size =
+        std::min(static_cast<size_t>(batch_size), graph_size - i_batch * batch_size) *
+        output_graph_degree;
+      raft::copy(d_output_graph.data_handle(),
+                 output_graph_ptr + i_batch * batch_size * output_graph_degree,
+                 copy_size,
+                 raft::resource::get_cuda_stream(res));
     }
-    RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %lu    \r", k, output_graph_degree);
+    kern_rev_graph_batched<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
+      output_device_accessible ? output_graph_ptr + (i_batch * batch_size * output_graph_degree)
+                               : d_output_graph.data_handle(),
+      d_rev_graph_ptr,
+      d_rev_graph_count_ptr,
+      static_cast<uint32_t>(graph_size),
+      static_cast<uint32_t>(output_graph_degree),
+      static_cast<uint32_t>(batch_size),
+      static_cast<uint32_t>(i_batch));
   }
 
   raft::resource::sync_stream(res);
@@ -1707,8 +1691,12 @@ void optimize(raft::resources const& res,
 
   const double time_make_start = cur_time();
 
-  make_reverse_graph_gpu<IdxT>(
-    res, d_rev_graph.data_handle(), d_rev_graph_count.data_handle(), new_graph);
+  make_reverse_graph_gpu<IdxT>(res,
+                               new_graph.data_handle(),
+                               d_rev_graph.data_handle(),
+                               d_rev_graph_count.data_handle(),
+                               graph_size,
+                               output_graph_degree);
 
   const double time_make_end = cur_time();
   RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms",

From 9e63a7c442d6725703cbb52e575b68e2625f0694 Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Fri, 6 Mar 2026 12:05:48 +0000
Subject: [PATCH 075/119] add prefetch view to handle managed & host

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh |  17 +-
 cpp/src/neighbors/detail/cagra/utils.hpp      | 300 +++++++++++++++++-
 2 files changed, 313 insertions(+), 4 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 9f2bc09d86..28006fa133 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -1543,8 +1543,19 @@ void prune_graph_gpu(raft::resources const& res,
   auto host_stats                           = raft::make_host_vector<uint64_t>(2);
   raft::matrix::fill(res, dev_stats.view(), uint64_t(0));
 
-  device_matrix_view_from_host<IdxT, int64_t> d_input_graph(
-    res, raft::make_host_matrix_view<IdxT, int64_t>(knn_graph_ptr, graph_size, knn_graph_degree));
+  // device_matrix_view_from_host<IdxT, int64_t> d_input_graph(
+  //   res, raft::make_host_matrix_view<IdxT, int64_t>(knn_graph_ptr, graph_size,
+  //   knn_graph_degree));
+
+  batched_device_view_from_host<IdxT, int64_t> d_input_graph(
+    res,
+    raft::make_host_matrix_view<IdxT, int64_t>(knn_graph_ptr, graph_size, knn_graph_degree),
+    /*batch_size*/ graph_size,
+    /*read_only*/ true,
+    /*host_writeback*/ false,
+    /*initialize*/ true,
+    /*evict*/ true);
+  auto input_view = d_input_graph.next_view();
 
   auto d_invalid_neighbor_list = raft::make_device_scalar<uint32_t>(res, 0u);
 
@@ -1561,7 +1572,7 @@ void prune_graph_gpu(raft::resources const& res,
     const size_t prune_smem_size = num_warps * knn_graph_degree * (sizeof(IdxT) + sizeof(uint32_t));
     kern_fused_prune<IdxT, num_warps>
       <<<blocks_prune, threads_prune, prune_smem_size, raft::resource::get_cuda_stream(res)>>>(
-        d_input_graph.data_handle(),
+        input_view.data_handle(),
         output_device_accessible ? output_graph_ptr + i_batch * batch_size * output_graph_degree
                                  : d_output_graph.data_handle(),
         graph_size,
diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp
index a59ac7fd57..c3d15e59f4 100644
--- a/cpp/src/neighbors/detail/cagra/utils.hpp
+++ b/cpp/src/neighbors/detail/cagra/utils.hpp
@@ -9,9 +9,13 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/host_mdarray.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/cuda_stream_pool.hpp>
+#include <raft/core/resource/device_id.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/core/resources.hpp>
 #include <raft/matrix/init.cuh>
 #include <raft/util/integer_utils.hpp>
-
 #include <rmm/resource_ref.hpp>
 
 #include <cuda.h>
@@ -308,4 +312,298 @@ void copy_with_padding(
                                     raft::resource::get_cuda_stream(res)));
   }
 }
+
+/**
+ * Utility to create a batched device view from a host view
+ *
+ * This utility will create a batched device view from a host view and will handle the prefetch and
+ * writeback of the data. Each batch can be referenced exactly once by calling the next_view()
+ * function.
+ *
+ * @tparam T The type of the data
+ * @tparam IdxT The type of the index
+ * @param res The resources
+ * @param host_view The host view to create the batched device view from
+ * @param batch_size The batch size
+ * @param read_only Whether the data is read only (only for managed memory)
+ * @param host_writeback Whether to write back the data to the host (only for host memory)
+ * @param initialize Whether to initialize the data (only for managed memory)
+ * @param evict Whether to evict the data (only for managed memory)
+ *
+ * @return The batched device view
+ */
+template <typename T, typename IdxT>
+class batched_device_view_from_host {
+ public:
+  batched_device_view_from_host(raft::resources const& res,
+                                raft::host_matrix_view<T, IdxT> host_view,
+                                uint64_t batch_size,
+                                bool read_only      = false,
+                                bool host_writeback = false,
+                                bool initialize     = true,
+                                bool evict          = false)
+    : res_(res),
+      host_view_(host_view),
+      batch_size_(batch_size),
+      offset_(0),
+      batch_id_(0),
+      num_buffers_(2),
+      read_only_(read_only),
+      host_writeback_(host_writeback),
+      next_buffer_pos_(0),
+      evict_(evict),
+      initialize_(initialize)
+  {
+    cudaPointerAttributes attr;
+    RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle()));
+    mem_type_ = attr.type;
+    // cudaMemoryTypeUnregistered = 0
+    // cudaMemoryTypeHost = 1
+    // cudaMemoryTypeDevice = 2
+    // cudaMemoryTypeManaged = 3
+
+    prefetch_stream_  = raft::resource::get_cuda_stream(res);
+    writeback_stream_ = raft::resource::get_cuda_stream(res);
+    if (res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL)) {
+      if (raft::resource::get_stream_pool_size(res) >= 1) {
+        prefetch_stream_  = raft::resource::get_stream_from_stream_pool(res);
+        writeback_stream_ = raft::resource::get_stream_from_stream_pool(res);
+      }
+    }
+
+    // allocations
+    if (mem_type_ == cudaMemoryTypeHost || mem_type_ == cudaMemoryTypeUnregistered) {
+      device_mem_[0].emplace(raft::make_device_mdarray<T, IdxT>(
+        res,
+        raft::resource::get_large_workspace_resource(res),
+        raft::make_extents<int64_t>(batch_size, host_view.extent(1))));
+      device_ptr[0] = device_mem_[0]->data_handle();
+      if (batch_size < static_cast<uint64_t>(host_view.extent(0))) {
+        device_mem_[1].emplace(raft::make_device_mdarray<T, IdxT>(
+          res,
+          raft::resource::get_large_workspace_resource(res),
+          raft::make_extents<int64_t>(batch_size, host_view.extent(1))));
+        device_ptr[1] = device_mem_[1]->data_handle();
+      }
+      if (host_writeback_ && batch_size * 2 < static_cast<uint64_t>(host_view.extent(0))) {
+        num_buffers_ = 3;
+        device_mem_[2].emplace(raft::make_device_mdarray<T, IdxT>(
+          res,
+          raft::resource::get_large_workspace_resource(res),
+          raft::make_extents<int64_t>(batch_size, host_view.extent(1))));
+        device_ptr[2] = device_mem_[2]->data_handle();
+      }
+    }
+
+    // if data is managed and read_only_ is set we can advise the driver that it is read-mostly
+    if (mem_type_ == cudaMemoryTypeManaged) {
+      // location_.type = CU_MEM_LOCATION_TYPE_DEVICE;
+      location_.type = cudaMemLocationTypeDevice;
+      location_.id   = static_cast<CUdevice>(raft::resource::get_device_id(res_));
+      if (read_only_) {
+#if CUDA_VERSION >= 13000
+        RAFT_CUDA_TRY(cudaMemAdvise(host_view_.data_handle(),
+                                    host_view_.extent(0) * host_view_.extent(1) * sizeof(T),
+                                    cudaMemAdviseSetReadMostly,
+                                    location_));
+#else
+        RAFT_CUDA_TRY(cudaMemAdvise_v2(host_view_.data_handle(),
+                                       host_view_.extent(0) * host_view_.extent(1) * sizeof(T),
+                                       cudaMemAdviseSetReadMostly,
+                                       location_));
+#endif
+        // TODO maybe also reset upon destruction
+      }
+    }
+
+    // prefetch next batch (0)
+    prefetch_next_batch();
+  }
+
+  bool prefetch_next_batch()
+  {
+    // this function will ensure the device_ptr [next_buffer_pos_] is pointing to the correct memory
+    // after the next synchronization with the prefetch stream
+
+    // if data is on host and we are writing to it we will have to copy it back
+    // if data is on host we will have to copy it to the device_ptr
+
+    // if data is managed and evict_ is true we can evict the data from device memory
+    // if data is managed we have to prefetch it
+
+    bool next_batch_exists = offset_ < static_cast<uint64_t>(host_view_.extent(0));
+
+    if (next_batch_exists) {
+      actual_batch_size_[next_buffer_pos_] =
+        next_batch_exists ? min(batch_size_, host_view_.extent(0) - offset_) : 0;
+
+      switch (mem_type_) {
+        case cudaMemoryTypeManaged:
+#if CUDA_VERSION >= 13000
+          if (evict_ && batch_id_ > 1) {
+            // evict last active
+            CUdeviceptr dptrs[]      = {device_ptr[next_buffer_pos_]};
+            size_t sizes[]           = {batch_size_ * host_view_.extent(1) * sizeof(T)};
+            size_t prefetchLocIdxs[] = {0};
+            RAFT_CUDA_TRY(cuMemDiscardBatchAsync(
+              dptrs, sizes, 1, &location_, prefetchLocIdxs, 1, 0, prefetch_stream_));
+          }
+#endif
+          // prefetch
+          device_ptr[next_buffer_pos_] = host_view_.data_handle() + offset_ * host_view_.extent(1);
+          if (initialize_) {
+            // managed API call to prefetch async
+#if CUDA_VERSION >= 13000
+            RAFT_CUDA_TRY(cudaMemPrefetchAsync(
+              device_ptr[next_buffer_pos_],
+              actual_batch_size_[next_buffer_pos_] * host_view_.extent(1) * sizeof(T),
+              location_,
+              0,
+              prefetch_stream_));
+#else
+            RAFT_CUDA_TRY(cudaMemPrefetchAsync_v2(
+              device_ptr[next_buffer_pos_],
+              actual_batch_size_[next_buffer_pos_] * host_view_.extent(1) * sizeof(T),
+              location_,
+              0,
+              prefetch_stream_));
+#endif
+          } else {
+            // managed API call to cuMemDiscardAndPrefetchBatchAsync (discard and prefetch batch)
+#if CUDA_VERSION >= 13000
+            CUdeviceptr dptrs[] = {device_ptr[next_buffer_pos_]};
+            size_t sizes[]      = {actual_batch_size_[next_buffer_pos_] * host_view_.extent(1) *
+                                   sizeof(T)};
+            size_t prefetchLocIdxs[] = {0};
+            RAFT_CUDA_TRY(cuMemDiscardAndPrefetchBatchAsync(
+              dptrs, sizes, 1, &location_, prefetchLocIdxs, 1, 0, prefetch_stream_));
+#endif
+          }
+
+          break;
+        case cudaMemoryTypeHost:
+        case cudaMemoryTypeUnregistered:
+          if (host_writeback_ && batch_id_ > 1) {
+            writeback_stream_.synchronize();
+            // copy back last active
+            uint32_t writeback_pos    = (next_buffer_pos_ + num_buffers_ - 2) % num_buffers_;
+            uint64_t writeback_offset = (offset_ - 2 * batch_size_) * host_view_.extent(1);
+            raft::copy(host_view_.data_handle() + writeback_offset,
+                       device_ptr[writeback_pos],
+                       actual_batch_size_[writeback_pos] * host_view_.extent(1),
+                       writeback_stream_);
+          }
+          if (initialize_) {
+            // prefetch next position
+            raft::copy(device_ptr[next_buffer_pos_],
+                       host_view_.data_handle() + offset_ * host_view_.extent(1),
+                       actual_batch_size_[next_buffer_pos_] * host_view_.extent(1),
+                       prefetch_stream_);
+          }
+
+          break;
+        case cudaMemoryTypeDevice:
+          // just move pointer to next position
+          device_ptr[next_buffer_pos_] = host_view_.data_handle() + offset_ * host_view_.extent(1);
+          break;
+      }
+
+      offset_ += actual_batch_size_[next_buffer_pos_];
+      // swap next_buffer_pos_
+      next_buffer_pos_ = (next_buffer_pos_ + 1) % num_buffers_;
+    }
+
+    return next_batch_exists;
+  }
+
+  ~batched_device_view_from_host() noexcept
+  {
+    prefetch_stream_.synchronize();
+    writeback_stream_.synchronize();
+    raft::resource::sync_stream(res_);
+
+    // if data is on host and for_write --> make sure to copy back last active
+    // if data is managed and evict --> evict last active
+
+    // make sure to sync on prefetch & writeback stream & res
+    switch (mem_type_) {
+      case cudaMemoryTypeManaged:
+#if CUDA_VERSION >= 13000
+        if (evict_ && batch_id_ > 0) {
+          // managed API call to evict 2
+          uint32_t evict_pos       = (next_buffer_pos_ + num_buffers_ - 1) % num_buffers_;
+          CUdeviceptr dptrs[]      = {device_ptr[evict_pos]};
+          size_t sizes[]           = {batch_size_ * host_view_.extent(1) * sizeof(T)};
+          size_t prefetchLocIdxs[] = {0};
+          RAFT_CUDA_TRY(cuMemDiscardBatchAsync(
+            dptrs, sizes, 1, &location_, prefetchLocIdxs, 1, 0, prefetch_stream_));
+        }
+        prefetch_stream_.synchronize();
+#endif
+        break;
+      case cudaMemoryTypeHost:
+      case cudaMemoryTypeUnregistered:
+        if (host_writeback_ && batch_id_ > 0) {
+          // TODO managed API call to copy back last active
+          uint32_t writeback_pos = (next_buffer_pos_ + num_buffers_ - 1) % num_buffers_;
+          uint64_t writeback_offset =
+            (offset_ - actual_batch_size_[writeback_pos]) * host_view_.extent(1);
+          raft::copy(host_view_.data_handle() + writeback_offset,
+                     device_ptr[writeback_pos],
+                     actual_batch_size_[writeback_pos] * host_view_.extent(1),
+                     writeback_stream_);
+        }
+        writeback_stream_.synchronize();
+        break;
+      case cudaMemoryTypeDevice: break;
+    }
+  }
+
+  /**
+   * Returns the next view of the batch
+   *
+   * This function will ensure the next batch is ready and will trigger the prefetch of the
+   * subsequent next batch
+   *
+   * @return The next view of the batch
+   */
+  raft::device_matrix_view<T, IdxT> next_view()
+  {
+    RAFT_EXPECTS(batch_id_ * batch_size_ < host_view_.extent(0), "Batch index out of bounds");
+
+    // ensure current batch is ready
+    prefetch_stream_.synchronize();
+
+    // trigger prefetch of next batch
+    bool next_batch_exists = prefetch_next_batch();
+
+    batch_id_++;
+
+    uint32_t current_pos =
+      (next_buffer_pos_ + num_buffers_ - (next_batch_exists ? 2 : 1)) % num_buffers_;
+    return raft::make_device_matrix_view<T, IdxT>(
+      device_ptr[current_pos], actual_batch_size_[current_pos], host_view_.extent(1));
+  }
+
+ private:
+  cudaMemoryType mem_type_;
+  const raft::resources& res_;
+  uint64_t batch_size_;
+  uint64_t offset_;
+  uint64_t num_buffers_;
+  bool initialize_;
+  rmm::cuda_stream_view prefetch_stream_;
+  rmm::cuda_stream_view writeback_stream_;
+  bool read_only_;
+  bool host_writeback_;
+  bool evict_;
+  int32_t next_buffer_pos_;
+  int32_t batch_id_;
+  cudaMemLocation location_;
+  std::optional<raft::device_matrix<T, IdxT>> device_mem_[3];
+  raft::host_matrix_view<T, IdxT> host_view_;
+  T* device_ptr[3];
+  uint32_t actual_batch_size_[3];
+};
+
 }  // namespace cuvs::neighbors::cagra::detail

From a38ad525570d31882a1c86ff04eb679a6b1c4476 Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Mon, 9 Mar 2026 20:49:08 +0000
Subject: [PATCH 076/119] fix batched iterator

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 123 +++----
 cpp/src/neighbors/detail/cagra/utils.hpp      | 313 ++++++++++--------
 2 files changed, 233 insertions(+), 203 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 28006fa133..ef8b1f8daf 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -22,6 +22,7 @@
 
 #include <raft/util/bitonic_sort.cuh>
 #include <raft/util/cuda_rt_essentials.hpp>
+#include <raft/util/integer_utils.hpp>
 
 #include <cuda_fp16.h>
 
@@ -324,14 +325,14 @@ __device__ void thread_shift_array(T* array, uint64_t num)
 }
 
 template <typename IdxT, uint32_t num_warps>
-__global__ void kern_merge_graph(IdxT* output_graph,  // [batch_size, output_graph_degree]
-                                 const IdxT* const rev_graph,
+__global__ void kern_merge_graph(IdxT* output_graph,           // [batch_size, output_graph_degree]
+                                 const IdxT* const rev_graph,  // [graph_size, output_graph_degree]
                                  uint32_t* const rev_graph_count,  // [graph_size]
                                  const uint32_t graph_size,
                                  const uint32_t output_graph_degree,
-                                 const IdxT* const mst_graph,
+                                 const IdxT* const mst_graph,  // [batch_size, output_graph_degree]
                                  const uint32_t mst_graph_degree,
-                                 const uint32_t* const mst_graph_num_edges_ptr,
+                                 const uint32_t* const mst_graph_num_edges_ptr,  // [batch_size]
                                  const uint32_t batch_size,
                                  const uint32_t batch_id,
                                  bool guarantee_connectivity,
@@ -350,12 +351,12 @@ __global__ void kern_merge_graph(IdxT* output_graph,  // [batch_size, output_gra
 
   if (nid >= graph_size) { return; }
 
-  const auto mst_graph_num_edges = guarantee_connectivity ? mst_graph_num_edges_ptr[nid] : 0;
+  const auto mst_graph_num_edges = guarantee_connectivity ? mst_graph_num_edges_ptr[nid_batch] : 0;
   // If guarantee_connectivity == true, use a temporal list to merge the
   // neighbor lists of the graphs.
   if (guarantee_connectivity) {
     for (uint32_t i = lane_id; i < mst_graph_degree; i += raft::WarpSize) {
-      smem_sorted_output_graph[i] = mst_graph[nid * mst_graph_degree + i];
+      smem_sorted_output_graph[i] = mst_graph[nid_batch * mst_graph_degree + i];
     }
     __syncwarp();
     for (uint32_t pruned_j = 0, output_j = mst_graph_num_edges;
@@ -788,52 +789,54 @@ void merge_graph_gpu(raft::resources const& res,
     std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
   const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
 
-  bool output_device_accessible = is_ptr_device_accessible(output_graph_ptr);
-  auto d_output_graph           = raft::make_device_mdarray<IdxT>(
+  batched_device_view_from_host<IdxT, int64_t> d_output_graph(
     res,
-    default_ws_mr,
-    raft::make_extents<int64_t>(output_device_accessible ? 0 : batch_size, output_graph_degree));
+    raft::make_host_matrix_view<IdxT, int64_t>(output_graph_ptr, graph_size, output_graph_degree),
+    /*batch_size*/ batch_size,
+    /*host_writeback*/ true,
+    /*initialize*/ true,
+    /*hmm_as_managed*/ false);
 
-  device_matrix_view_from_host<IdxT, int64_t> d_mst_graph(
+  batched_device_view_from_host<IdxT, int64_t> d_mst_graph(
     res,
     raft::make_host_matrix_view<IdxT, int64_t>(
-      mst_graph_ptr, guarantee_connectivity ? graph_size : 0, output_graph_degree));
+      mst_graph_ptr, guarantee_connectivity ? graph_size : 0, output_graph_degree),
+    /*batch_size*/ batch_size,
+    /*host_writeback*/ false,
+    /*initialize*/ true,
+    /*hmm_as_managed*/ false);
 
-  device_matrix_view_from_host<uint32_t, int64_t> d_mst_graph_num_edges(
+  batched_device_view_from_host<IdxT, int64_t> d_mst_graph_num_edges(
     res,
-    raft::make_host_matrix_view<uint32_t, int64_t>(
-      mst_graph_num_edges_ptr, guarantee_connectivity ? graph_size : 0, 1));
+    raft::make_host_matrix_view<IdxT, int64_t>(
+      mst_graph_ptr, guarantee_connectivity ? graph_size : 0, output_graph_degree),
+    /*batch_size*/ batch_size,
+    /*host_writeback*/ false,
+    /*initialize*/ true,
+    /*hmm_as_managed*/ false);
 
   const uint32_t num_warps = 4;
   const dim3 threads_merge(raft::WarpSize * num_warps, 1, 1);
-  const dim3 blocks_merge(batch_size / num_warps, 1, 1);
+  const dim3 blocks_merge(raft::ceildiv(batch_size, num_warps), 1, 1);
   const size_t merge_smem_size = num_warps * output_graph_degree * sizeof(IdxT);
   for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
+    auto mst_graph_view           = d_mst_graph.next_view();
+    auto mst_graph_num_edges_view = d_mst_graph_num_edges.next_view();
+    auto output_view              = d_output_graph.next_view();
     kern_merge_graph<IdxT, num_warps>
       <<<blocks_merge, threads_merge, merge_smem_size, raft::resource::get_cuda_stream(res)>>>(
-        output_device_accessible ? output_graph_ptr + (i_batch * batch_size * output_graph_degree)
-                                 : d_output_graph.data_handle(),
+        output_view.data_handle(),
         d_rev_graph_ptr,
         d_rev_graph_count_ptr,
         static_cast<uint32_t>(graph_size),
         static_cast<uint32_t>(output_graph_degree),
-        d_mst_graph.data_handle(),
+        mst_graph_view.data_handle(),
         static_cast<uint32_t>(output_graph_degree),
-        d_mst_graph_num_edges.data_handle(),
+        mst_graph_num_edges_view.data_handle(),
         batch_size,
         i_batch,
         guarantee_connectivity,
         d_check_num_protected_edges.data_handle());
-
-    if (!output_device_accessible) {
-      size_t copy_size =
-        std::min(static_cast<size_t>(batch_size), graph_size - i_batch * batch_size) *
-        output_graph_degree;
-      raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree,
-                 d_output_graph.data_handle(),
-                 copy_size,
-                 raft::resource::get_cuda_stream(res));
-    }
   }
 
   bool check_num_protected_edges = true;
@@ -879,28 +882,21 @@ void make_reverse_graph_gpu(raft::resources const& res,
     std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
   const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
 
-  bool output_device_accessible = is_ptr_device_accessible(output_graph_ptr);
-  auto d_output_graph           = raft::make_device_mdarray<IdxT>(
+  batched_device_view_from_host<IdxT, int64_t> d_output_graph(
     res,
-    default_ws_mr,
-    raft::make_extents<int64_t>(output_device_accessible ? 0 : batch_size, output_graph_degree));
+    raft::make_host_matrix_view<IdxT, int64_t>(output_graph_ptr, graph_size, output_graph_degree),
+    /*batch_size*/ batch_size,
+    /*host_writeback*/ false,
+    /*initialize*/ true,
+    /*hmm_as_managed*/ false);
 
   for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
     dim3 threads(256, 1, 1);
     dim3 blocks(1024, 1, 1);
+    auto output_view = d_output_graph.next_view();
 
-    if (!output_device_accessible) {
-      size_t copy_size =
-        std::min(static_cast<size_t>(batch_size), graph_size - i_batch * batch_size) *
-        output_graph_degree;
-      raft::copy(d_output_graph.data_handle(),
-                 output_graph_ptr + i_batch * batch_size * output_graph_degree,
-                 copy_size,
-                 raft::resource::get_cuda_stream(res));
-    }
     kern_rev_graph_batched<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
-      output_device_accessible ? output_graph_ptr + (i_batch * batch_size * output_graph_degree)
-                               : d_output_graph.data_handle(),
+      output_view.data_handle(),
       d_rev_graph_ptr,
       d_rev_graph_count_ptr,
       static_cast<uint32_t>(graph_size),
@@ -1543,38 +1539,35 @@ void prune_graph_gpu(raft::resources const& res,
   auto host_stats                           = raft::make_host_vector<uint64_t>(2);
   raft::matrix::fill(res, dev_stats.view(), uint64_t(0));
 
-  // device_matrix_view_from_host<IdxT, int64_t> d_input_graph(
-  //   res, raft::make_host_matrix_view<IdxT, int64_t>(knn_graph_ptr, graph_size,
-  //   knn_graph_degree));
-
   batched_device_view_from_host<IdxT, int64_t> d_input_graph(
     res,
     raft::make_host_matrix_view<IdxT, int64_t>(knn_graph_ptr, graph_size, knn_graph_degree),
     /*batch_size*/ graph_size,
-    /*read_only*/ true,
     /*host_writeback*/ false,
     /*initialize*/ true,
-    /*evict*/ true);
+    /*hmm_as_managed*/ true);
   auto input_view = d_input_graph.next_view();
 
-  auto d_invalid_neighbor_list = raft::make_device_scalar<uint32_t>(res, 0u);
-
-  bool output_device_accessible = is_ptr_device_accessible(output_graph_ptr);
-  auto d_output_graph           = raft::make_device_mdarray<IdxT>(
+  batched_device_view_from_host<IdxT, int64_t> d_output_graph(
     res,
-    default_ws_mr,
-    raft::make_extents<int64_t>(output_device_accessible ? 0 : batch_size, output_graph_degree));
+    raft::make_host_matrix_view<IdxT, int64_t>(output_graph_ptr, graph_size, output_graph_degree),
+    /*batch_size*/ batch_size,
+    /*host_writeback*/ true,
+    /*initialize*/ false,
+    /*hmm_as_managed*/ false);
+
+  auto d_invalid_neighbor_list = raft::make_device_scalar<uint32_t>(res, 0u);
 
   for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
+    auto output_view         = d_output_graph.next_view();
     const uint32_t num_warps = 4;
     const dim3 threads_prune(raft::WarpSize * num_warps, 1, 1);
-    const dim3 blocks_prune(batch_size / num_warps, 1, 1);
+    const dim3 blocks_prune(raft::ceildiv(batch_size, num_warps), 1, 1);
     const size_t prune_smem_size = num_warps * knn_graph_degree * (sizeof(IdxT) + sizeof(uint32_t));
     kern_fused_prune<IdxT, num_warps>
       <<<blocks_prune, threads_prune, prune_smem_size, raft::resource::get_cuda_stream(res)>>>(
         input_view.data_handle(),
-        output_device_accessible ? output_graph_ptr + i_batch * batch_size * output_graph_degree
-                                 : d_output_graph.data_handle(),
+        output_view.data_handle(),
         graph_size,
         knn_graph_degree,
         output_graph_degree,
@@ -1583,16 +1576,6 @@ void prune_graph_gpu(raft::resources const& res,
         d_invalid_neighbor_list.data_handle(),
         dev_stats.data_handle());
 
-    if (!output_device_accessible) {
-      size_t copy_size =
-        std::min(static_cast<size_t>(batch_size), graph_size - i_batch * batch_size) *
-        output_graph_degree;
-      raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree,
-                 d_output_graph.data_handle(),
-                 copy_size,
-                 raft::resource::get_cuda_stream(res));
-    }
-
     raft::resource::sync_stream(res);
     RAFT_LOG_DEBUG(
       "# Pruning kNN Graph on GPUs (%.1lf %%)\r",
diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp
index c3d15e59f4..df6ef1ce6f 100644
--- a/cpp/src/neighbors/detail/cagra/utils.hpp
+++ b/cpp/src/neighbors/detail/cagra/utils.hpp
@@ -16,6 +16,7 @@
 #include <raft/core/resources.hpp>
 #include <raft/matrix/init.cuh>
 #include <raft/util/integer_utils.hpp>
+#include <rmm/cuda_stream_pool.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <cuda.h>
@@ -23,6 +24,7 @@
 
 #include <cfloat>
 #include <cstdint>
+#include <iostream>
 #include <type_traits>
 
 namespace cuvs::neighbors::cagra::detail {
@@ -328,7 +330,7 @@ void copy_with_padding(
  * @param read_only Whether the data is read only (only for managed memory)
  * @param host_writeback Whether to write back the data to the host (only for host memory)
  * @param initialize Whether to initialize the data (only for managed memory)
- * @param evict Whether to evict the data (only for managed memory)
+ * @param discard Whether to discard the data (only for managed memory)
  *
  * @return The batched device view
  */
@@ -338,22 +340,24 @@ class batched_device_view_from_host {
   batched_device_view_from_host(raft::resources const& res,
                                 raft::host_matrix_view<T, IdxT> host_view,
                                 uint64_t batch_size,
-                                bool read_only      = false,
                                 bool host_writeback = false,
                                 bool initialize     = true,
-                                bool evict          = false)
+                                bool hmm_as_managed = false)
     : res_(res),
       host_view_(host_view),
       batch_size_(batch_size),
       offset_(0),
-      batch_id_(0),
+      batch_id_(-2),
       num_buffers_(2),
-      read_only_(read_only),
       host_writeback_(host_writeback),
-      next_buffer_pos_(0),
-      evict_(evict),
-      initialize_(initialize)
+      initialize_(initialize),
+      hmm_as_managed_(hmm_as_managed)
   {
+    if (host_view.extent(0) == 0) {
+      mem_type_ = cudaMemoryTypeDevice;
+      return;
+    }
+
     cudaPointerAttributes attr;
     RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle()));
     mem_type_ = attr.type;
@@ -361,27 +365,35 @@ class batched_device_view_from_host {
     // cudaMemoryTypeHost = 1
     // cudaMemoryTypeDevice = 2
     // cudaMemoryTypeManaged = 3
+    //
+    // On HMM systems, unregistered (malloc) memory can have devicePointer != nullptr,
+    // meaning it's directly accessible from the GPU. Treat it like managed memory:
+    if (mem_type_ == cudaMemoryTypeUnregistered && attr.devicePointer != nullptr &&
+        hmm_as_managed) {
+      mem_type_ = cudaMemoryTypeManaged;
+    }
 
-    prefetch_stream_  = raft::resource::get_cuda_stream(res);
-    writeback_stream_ = raft::resource::get_cuda_stream(res);
-    if (res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL)) {
-      if (raft::resource::get_stream_pool_size(res) >= 1) {
-        prefetch_stream_  = raft::resource::get_stream_from_stream_pool(res);
-        writeback_stream_ = raft::resource::get_stream_from_stream_pool(res);
-      }
+    if (res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL) &&
+        raft::resource::get_stream_pool_size(res) >= 1) {
+      prefetch_stream_  = raft::resource::get_stream_from_stream_pool(res);
+      writeback_stream_ = raft::resource::get_stream_from_stream_pool(res);
+    } else {
+      local_stream_pool_ = std::make_shared<rmm::cuda_stream_pool>(2);
+      prefetch_stream_   = local_stream_pool_.value()->get_stream();
+      writeback_stream_  = local_stream_pool_.value()->get_stream();
     }
 
     // allocations
     if (mem_type_ == cudaMemoryTypeHost || mem_type_ == cudaMemoryTypeUnregistered) {
       device_mem_[0].emplace(raft::make_device_mdarray<T, IdxT>(
         res,
-        raft::resource::get_large_workspace_resource(res),
+        raft::resource::get_workspace_resource(res),
         raft::make_extents<int64_t>(batch_size, host_view.extent(1))));
       device_ptr[0] = device_mem_[0]->data_handle();
       if (batch_size < static_cast<uint64_t>(host_view.extent(0))) {
         device_mem_[1].emplace(raft::make_device_mdarray<T, IdxT>(
           res,
-          raft::resource::get_large_workspace_resource(res),
+          raft::resource::get_workspace_resource(res),
           raft::make_extents<int64_t>(batch_size, host_view.extent(1))));
         device_ptr[1] = device_mem_[1]->data_handle();
       }
@@ -389,7 +401,7 @@ class batched_device_view_from_host {
         num_buffers_ = 3;
         device_mem_[2].emplace(raft::make_device_mdarray<T, IdxT>(
           res,
-          raft::resource::get_large_workspace_resource(res),
+          raft::resource::get_workspace_resource(res),
           raft::make_extents<int64_t>(batch_size, host_view.extent(1))));
         device_ptr[2] = device_mem_[2]->data_handle();
       }
@@ -400,18 +412,9 @@ class batched_device_view_from_host {
       // location_.type = CU_MEM_LOCATION_TYPE_DEVICE;
       location_.type = cudaMemLocationTypeDevice;
       location_.id   = static_cast<CUdevice>(raft::resource::get_device_id(res_));
-      if (read_only_) {
-#if CUDA_VERSION >= 13000
-        RAFT_CUDA_TRY(cudaMemAdvise(host_view_.data_handle(),
-                                    host_view_.extent(0) * host_view_.extent(1) * sizeof(T),
-                                    cudaMemAdviseSetReadMostly,
-                                    location_));
-#else
-        RAFT_CUDA_TRY(cudaMemAdvise_v2(host_view_.data_handle(),
-                                       host_view_.extent(0) * host_view_.extent(1) * sizeof(T),
-                                       cudaMemAdviseSetReadMostly,
-                                       location_));
-#endif
+      if (!host_writeback_) {
+        advise_read_mostly(host_view_.data_handle(),
+                           host_view_.extent(0) * host_view_.extent(1) * sizeof(T));
         // TODO maybe also reset upon destruction
       }
     }
@@ -422,95 +425,72 @@ class batched_device_view_from_host {
 
   bool prefetch_next_batch()
   {
-    // this function will ensure the device_ptr [next_buffer_pos_] is pointing to the correct memory
-    // after the next synchronization with the prefetch stream
+    batch_id_++;
+
+    // ensure previous batch at position batch_id_ is ready
+    prefetch_stream_.synchronize();
+    if (host_writeback_) { writeback_stream_.synchronize(); }
 
-    // if data is on host and we are writing to it we will have to copy it back
-    // if data is on host we will have to copy it to the device_ptr
+    // this step will
+    // * write back data from batch_id_ - 1
+    // * prefetch data for batch_id_ + 1
 
-    // if data is managed and evict_ is true we can evict the data from device memory
-    // if data is managed we have to prefetch it
+    // if data is on host and host_writeback_ is true we will have to copy it back
+    // if data is on host and initialize_ is true we will have to copy it to the device_ptr
+
+    // if data is managed and !host_writeback_ we can discard the data from device memory
+    // if data is managed and initialize_ is true we can prefetch it to the device
+    // if data is managed and !initialize_ we can discard and prefetch the data location
+
+    // if data is on device only this is almost a noop, just prepping the pointers
+
+    RAFT_EXPECTS(offset_ <= host_view_.extent(0), "Offset out of bounds");
 
     bool next_batch_exists = offset_ < static_cast<uint64_t>(host_view_.extent(0));
 
     if (next_batch_exists) {
-      actual_batch_size_[next_buffer_pos_] =
-        next_batch_exists ? min(batch_size_, host_view_.extent(0) - offset_) : 0;
+      // synchronize to ensure all previous operations are completed
+      // in particular all work on batch_id_ - 1
+      raft::resource::sync_stream(res_);
+
+      int32_t prefetch_pos             = (batch_id_ + 1) % num_buffers_;
+      actual_batch_size_[prefetch_pos] = min(batch_size_, host_view_.extent(0) - offset_);
 
       switch (mem_type_) {
         case cudaMemoryTypeManaged:
-#if CUDA_VERSION >= 13000
-          if (evict_ && batch_id_ > 1) {
-            // evict last active
-            CUdeviceptr dptrs[]      = {device_ptr[next_buffer_pos_]};
-            size_t sizes[]           = {batch_size_ * host_view_.extent(1) * sizeof(T)};
-            size_t prefetchLocIdxs[] = {0};
-            RAFT_CUDA_TRY(cuMemDiscardBatchAsync(
-              dptrs, sizes, 1, &location_, prefetchLocIdxs, 1, 0, prefetch_stream_));
+          if (!host_writeback_ && batch_id_ > 1) {
+            uint32_t discard_pos = (batch_id_ - 1) % num_buffers_;
+            size_t discard_size  = batch_size_ * host_view_.extent(1) * sizeof(T);
+            discard_managed_region(device_ptr[discard_pos], discard_size);
           }
-#endif
-          // prefetch
-          device_ptr[next_buffer_pos_] = host_view_.data_handle() + offset_ * host_view_.extent(1);
-          if (initialize_) {
-            // managed API call to prefetch async
-#if CUDA_VERSION >= 13000
-            RAFT_CUDA_TRY(cudaMemPrefetchAsync(
-              device_ptr[next_buffer_pos_],
-              actual_batch_size_[next_buffer_pos_] * host_view_.extent(1) * sizeof(T),
-              location_,
-              0,
-              prefetch_stream_));
-#else
-            RAFT_CUDA_TRY(cudaMemPrefetchAsync_v2(
-              device_ptr[next_buffer_pos_],
-              actual_batch_size_[next_buffer_pos_] * host_view_.extent(1) * sizeof(T),
-              location_,
-              0,
-              prefetch_stream_));
-#endif
-          } else {
-            // managed API call to cuMemDiscardAndPrefetchBatchAsync (discard and prefetch batch)
-#if CUDA_VERSION >= 13000
-            CUdeviceptr dptrs[] = {device_ptr[next_buffer_pos_]};
-            size_t sizes[]      = {actual_batch_size_[next_buffer_pos_] * host_view_.extent(1) *
-                                   sizeof(T)};
-            size_t prefetchLocIdxs[] = {0};
-            RAFT_CUDA_TRY(cuMemDiscardAndPrefetchBatchAsync(
-              dptrs, sizes, 1, &location_, prefetchLocIdxs, 1, 0, prefetch_stream_));
-#endif
-          }
-
+          // prefetch next position
+          device_ptr[prefetch_pos] = host_view_.data_handle() + offset_ * host_view_.extent(1);
+          prefetch_managed_region(
+            device_ptr[prefetch_pos],
+            actual_batch_size_[prefetch_pos] * host_view_.extent(1) * sizeof(T));
           break;
         case cudaMemoryTypeHost:
         case cudaMemoryTypeUnregistered:
-          if (host_writeback_ && batch_id_ > 1) {
-            writeback_stream_.synchronize();
+          if (host_writeback_ && batch_id_ > 0) {
             // copy back last active
-            uint32_t writeback_pos    = (next_buffer_pos_ + num_buffers_ - 2) % num_buffers_;
-            uint64_t writeback_offset = (offset_ - 2 * batch_size_) * host_view_.extent(1);
-            raft::copy(host_view_.data_handle() + writeback_offset,
-                       device_ptr[writeback_pos],
-                       actual_batch_size_[writeback_pos] * host_view_.extent(1),
-                       writeback_stream_);
+            uint32_t writeback_pos    = (batch_id_ - 1) % num_buffers_;
+            uint64_t writeback_offset = (batch_id_ - 1) * batch_size_;
+            writeback_from_device_to_host(device_ptr[writeback_pos], writeback_offset, batch_size_);
           }
           if (initialize_) {
             // prefetch next position
-            raft::copy(device_ptr[next_buffer_pos_],
-                       host_view_.data_handle() + offset_ * host_view_.extent(1),
-                       actual_batch_size_[next_buffer_pos_] * host_view_.extent(1),
-                       prefetch_stream_);
+            prefetch_from_host_to_device(
+              device_ptr[prefetch_pos], offset_, actual_batch_size_[prefetch_pos]);
           }
 
           break;
         case cudaMemoryTypeDevice:
           // just move pointer to next position
-          device_ptr[next_buffer_pos_] = host_view_.data_handle() + offset_ * host_view_.extent(1);
+          device_ptr[prefetch_pos] = host_view_.data_handle() + offset_ * host_view_.extent(1);
           break;
       }
 
-      offset_ += actual_batch_size_[next_buffer_pos_];
-      // swap next_buffer_pos_
-      next_buffer_pos_ = (next_buffer_pos_ + 1) % num_buffers_;
+      offset_ += actual_batch_size_[prefetch_pos];
     }
 
     return next_batch_exists;
@@ -525,33 +505,36 @@ class batched_device_view_from_host {
     // if data is on host and for_write --> make sure to copy back last active
     // if data is managed and evict --> evict last active
 
-    // make sure to sync on prefetch & writeback stream & res
+    // make sure to sync on prefetch stream & res
     switch (mem_type_) {
       case cudaMemoryTypeManaged:
-#if CUDA_VERSION >= 13000
-        if (evict_ && batch_id_ > 0) {
-          // managed API call to evict 2
-          uint32_t evict_pos       = (next_buffer_pos_ + num_buffers_ - 1) % num_buffers_;
-          CUdeviceptr dptrs[]      = {device_ptr[evict_pos]};
-          size_t sizes[]           = {batch_size_ * host_view_.extent(1) * sizeof(T)};
-          size_t prefetchLocIdxs[] = {0};
-          RAFT_CUDA_TRY(cuMemDiscardBatchAsync(
-            dptrs, sizes, 1, &location_, prefetchLocIdxs, 1, 0, prefetch_stream_));
+        if (!host_writeback_) {
+          uint32_t discard_pos     = batch_id_ % num_buffers_;
+          size_t discard_size_rows = actual_batch_size_[discard_pos];
+          if (batch_id_ > 0) {
+            discard_pos = (batch_id_ - 1) % num_buffers_;
+            discard_size_rows += batch_size_;
+          }
+          discard_managed_region(device_ptr[discard_pos],
+                                 discard_size_rows * host_view_.extent(1) * sizeof(T));
         }
-        prefetch_stream_.synchronize();
-#endif
+        writeback_stream_.synchronize();
         break;
       case cudaMemoryTypeHost:
       case cudaMemoryTypeUnregistered:
-        if (host_writeback_ && batch_id_ > 0) {
-          // TODO managed API call to copy back last active
-          uint32_t writeback_pos = (next_buffer_pos_ + num_buffers_ - 1) % num_buffers_;
-          uint64_t writeback_offset =
-            (offset_ - actual_batch_size_[writeback_pos]) * host_view_.extent(1);
-          raft::copy(host_view_.data_handle() + writeback_offset,
-                     device_ptr[writeback_pos],
-                     actual_batch_size_[writeback_pos] * host_view_.extent(1),
-                     writeback_stream_);
+        if (host_writeback_) {
+          uint32_t writeback_pos_last = batch_id_ % num_buffers_;
+          if (batch_id_ > 0) {
+            uint32_t writeback_pos    = (batch_id_ - 1) % num_buffers_;
+            uint64_t writeback_offset = (batch_id_ - 1) * batch_size_;
+            writeback_from_device_to_host(device_ptr[writeback_pos], writeback_offset, batch_size_);
+          }
+          {
+            uint64_t writeback_offset_last = batch_id_ * batch_size_;
+            writeback_from_device_to_host(device_ptr[writeback_pos_last],
+                                          writeback_offset_last,
+                                          actual_batch_size_[writeback_pos_last]);
+          }
         }
         writeback_stream_.synchronize();
         break;
@@ -569,39 +552,103 @@ class batched_device_view_from_host {
    */
   raft::device_matrix_view<T, IdxT> next_view()
   {
-    RAFT_EXPECTS(batch_id_ * batch_size_ < host_view_.extent(0), "Batch index out of bounds");
-
-    // ensure current batch is ready
-    prefetch_stream_.synchronize();
+    // special case for empty host view
+    if (host_view_.extent(0) == 0) {
+      return raft::make_device_matrix_view<T, IdxT>(nullptr, 0, host_view_.extent(1));
+    }
 
     // trigger prefetch of next batch
     bool next_batch_exists = prefetch_next_batch();
 
-    batch_id_++;
+    RAFT_EXPECTS(batch_id_ * batch_size_ < host_view_.extent(0), "Batch index out of bounds");
 
-    uint32_t current_pos =
-      (next_buffer_pos_ + num_buffers_ - (next_batch_exists ? 2 : 1)) % num_buffers_;
+    uint32_t current_pos = batch_id_ % num_buffers_;
     return raft::make_device_matrix_view<T, IdxT>(
       device_ptr[current_pos], actual_batch_size_[current_pos], host_view_.extent(1));
   }
 
  private:
-  cudaMemoryType mem_type_;
-  const raft::resources& res_;
-  uint64_t batch_size_;
-  uint64_t offset_;
-  uint64_t num_buffers_;
-  bool initialize_;
+  void advise_read_mostly(T* ptr, size_t size)
+  {
+#if CUDA_VERSION >= 13000
+    RAFT_CUDA_TRY(cudaMemAdvise(ptr, size, cudaMemAdviseSetReadMostly, location_));
+#else
+    RAFT_CUDA_TRY(cudaMemAdvise_v2(ptr, size, cudaMemAdviseSetReadMostly, location_));
+#endif
+  }
+
+  void discard_managed_region(T* dev_ptr, size_t size)
+  {
+#if CUDA_VERSION >= 13000
+    void* dptrs[1]  = {dev_ptr};
+    size_t sizes[1] = {size};
+    RAFT_CUDA_TRY(cudaMemDiscardBatchAsync(dptrs, sizes, 1, 0, writeback_stream_));
+#endif
+    // FIXME: CUDA12 does not support discard
+  }
+
+  void prefetch_managed_region(T* dev_ptr, size_t size)
+  {
+#if CUDA_VERSION >= 13000
+    if (initialize_) {
+      RAFT_CUDA_TRY(cudaMemPrefetchAsync(dev_ptr, size, location_, 0, prefetch_stream_));
+    } else {
+      void* dptrs[1]  = {dev_ptr};
+      size_t sizes[1] = {size};
+      RAFT_CUDA_TRY(
+        cudaMemDiscardAndPrefetchBatchAsync(dptrs, sizes, 1, location_, 0, prefetch_stream_));
+    }
+#else
+    // FIXME: CUDA12 does not support discard - so we just prefetch
+    if (initialize_) {
+      RAFT_CUDA_TRY(cudaMemPrefetchAsync_v2(dev_ptr, size, location_, 0, prefetch_stream_));
+    } else {
+      RAFT_CUDA_TRY(cudaMemPrefetchAsync_v2(dev_ptr, size, location_, 0, prefetch_stream_));
+    }
+#endif
+  }
+
+  void prefetch_from_host_to_device(T* dev_ptr, size_t src_row_offset, size_t num_rows)
+  {
+    raft::copy(dev_ptr,
+               host_view_.data_handle() + src_row_offset * host_view_.extent(1),
+               num_rows * host_view_.extent(1),
+               prefetch_stream_);
+  }
+
+  void writeback_from_device_to_host(T* dev_ptr, size_t dst_row_offset, size_t num_rows)
+  {
+    raft::copy(host_view_.data_handle() + dst_row_offset * host_view_.extent(1),
+               dev_ptr,
+               num_rows * host_view_.extent(1),
+               writeback_stream_);
+  }
+
+  // stream pool for local streams
+  std::optional<std::shared_ptr<rmm::cuda_stream_pool>> local_stream_pool_;
   rmm::cuda_stream_view prefetch_stream_;
   rmm::cuda_stream_view writeback_stream_;
-  bool read_only_;
-  bool host_writeback_;
-  bool evict_;
-  int32_t next_buffer_pos_;
+
+  // configuration
+  const raft::resources& res_;
+  bool initialize_;      // initialize the data on the device
+  bool host_writeback_;  // write back the data to the host
+  bool hmm_as_managed_;  // treat unregistered memory as managed memory
+
+  // batch position information
+  uint64_t batch_size_;
   int32_t batch_id_;
+  uint64_t offset_;
+
   cudaMemLocation location_;
-  std::optional<raft::device_matrix<T, IdxT>> device_mem_[3];
+
+  // input pointer information
+  cudaMemoryType mem_type_;
   raft::host_matrix_view<T, IdxT> host_view_;
+
+  // internal device buffers
+  uint64_t num_buffers_;
+  std::optional<raft::device_matrix<T, IdxT>> device_mem_[3];
   T* device_ptr[3];
   uint32_t actual_batch_size_[3];
 };

From 89b0d1c25bbff782cf906be7d9b2dc58a5927116 Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Mon, 9 Mar 2026 21:57:25 +0000
Subject: [PATCH 077/119] implement fallback / simplify strategy

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh |  18 +--
 cpp/src/neighbors/detail/cagra/utils.hpp      | 110 ++++++++++--------
 2 files changed, 66 insertions(+), 62 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index ef8b1f8daf..a6e4c08350 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -794,8 +794,7 @@ void merge_graph_gpu(raft::resources const& res,
     raft::make_host_matrix_view<IdxT, int64_t>(output_graph_ptr, graph_size, output_graph_degree),
     /*batch_size*/ batch_size,
     /*host_writeback*/ true,
-    /*initialize*/ true,
-    /*hmm_as_managed*/ false);
+    /*initialize*/ true);
 
   batched_device_view_from_host<IdxT, int64_t> d_mst_graph(
     res,
@@ -803,8 +802,7 @@ void merge_graph_gpu(raft::resources const& res,
       mst_graph_ptr, guarantee_connectivity ? graph_size : 0, output_graph_degree),
     /*batch_size*/ batch_size,
     /*host_writeback*/ false,
-    /*initialize*/ true,
-    /*hmm_as_managed*/ false);
+    /*initialize*/ true);
 
   batched_device_view_from_host<IdxT, int64_t> d_mst_graph_num_edges(
     res,
@@ -812,8 +810,7 @@ void merge_graph_gpu(raft::resources const& res,
       mst_graph_ptr, guarantee_connectivity ? graph_size : 0, output_graph_degree),
     /*batch_size*/ batch_size,
     /*host_writeback*/ false,
-    /*initialize*/ true,
-    /*hmm_as_managed*/ false);
+    /*initialize*/ true);
 
   const uint32_t num_warps = 4;
   const dim3 threads_merge(raft::WarpSize * num_warps, 1, 1);
@@ -887,8 +884,7 @@ void make_reverse_graph_gpu(raft::resources const& res,
     raft::make_host_matrix_view<IdxT, int64_t>(output_graph_ptr, graph_size, output_graph_degree),
     /*batch_size*/ batch_size,
     /*host_writeback*/ false,
-    /*initialize*/ true,
-    /*hmm_as_managed*/ false);
+    /*initialize*/ true);
 
   for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
     dim3 threads(256, 1, 1);
@@ -1544,8 +1540,7 @@ void prune_graph_gpu(raft::resources const& res,
     raft::make_host_matrix_view<IdxT, int64_t>(knn_graph_ptr, graph_size, knn_graph_degree),
     /*batch_size*/ graph_size,
     /*host_writeback*/ false,
-    /*initialize*/ true,
-    /*hmm_as_managed*/ true);
+    /*initialize*/ true);
   auto input_view = d_input_graph.next_view();
 
   batched_device_view_from_host<IdxT, int64_t> d_output_graph(
@@ -1553,8 +1548,7 @@ void prune_graph_gpu(raft::resources const& res,
     raft::make_host_matrix_view<IdxT, int64_t>(output_graph_ptr, graph_size, output_graph_degree),
     /*batch_size*/ batch_size,
     /*host_writeback*/ true,
-    /*initialize*/ false,
-    /*hmm_as_managed*/ false);
+    /*initialize*/ false);
 
   auto d_invalid_neighbor_list = raft::make_device_scalar<uint32_t>(res, 0u);
 
diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp
index df6ef1ce6f..8f6cfb063f 100644
--- a/cpp/src/neighbors/detail/cagra/utils.hpp
+++ b/cpp/src/neighbors/detail/cagra/utils.hpp
@@ -327,22 +327,25 @@ void copy_with_padding(
  * @param res The resources
  * @param host_view The host view to create the batched device view from
  * @param batch_size The batch size
- * @param read_only Whether the data is read only (only for managed memory)
  * @param host_writeback Whether to write back the data to the host (only for host memory)
  * @param initialize Whether to initialize the data (only for managed memory)
- * @param discard Whether to discard the data (only for managed memory)
  *
  * @return The batched device view
  */
 template <typename T, typename IdxT>
 class batched_device_view_from_host {
  public:
+  enum class memory_strategy {
+    device_only,   // data is on device only (no copy needed)
+    copy_device,   // data is explicitly moved to/from device buffers
+    managed_only,  // data is on managed memory (system managed)
+  };
+
   batched_device_view_from_host(raft::resources const& res,
                                 raft::host_matrix_view<T, IdxT> host_view,
                                 uint64_t batch_size,
                                 bool host_writeback = false,
-                                bool initialize     = true,
-                                bool hmm_as_managed = false)
+                                bool initialize     = true)
     : res_(res),
       host_view_(host_view),
       batch_size_(batch_size),
@@ -350,29 +353,23 @@ class batched_device_view_from_host {
       batch_id_(-2),
       num_buffers_(2),
       host_writeback_(host_writeback),
-      initialize_(initialize),
-      hmm_as_managed_(hmm_as_managed)
+      initialize_(initialize)
   {
     if (host_view.extent(0) == 0) {
-      mem_type_ = cudaMemoryTypeDevice;
+      mem_strategy_ = memory_strategy::device_only;
       return;
     }
 
     cudaPointerAttributes attr;
     RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle()));
-    mem_type_ = attr.type;
-    // cudaMemoryTypeUnregistered = 0
-    // cudaMemoryTypeHost = 1
-    // cudaMemoryTypeDevice = 2
-    // cudaMemoryTypeManaged = 3
-    //
-    // On HMM systems, unregistered (malloc) memory can have devicePointer != nullptr,
-    // meaning it's directly accessible from the GPU. Treat it like managed memory:
-    if (mem_type_ == cudaMemoryTypeUnregistered && attr.devicePointer != nullptr &&
-        hmm_as_managed) {
-      mem_type_ = cudaMemoryTypeManaged;
+    switch (attr.type) {
+      case cudaMemoryTypeUnregistered:
+      case cudaMemoryTypeHost:
+      case cudaMemoryTypeManaged: mem_strategy_ = memory_strategy::copy_device; break;
+      case cudaMemoryTypeDevice: mem_strategy_ = memory_strategy::device_only; break;
     }
 
+    // setup streams
     if (res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL) &&
         raft::resource::get_stream_pool_size(res) >= 1) {
       prefetch_stream_  = raft::resource::get_stream_from_stream_pool(res);
@@ -383,32 +380,48 @@ class batched_device_view_from_host {
       writeback_stream_  = local_stream_pool_.value()->get_stream();
     }
 
-    // allocations
-    if (mem_type_ == cudaMemoryTypeHost || mem_type_ == cudaMemoryTypeUnregistered) {
-      device_mem_[0].emplace(raft::make_device_mdarray<T, IdxT>(
-        res,
-        raft::resource::get_workspace_resource(res),
-        raft::make_extents<int64_t>(batch_size, host_view.extent(1))));
-      device_ptr[0] = device_mem_[0]->data_handle();
-      if (batch_size < static_cast<uint64_t>(host_view.extent(0))) {
-        device_mem_[1].emplace(raft::make_device_mdarray<T, IdxT>(
-          res,
-          raft::resource::get_workspace_resource(res),
-          raft::make_extents<int64_t>(batch_size, host_view.extent(1))));
-        device_ptr[1] = device_mem_[1]->data_handle();
-      }
-      if (host_writeback_ && batch_size * 2 < static_cast<uint64_t>(host_view.extent(0))) {
-        num_buffers_ = 3;
-        device_mem_[2].emplace(raft::make_device_mdarray<T, IdxT>(
+    // buffer allocations
+    if (mem_strategy_ == memory_strategy::copy_device) {
+      try {
+        device_mem_[0].emplace(raft::make_device_mdarray<T, IdxT>(
           res,
           raft::resource::get_workspace_resource(res),
           raft::make_extents<int64_t>(batch_size, host_view.extent(1))));
-        device_ptr[2] = device_mem_[2]->data_handle();
+        device_ptr[0] = device_mem_[0]->data_handle();
+        if (batch_size < static_cast<uint64_t>(host_view.extent(0))) {
+          device_mem_[1].emplace(raft::make_device_mdarray<T, IdxT>(
+            res,
+            raft::resource::get_workspace_resource(res),
+            raft::make_extents<int64_t>(batch_size, host_view.extent(1))));
+          device_ptr[1] = device_mem_[1]->data_handle();
+        }
+        if (host_writeback_ && batch_size * 2 < static_cast<uint64_t>(host_view.extent(0))) {
+          num_buffers_ = 3;
+          device_mem_[2].emplace(raft::make_device_mdarray<T, IdxT>(
+            res,
+            raft::resource::get_workspace_resource(res),
+            raft::make_extents<int64_t>(batch_size, host_view.extent(1))));
+          device_ptr[2] = device_mem_[2]->data_handle();
+        }
+      } catch (std::bad_alloc& e) {
+        RAFT_LOG_DEBUG("Insufficient memory for device buffers");
+        if (attr.devicePointer != nullptr) {
+          mem_strategy_ = memory_strategy::managed_only;
+        } else {
+          throw std::bad_alloc();
+        }
+      } catch (raft::logic_error& e) {
+        RAFT_LOG_DEBUG("Insufficient memory for device buffers (logic error)");
+        if (attr.devicePointer != nullptr) {
+          mem_strategy_ = memory_strategy::managed_only;
+        } else {
+          throw raft::logic_error("Insufficient memory for device buffers (logic error)");
+        }
       }
     }
 
     // if data is managed and not for_write_ we can set the attribute on the device ptr
-    if (mem_type_ == cudaMemoryTypeManaged) {
+    if (mem_strategy_ == memory_strategy::managed_only) {
       // location_.type = CU_MEM_LOCATION_TYPE_DEVICE;
       location_.type = cudaMemLocationTypeDevice;
       location_.id   = static_cast<CUdevice>(raft::resource::get_device_id(res_));
@@ -428,7 +441,7 @@ class batched_device_view_from_host {
     batch_id_++;
 
     // ensure previous batch at position batch_id_ is ready
-    prefetch_stream_.synchronize();
+    if (initialize_) { prefetch_stream_.synchronize(); }
     if (host_writeback_) { writeback_stream_.synchronize(); }
 
     // this step will
@@ -456,8 +469,8 @@ class batched_device_view_from_host {
       int32_t prefetch_pos             = (batch_id_ + 1) % num_buffers_;
       actual_batch_size_[prefetch_pos] = min(batch_size_, host_view_.extent(0) - offset_);
 
-      switch (mem_type_) {
-        case cudaMemoryTypeManaged:
+      switch (mem_strategy_) {
+        case memory_strategy::managed_only:
           if (!host_writeback_ && batch_id_ > 1) {
             uint32_t discard_pos = (batch_id_ - 1) % num_buffers_;
             size_t discard_size  = batch_size_ * host_view_.extent(1) * sizeof(T);
@@ -469,8 +482,7 @@ class batched_device_view_from_host {
             device_ptr[prefetch_pos],
             actual_batch_size_[prefetch_pos] * host_view_.extent(1) * sizeof(T));
           break;
-        case cudaMemoryTypeHost:
-        case cudaMemoryTypeUnregistered:
+        case memory_strategy::copy_device:
           if (host_writeback_ && batch_id_ > 0) {
             // copy back last active
             uint32_t writeback_pos    = (batch_id_ - 1) % num_buffers_;
@@ -484,7 +496,7 @@ class batched_device_view_from_host {
           }
 
           break;
-        case cudaMemoryTypeDevice:
+        case memory_strategy::device_only:
           // just move pointer to next position
           device_ptr[prefetch_pos] = host_view_.data_handle() + offset_ * host_view_.extent(1);
           break;
@@ -506,8 +518,8 @@ class batched_device_view_from_host {
     // if data is managed and evict --> evict last active
 
     // make sure to sync on prefetch stream & res
-    switch (mem_type_) {
-      case cudaMemoryTypeManaged:
+    switch (mem_strategy_) {
+      case memory_strategy::managed_only:
         if (!host_writeback_) {
           uint32_t discard_pos     = batch_id_ % num_buffers_;
           size_t discard_size_rows = actual_batch_size_[discard_pos];
@@ -520,8 +532,7 @@ class batched_device_view_from_host {
         }
         writeback_stream_.synchronize();
         break;
-      case cudaMemoryTypeHost:
-      case cudaMemoryTypeUnregistered:
+      case memory_strategy::copy_device:
         if (host_writeback_) {
           uint32_t writeback_pos_last = batch_id_ % num_buffers_;
           if (batch_id_ > 0) {
@@ -538,7 +549,7 @@ class batched_device_view_from_host {
         }
         writeback_stream_.synchronize();
         break;
-      case cudaMemoryTypeDevice: break;
+      case memory_strategy::device_only: break;
     }
   }
 
@@ -630,10 +641,10 @@ class batched_device_view_from_host {
   rmm::cuda_stream_view writeback_stream_;
 
   // configuration
+  memory_strategy mem_strategy_;
   const raft::resources& res_;
   bool initialize_;      // initialize the data on the device
   bool host_writeback_;  // write back the data to the host
-  bool hmm_as_managed_;  // treat unregistered memory as managed memory
 
   // batch position information
   uint64_t batch_size_;
@@ -643,7 +654,6 @@ class batched_device_view_from_host {
   cudaMemLocation location_;
 
   // input pointer information
-  cudaMemoryType mem_type_;
   raft::host_matrix_view<T, IdxT> host_view_;
 
   // internal device buffers

From d0e3daefdfc7fcdec3ceaaa62a8d95134a726f15 Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Tue, 10 Mar 2026 17:31:23 +0000
Subject: [PATCH 078/119] add logging / remove stats compute

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 51 +++------------
 cpp/src/neighbors/detail/cagra/utils.hpp      | 62 ++++++++++++-------
 2 files changed, 46 insertions(+), 67 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index a6e4c08350..b5e055820d 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -648,44 +648,6 @@ __global__ void kern_mst_opt_postprocessing(IdxT* outgoing_num_edges,  // [graph
   }
 }
 
-template <class T>
-uint64_t pos_in_array(T val, const T* array, uint64_t num)
-{
-  for (uint64_t i = 0; i < num; i++) {
-    if (val == array[i]) { return i; }
-  }
-  return num;
-}
-
-template <class T>
-void shift_array(T* array, uint64_t num)
-{
-  for (uint64_t i = num; i > 0; i--) {
-    array[i] = array[i - 1];
-  }
-}
-
-template <typename IdxT>
-void log_replaced_edges_stats(const IdxT* output_graph_ptr,
-                              uint64_t graph_size,
-                              uint64_t output_graph_degree)
-{
-  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-    "cagra::graph::optimize/stats");
-  uint64_t num_replaced_edges = 0;
-#pragma omp parallel for reduction(+ : num_replaced_edges)
-  for (uint64_t i = 0; i < graph_size; i++) {
-    for (uint64_t k = 0; k < output_graph_degree; k++) {
-      const uint64_t j = output_graph_ptr[k + (output_graph_degree * i)];
-      const uint64_t pos =
-        pos_in_array<IdxT>(j, output_graph_ptr + (output_graph_degree * i), output_graph_degree);
-      if (pos == output_graph_degree) { num_replaced_edges += 1; }
-    }
-  }
-  RAFT_LOG_DEBUG("# Average number of replaced edges per node: %.2f",
-                 (double)num_replaced_edges / graph_size);
-}
-
 template <typename IdxT>
 void log_incoming_edges_histogram(const IdxT* output_graph_ptr,
                                   uint64_t graph_size,
@@ -755,7 +717,10 @@ void check_duplicates_and_out_of_range(const IdxT* output_graph_ptr,
 
       for (uint32_t k = j + 1; k < output_graph_degree; k++) {
         const auto neighbor_b = my_out_graph[k];
-        if (neighbor_a == neighbor_b) { num_dup++; }
+        if (neighbor_a == neighbor_b) {
+          num_dup++;
+          break;
+        }
       }
     }
   }
@@ -1606,10 +1571,10 @@ void prune_graph_gpu(raft::resources const& res,
 }
 
 // TODO allow pinned input for both knn_graph and new_graph
-template <typename IdxT = uint32_t, typename InOutMatrixView>
+template <typename IdxT = uint32_t, typename InputMatrixView, typename OutputMatrixView>
 void optimize(raft::resources const& res,
-              InOutMatrixView knn_graph,
-              InOutMatrixView new_graph,
+              InputMatrixView knn_graph,
+              OutputMatrixView new_graph,
               const bool guarantee_connectivity = true,
               const bool use_gpu                = true)
 {
@@ -1707,8 +1672,6 @@ void optimize(raft::resources const& res,
 
   if (is_ptr_host_accessible(new_graph.data_handle())) {
     // following checks require host access
-    log_replaced_edges_stats<IdxT>(new_graph.data_handle(), graph_size, output_graph_degree);
-
     log_incoming_edges_histogram<IdxT>(new_graph.data_handle(), graph_size, output_graph_degree);
 
     check_duplicates_and_out_of_range<IdxT>(
diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp
index 8f6cfb063f..75883a9636 100644
--- a/cpp/src/neighbors/detail/cagra/utils.hpp
+++ b/cpp/src/neighbors/detail/cagra/utils.hpp
@@ -360,25 +360,18 @@ class batched_device_view_from_host {
       return;
     }
 
-    cudaPointerAttributes attr;
-    RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle()));
-    switch (attr.type) {
+    RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr_, host_view.data_handle()));
+    switch (attr_.type) {
       case cudaMemoryTypeUnregistered:
       case cudaMemoryTypeHost:
       case cudaMemoryTypeManaged: mem_strategy_ = memory_strategy::copy_device; break;
       case cudaMemoryTypeDevice: mem_strategy_ = memory_strategy::device_only; break;
     }
 
-    // setup streams
-    if (res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL) &&
-        raft::resource::get_stream_pool_size(res) >= 1) {
-      prefetch_stream_  = raft::resource::get_stream_from_stream_pool(res);
-      writeback_stream_ = raft::resource::get_stream_from_stream_pool(res);
-    } else {
-      local_stream_pool_ = std::make_shared<rmm::cuda_stream_pool>(2);
-      prefetch_stream_   = local_stream_pool_.value()->get_stream();
-      writeback_stream_  = local_stream_pool_.value()->get_stream();
-    }
+    RAFT_LOG_DEBUG("Memory strategy: %d for type %d, size %zu",
+                   static_cast<int>(mem_strategy_),
+                   static_cast<int>(attr_.type),
+                   host_view.extent(0) * host_view.extent(1) * sizeof(T));
 
     // buffer allocations
     if (mem_strategy_ == memory_strategy::copy_device) {
@@ -405,14 +398,14 @@ class batched_device_view_from_host {
         }
       } catch (std::bad_alloc& e) {
         RAFT_LOG_DEBUG("Insufficient memory for device buffers");
-        if (attr.devicePointer != nullptr) {
+        if (attr_.devicePointer != nullptr) {
           mem_strategy_ = memory_strategy::managed_only;
         } else {
           throw std::bad_alloc();
         }
       } catch (raft::logic_error& e) {
         RAFT_LOG_DEBUG("Insufficient memory for device buffers (logic error)");
-        if (attr.devicePointer != nullptr) {
+        if (attr_.devicePointer != nullptr) {
           mem_strategy_ = memory_strategy::managed_only;
         } else {
           throw raft::logic_error("Insufficient memory for device buffers (logic error)");
@@ -420,6 +413,17 @@ class batched_device_view_from_host {
       }
     }
 
+    // setup streams
+    if (res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL) &&
+        raft::resource::get_stream_pool_size(res) >= 1) {
+      prefetch_stream_  = raft::resource::get_stream_from_stream_pool(res);
+      writeback_stream_ = raft::resource::get_stream_from_stream_pool(res);
+    } else {
+      local_stream_pool_ = std::make_shared<rmm::cuda_stream_pool>(2);
+      prefetch_stream_   = local_stream_pool_.value()->get_stream();
+      writeback_stream_  = local_stream_pool_.value()->get_stream();
+    }
+
     // if data is managed and not for_write_ we can set the attribute on the device ptr
     if (mem_strategy_ == memory_strategy::managed_only) {
       // location_.type = CU_MEM_LOCATION_TYPE_DEVICE;
@@ -621,18 +625,29 @@ class batched_device_view_from_host {
 
   void prefetch_from_host_to_device(T* dev_ptr, size_t src_row_offset, size_t num_rows)
   {
-    raft::copy(dev_ptr,
-               host_view_.data_handle() + src_row_offset * host_view_.extent(1),
-               num_rows * host_view_.extent(1),
-               prefetch_stream_);
+    const size_t n_elem  = num_rows * host_view_.extent(1);
+    const size_t n_bytes = n_elem * sizeof(T);
+    RAFT_CUDA_TRY(cudaHostRegister(host_view_.data_handle() + src_row_offset * host_view_.extent(1),
+                                   n_bytes,
+                                   cudaHostRegisterDefault));
+    // use memcpy instead of raft::copy to avoid strange behavior with HMM/ATS memory
+    RAFT_CUDA_TRY(cudaMemcpyAsync(dev_ptr,
+                                  host_view_.data_handle() + src_row_offset * host_view_.extent(1),
+                                  n_bytes,
+                                  cudaMemcpyHostToDevice,
+                                  prefetch_stream_));
   }
 
   void writeback_from_device_to_host(T* dev_ptr, size_t dst_row_offset, size_t num_rows)
   {
-    raft::copy(host_view_.data_handle() + dst_row_offset * host_view_.extent(1),
-               dev_ptr,
-               num_rows * host_view_.extent(1),
-               writeback_stream_);
+    const size_t n_elem  = num_rows * host_view_.extent(1);
+    const size_t n_bytes = n_elem * sizeof(T);
+    // use memcpy instead of raft::copy to avoid strange behavior with HMM/ATS memory
+    RAFT_CUDA_TRY(cudaMemcpyAsync(host_view_.data_handle() + dst_row_offset * host_view_.extent(1),
+                                  dev_ptr,
+                                  n_bytes,
+                                  cudaMemcpyDeviceToHost,
+                                  writeback_stream_));
   }
 
   // stream pool for local streams
@@ -655,6 +670,7 @@ class batched_device_view_from_host {
 
   // input pointer information
   raft::host_matrix_view<T, IdxT> host_view_;
+  cudaPointerAttributes attr_;
 
   // internal device buffers
   uint64_t num_buffers_;

From ec45fd251d90cd8713c58252d8258ebee3b700a8 Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Tue, 10 Mar 2026 22:46:18 +0000
Subject: [PATCH 079/119] add test, persist stream pool, cleanup

---
 cpp/src/neighbors/detail/cagra/utils.hpp      | 214 ++++++++++--------
 cpp/tests/CMakeLists.txt                      |   1 +
 .../test_batched_device_view_from_host.cu     | 205 +++++++++++++++++
 3 files changed, 326 insertions(+), 94 deletions(-)
 create mode 100644 cpp/tests/neighbors/ann_cagra/test_batched_device_view_from_host.cu

diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp
index 75883a9636..44d87d2993 100644
--- a/cpp/src/neighbors/detail/cagra/utils.hpp
+++ b/cpp/src/neighbors/detail/cagra/utils.hpp
@@ -322,15 +322,22 @@ void copy_with_padding(
  * writeback of the data Each batch can be referenced exactlyonce by calling the next_view()
  * function
  *
+ * Usage:
+ * ```
+ * batched_device_view_from_host<float, int32_t> view(res, host_view, batch_size);
+ * for (uint64_t i = 0; i < num_batches; i++) {  // num_batches = ceil(n_rows / batch_size)
+ *   auto device_view = view.next_view();  // call exactly once per batch, then use device_view
+ * }
+ * ```
+ *
+ * The call to next_view() will
+ * * synchronize on all previous operations and increment batch_id_
+ * * (optionally) write back the data of the previous batch to the host
+ * * (optionally) prefetch the data of the next batch
+ * * return the view of the current batch
+ *
  * @tparam T The type of the data
  * @tparam IdxT The type of the index
- * @param res The resources
- * @param host_view The host view to create the batched device view from
- * @param batch_size The batch size
- * @param host_writeback Whether to write back the data to the host (only for host memory)
- * @param initialize Whether to initialize the data (only for managed memory)
- *
- * @return The batched device view
  */
 template <typename T, typename IdxT>
 class batched_device_view_from_host {
@@ -341,6 +348,18 @@ class batched_device_view_from_host {
     managed_only,  // data is on managed memory (system managed)
   };
 
+  /**
+   * Creates a batched device view from a host view and handles the prefetch and
+   * writeback of the data. Each batch can be referenced exactly once by calling the next_view()
+   * method.
+   *
+   * @param res The resources to use
+   * @param host_view The host view to create the batched device view from
+   * @param batch_size The batch size
+   * @param host_writeback Whether to write back the data to the host (only for host memory)
+   * (default: false)
+   * @param initialize Whether to initialize the data (only for managed memory) (default: true)
+   */
   batched_device_view_from_host(raft::resources const& res,
                                 raft::host_matrix_view<T, IdxT> host_view,
                                 uint64_t batch_size,
@@ -360,6 +379,9 @@ class batched_device_view_from_host {
       return;
     }
 
+    RAFT_EXPECTS(host_writeback_ || initialize_,
+                 "At least one of host_writeback or initialize must be true");
+
     RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr_, host_view.data_handle()));
     switch (attr_.type) {
       case cudaMemoryTypeUnregistered:
@@ -388,7 +410,8 @@ class batched_device_view_from_host {
             raft::make_extents<int64_t>(batch_size, host_view.extent(1))));
           device_ptr[1] = device_mem_[1]->data_handle();
         }
-        if (host_writeback_ && batch_size * 2 < static_cast<uint64_t>(host_view.extent(0))) {
+        if (host_writeback_ && initialize_ &&
+            batch_size * 2 < static_cast<uint64_t>(host_view.extent(0))) {
           num_buffers_ = 3;
           device_mem_[2].emplace(raft::make_device_mdarray<T, IdxT>(
             res,
@@ -397,15 +420,16 @@ class batched_device_view_from_host {
           device_ptr[2] = device_mem_[2]->data_handle();
         }
       } catch (std::bad_alloc& e) {
-        RAFT_LOG_DEBUG("Insufficient memory for device buffers");
         if (attr_.devicePointer != nullptr) {
+          RAFT_LOG_DEBUG("Insufficient memory for device buffers, switching to managed memory");
           mem_strategy_ = memory_strategy::managed_only;
         } else {
           throw std::bad_alloc();
         }
       } catch (raft::logic_error& e) {
-        RAFT_LOG_DEBUG("Insufficient memory for device buffers (logic error)");
         if (attr_.devicePointer != nullptr) {
+          RAFT_LOG_DEBUG(
+            "Insufficient memory for device buffers (logic error), switching to managed memory");
           mem_strategy_ = memory_strategy::managed_only;
         } else {
           throw raft::logic_error("Insufficient memory for device buffers (logic error)");
@@ -413,20 +437,18 @@ class batched_device_view_from_host {
       }
     }
 
-    // setup streams
-    if (res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL) &&
-        raft::resource::get_stream_pool_size(res) >= 1) {
-      prefetch_stream_  = raft::resource::get_stream_from_stream_pool(res);
-      writeback_stream_ = raft::resource::get_stream_from_stream_pool(res);
-    } else {
-      local_stream_pool_ = std::make_shared<rmm::cuda_stream_pool>(2);
-      prefetch_stream_   = local_stream_pool_.value()->get_stream();
-      writeback_stream_  = local_stream_pool_.value()->get_stream();
+    // setup stream pool if not already present
+    size_t required_streams = host_writeback_ && initialize_ ? 2 : 1;
+    if (!res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL) ||
+        raft::resource::get_stream_pool_size(res) < required_streams) {
+      // always create at least 2 streams to account for subsequent iterator calls
+      raft::resource::set_cuda_stream_pool(res, std::make_shared<rmm::cuda_stream_pool>(2));
     }
+    prefetch_stream_  = raft::resource::get_stream_from_stream_pool(res);
+    writeback_stream_ = raft::resource::get_stream_from_stream_pool(res);
 
     // if data is managed and not for_write_ we can set the attribute on the device ptr
     if (mem_strategy_ == memory_strategy::managed_only) {
-      // location_.type = CU_MEM_LOCATION_TYPE_DEVICE;
       location_.type = cudaMemLocationTypeDevice;
       location_.id   = static_cast<CUdevice>(raft::resource::get_device_id(res_));
       if (!host_writeback_) {
@@ -440,6 +462,84 @@ class batched_device_view_from_host {
     prefetch_next_batch();
   }
 
+  ~batched_device_view_from_host() noexcept
+  {
+    raft::resource::sync_stream(res_);
+
+    // if data is on host and for_write --> make sure to copy back last active
+    // if data is managed and evict --> evict last active
+
+    // make sure to sync on prefetch stream & res
+    switch (mem_strategy_) {
+      case memory_strategy::managed_only:
+        if (!host_writeback_) {
+          uint32_t discard_pos     = batch_id_ % num_buffers_;
+          size_t discard_size_rows = actual_batch_size_[discard_pos];
+          if (batch_id_ > 0) {
+            discard_pos = (batch_id_ - 1) % num_buffers_;
+            discard_size_rows += batch_size_;
+          }
+          discard_managed_region(device_ptr[discard_pos],
+                                 discard_size_rows * host_view_.extent(1) * sizeof(T));
+          writeback_stream_.synchronize();
+        }
+        break;
+      case memory_strategy::copy_device:
+        if (host_writeback_) {
+          uint32_t writeback_pos_last = batch_id_ % num_buffers_;
+          if (batch_id_ > 0) {
+            uint32_t writeback_pos    = (batch_id_ - 1) % num_buffers_;
+            uint64_t writeback_offset = (batch_id_ - 1) * batch_size_;
+            writeback_from_device_to_host(device_ptr[writeback_pos], writeback_offset, batch_size_);
+          }
+          {
+            uint64_t writeback_offset_last = batch_id_ * batch_size_;
+            writeback_from_device_to_host(device_ptr[writeback_pos_last],
+                                          writeback_offset_last,
+                                          actual_batch_size_[writeback_pos_last]);
+          }
+          writeback_stream_.synchronize();
+        }
+        break;
+      case memory_strategy::device_only: break;
+    }
+  }
+
+  /**
+   * Returns the next view of the batch
+   *
+   * This function will ensure the next batch is ready and will trigger the prefetch of the
+   * subsequent next batch. If writeback is enabled, the last active batch will be written back to
+   * the host.
+   *
+   * @return The next view of the batch
+   */
+  raft::device_matrix_view<T, IdxT> next_view()
+  {
+    bool end_of_data = static_cast<uint64_t>((batch_id_ + 1) * batch_size_) >=
+                       static_cast<uint64_t>(host_view_.extent(0));
+
+    // special case for empty host view or last batch surpassed
+    if (end_of_data) {
+      return raft::make_device_matrix_view<T, IdxT>(nullptr, 0, host_view_.extent(1));
+    }
+
+    // trigger prefetch of next batch (also increments batch_id_)
+    prefetch_next_batch();
+
+    uint32_t current_pos = batch_id_ % num_buffers_;
+    return raft::make_device_matrix_view<T, IdxT>(
+      device_ptr[current_pos], actual_batch_size_[current_pos], host_view_.extent(1));
+  }
+
+ private:
+  /**
+   * Prefetch the next batch
+   *
+   * This function will prefetch the next batch and will handle the writeback of the data.
+   *
+   * @return True if the next batch exists, false otherwise
+   */
   bool prefetch_next_batch()
   {
     batch_id_++;
@@ -512,77 +612,6 @@ class batched_device_view_from_host {
     return next_batch_exists;
   }
 
-  ~batched_device_view_from_host() noexcept
-  {
-    prefetch_stream_.synchronize();
-    writeback_stream_.synchronize();
-    raft::resource::sync_stream(res_);
-
-    // if data is on host and for_write --> make sure to copy back last active
-    // if data is managed and evict --> evict last active
-
-    // make sure to sync on prefetch stream & res
-    switch (mem_strategy_) {
-      case memory_strategy::managed_only:
-        if (!host_writeback_) {
-          uint32_t discard_pos     = batch_id_ % num_buffers_;
-          size_t discard_size_rows = actual_batch_size_[discard_pos];
-          if (batch_id_ > 0) {
-            discard_pos = (batch_id_ - 1) % num_buffers_;
-            discard_size_rows += batch_size_;
-          }
-          discard_managed_region(device_ptr[discard_pos],
-                                 discard_size_rows * host_view_.extent(1) * sizeof(T));
-        }
-        writeback_stream_.synchronize();
-        break;
-      case memory_strategy::copy_device:
-        if (host_writeback_) {
-          uint32_t writeback_pos_last = batch_id_ % num_buffers_;
-          if (batch_id_ > 0) {
-            uint32_t writeback_pos    = (batch_id_ - 1) % num_buffers_;
-            uint64_t writeback_offset = (batch_id_ - 1) * batch_size_;
-            writeback_from_device_to_host(device_ptr[writeback_pos], writeback_offset, batch_size_);
-          }
-          {
-            uint64_t writeback_offset_last = batch_id_ * batch_size_;
-            writeback_from_device_to_host(device_ptr[writeback_pos_last],
-                                          writeback_offset_last,
-                                          actual_batch_size_[writeback_pos_last]);
-          }
-        }
-        writeback_stream_.synchronize();
-        break;
-      case memory_strategy::device_only: break;
-    }
-  }
-
-  /**
-   * Returns the next view of the batch
-   *
-   * This function will ensure the next batch is ready and will trigger the prefetch of the
-   * subsequent next batch
-   *
-   * @return The next view of the batch
-   */
-  raft::device_matrix_view<T, IdxT> next_view()
-  {
-    // special case for empty host view
-    if (host_view_.extent(0) == 0) {
-      return raft::make_device_matrix_view<T, IdxT>(nullptr, 0, host_view_.extent(1));
-    }
-
-    // trigger prefetch of next batch
-    bool next_batch_exists = prefetch_next_batch();
-
-    RAFT_EXPECTS(batch_id_ * batch_size_ < host_view_.extent(0), "Batch index out of bounds");
-
-    uint32_t current_pos = batch_id_ % num_buffers_;
-    return raft::make_device_matrix_view<T, IdxT>(
-      device_ptr[current_pos], actual_batch_size_[current_pos], host_view_.extent(1));
-  }
-
- private:
   void advise_read_mostly(T* ptr, size_t size)
   {
 #if CUDA_VERSION >= 13000
@@ -627,9 +656,6 @@ class batched_device_view_from_host {
   {
     const size_t n_elem  = num_rows * host_view_.extent(1);
     const size_t n_bytes = n_elem * sizeof(T);
-    RAFT_CUDA_TRY(cudaHostRegister(host_view_.data_handle() + src_row_offset * host_view_.extent(1),
-                                   n_bytes,
-                                   cudaHostRegisterDefault));
     // use memcpy instead of raft::copy to avoid strange behavior with HMM/ATS memory
     RAFT_CUDA_TRY(cudaMemcpyAsync(dev_ptr,
                                   host_view_.data_handle() + src_row_offset * host_view_.extent(1),
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 35794adf9b..77fd18c7d3 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -173,6 +173,7 @@ ConfigureTest(
 ConfigureTest(
   NAME NEIGHBORS_ANN_CAGRA_HELPERS_TEST
   PATH neighbors/ann_cagra/test_optimize_uint32_t.cu
+       neighbors/ann_cagra/test_batched_device_view_from_host.cu
   GPUS 1
   PERCENT 100
 )
diff --git a/cpp/tests/neighbors/ann_cagra/test_batched_device_view_from_host.cu b/cpp/tests/neighbors/ann_cagra/test_batched_device_view_from_host.cu
new file mode 100644
index 0000000000..1e1cc13093
--- /dev/null
+++ b/cpp/tests/neighbors/ann_cagra/test_batched_device_view_from_host.cu
@@ -0,0 +1,205 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <gtest/gtest.h>
+
+#include <cuvs/neighbors/cagra.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/error.hpp>
+#include <raft/core/host_device_accessor.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/host_mdspan.hpp>
+#include <raft/core/managed_mdarray.hpp>
+#include <raft/core/pinned_mdarray.hpp>
+
+#include <raft/core/copy.cuh>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/init.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include "../../../src/neighbors/detail/cagra/utils.hpp"
+
+#include <array>
+#include <cstdint>
+#include <tuple>
+#include <vector>
+
+namespace cuvs::neighbors::cagra {
+
+using IdxT = uint32_t;
+
+struct BatchConfig {
+  bool initialize;
+  bool host_writeback;
+};
+
+struct DimsConfig {
+  int64_t n_rows;
+  int64_t n_cols;
+  uint64_t batch_size;
+};
+
+class BatchedDeviceViewFromHostTest : public ::testing::Test {
+ protected:
+  void SetUp() override { raft::resource::sync_stream(res); }
+
+  /**
+   * Run batched_device_view_from_host over host data, copy device views back,
+   * and verify against the input.
+   */
+  template <typename InputMatrixView>
+  void run_and_verify_batched(InputMatrixView input_view,
+                              uint64_t batch_size,
+                              bool host_writeback,
+                              bool initialize)
+  {
+    int64_t n_rows = input_view.extent(0);
+    int64_t n_cols = input_view.extent(1);
+
+    std::vector<IdxT> readback(n_rows * n_cols);
+
+    int64_t total_processed = 0;
+
+    {
+      cagra::detail::batched_device_view_from_host<IdxT, int64_t> batched(
+        res,
+        raft::make_host_matrix_view<IdxT, int64_t>(input_view.data_handle(), n_rows, n_cols),
+        batch_size,
+        host_writeback,
+        initialize);
+      while (true) {
+        auto dev_view = batched.next_view();
+        if (dev_view.extent(0) == 0) break;
+
+        if (initialize) {
+          raft::copy(readback.data() + total_processed * n_cols,
+                     dev_view.data_handle(),
+                     dev_view.extent(0) * dev_view.extent(1),
+                     raft::resource::get_cuda_stream(res));
+        }
+        if (host_writeback) { raft::matrix::fill(res, dev_view, IdxT(17)); }
+        total_processed += dev_view.extent(0);
+      }
+    }
+    raft::resource::sync_stream(res);
+
+    EXPECT_EQ(total_processed, n_rows);
+    if (initialize) {
+      for (int64_t i = 0; i < n_rows * n_cols; ++i) {
+        EXPECT_EQ(readback[i], IdxT(13)) << "Mismatch (initialize) at index " << i;
+      }
+    }
+    if (host_writeback) {
+      auto readback_view =
+        raft::make_host_matrix_view<IdxT, int64_t>(readback.data(), n_rows, n_cols);
+      raft::copy(res, readback_view, input_view);
+      raft::resource::sync_stream(res);
+      for (int64_t i = 0; i < n_rows * n_cols; ++i) {
+        EXPECT_EQ(readback[i], IdxT(17)) << "Mismatch (host_writeback) at index " << i;
+      }
+    }
+  }
+
+  raft::resources res;
+};
+
+TEST_F(BatchedDeviceViewFromHostTest, EmptyView)
+{
+  auto host_empty = raft::make_host_matrix<IdxT, int64_t>(0, 8);
+  auto host_view  = host_empty.view();
+  cagra::detail::batched_device_view_from_host<IdxT, int64_t> batched(
+    res, host_view, /*batch_size=*/128, /*host_writeback=*/false, /*initialize=*/true);
+
+  auto view = batched.next_view();
+  EXPECT_EQ(view.extent(0), 0);
+  EXPECT_EQ(view.extent(1), 8);
+  EXPECT_EQ(view.data_handle(), nullptr);
+}
+
+using BatchDimsParam = std::tuple<BatchConfig, DimsConfig>;
+
+class BatchedDeviceViewFromHostParameterizedTest
+  : public BatchedDeviceViewFromHostTest,
+    public ::testing::WithParamInterface<BatchDimsParam> {};
+
+TEST_P(BatchedDeviceViewFromHostParameterizedTest, VectorHostData)
+{
+  auto [batch_config, dims_config]  = GetParam();
+  auto [initialize, host_writeback] = batch_config;
+  auto [n_rows, n_cols, batch_size] = dims_config;
+
+  std::vector<IdxT> host_data(n_rows * n_cols);
+  auto host_view = raft::make_host_matrix_view<IdxT, int64_t>(host_data.data(), n_rows, n_cols);
+
+  std::fill(host_view.data_handle(), host_view.data_handle() + n_rows * n_cols, IdxT(13));
+
+  run_and_verify_batched(host_view, batch_size, host_writeback, initialize);
+}
+
+TEST_P(BatchedDeviceViewFromHostParameterizedTest, PinnedMemory)
+{
+  auto [batch_config, dims_config]  = GetParam();
+  auto [initialize, host_writeback] = batch_config;
+  auto [n_rows, n_cols, batch_size] = dims_config;
+
+  auto host_matrix = raft::make_pinned_matrix<IdxT, int64_t>(res, n_rows, n_cols);
+  auto host_view   = host_matrix.view();
+
+  std::fill(host_view.data_handle(), host_view.data_handle() + n_rows * n_cols, IdxT(13));
+
+  run_and_verify_batched(host_view, batch_size, host_writeback, initialize);
+}
+
+TEST_P(BatchedDeviceViewFromHostParameterizedTest, ManagedMemory)
+{
+  auto [batch_config, dims_config]  = GetParam();
+  auto [initialize, host_writeback] = batch_config;
+  auto [n_rows, n_cols, batch_size] = dims_config;
+
+  auto host_matrix = raft::make_managed_matrix<IdxT, int64_t>(res, n_rows, n_cols);
+  auto host_view   = host_matrix.view();
+
+  std::fill(host_view.data_handle(), host_view.data_handle() + n_rows * n_cols, IdxT(13));
+
+  run_and_verify_batched(host_view, batch_size, host_writeback, initialize);
+}
+
+TEST_P(BatchedDeviceViewFromHostParameterizedTest, DeviceMemory)
+{
+  auto [batch_config, dims_config]  = GetParam();
+  auto [initialize, host_writeback] = batch_config;
+  auto [n_rows, n_cols, batch_size] = dims_config;
+
+  auto host_matrix = raft::make_device_matrix<IdxT, int64_t>(res, n_rows, n_cols);
+  auto host_view   = host_matrix.view();
+
+  raft::matrix::fill(res, host_view, IdxT(13));
+
+  run_and_verify_batched(host_view, batch_size, host_writeback, initialize);
+}
+
+static const std::array<BatchConfig, 3> kBatchConfigs = {{
+  {/*initialize=*/true, /*host_writeback=*/false},
+  {/*initialize=*/false, /*host_writeback=*/true},
+  {/*initialize=*/true, /*host_writeback=*/true},
+}};
+
+static const std::array<DimsConfig, 4> kDimsConfigs = {{
+  {/*n_rows=*/64, /*n_cols=*/32, /*batch_size=*/256},  // rows less than batch size, single batch
+  {/*n_rows=*/64, /*n_cols=*/32, /*batch_size=*/64},   // single batch
+  {/*n_rows=*/256, /*n_cols=*/32, /*batch_size=*/32},  // multiple batches
+  {/*n_rows=*/500,
+   /*n_cols=*/32,
+   /*batch_size=*/128},  // multiple batches, partial batch in the end
+}};
+
+INSTANTIATE_TEST_SUITE_P(BatchConfigs,
+                         BatchedDeviceViewFromHostParameterizedTest,
+                         ::testing::Combine(::testing::ValuesIn(kBatchConfigs),
+                                            ::testing::ValuesIn(kDimsConfigs)));
+
+}  // namespace cuvs::neighbors::cagra

From c412138a0dd6e3b81fa9bc4e10a1b546d71c5476 Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Wed, 11 Mar 2026 00:04:52 +0000
Subject: [PATCH 080/119] switch to cooperative groups as __reduce_min_sync
 causes issues

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 26 ++++++++++++-------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index b5e055820d..2444350253 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -29,11 +29,16 @@
 #include <float.h>
 #include <sys/time.h>
 
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+
 #include <climits>
 #include <iostream>
 #include <memory>
 #include <random>
 
+namespace cg = cooperative_groups;
+
 namespace cuvs::neighbors::cagra::detail::graph {
 
 // unnamed namespace to avoid multiple definition error
@@ -196,6 +201,9 @@ __global__ void kern_fused_prune(const IdxT* const knn_graph,   // [graph_chunk_
 {
   extern __shared__ unsigned char smem_buf[];
 
+  cg::thread_block block         = cg::this_thread_block();
+  cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
+
   const uint32_t wid     = threadIdx.x / raft::WarpSize;
   const uint32_t lane_id = threadIdx.x % raft::WarpSize;
 
@@ -207,8 +215,7 @@ __global__ void kern_fused_prune(const IdxT* const knn_graph,   // [graph_chunk_
   uint64_t* const num_retain = stats;
   uint64_t* const num_full   = stats + 1;
 
-  const unsigned warp_mask = 0xffffffff;
-  const uint32_t maxval16  = 0x0000ffff;
+  const uint32_t maxval16 = 0x0000ffff;
 
   const uint64_t nid_batch = blockIdx.x * num_warps + wid;
   const uint64_t nid       = nid_batch + (batch_size * batch_id);
@@ -255,11 +262,7 @@ __global__ void kern_fused_prune(const IdxT* const knn_graph,   // [graph_chunk_
 
   __syncwarp();
 
-  num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 1);
-  num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 2);
-  num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 4);
-  num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 8);
-  num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 16);
+  num_edges_no_detour = cg::reduce(warp, num_edges_no_detour, cg::plus<uint32_t>());
   num_edges_no_detour = min(num_edges_no_detour, output_graph_degree);
 
   if (lane_id == 0) {
@@ -280,7 +283,7 @@ __global__ void kern_fused_prune(const IdxT* const knn_graph,   // [graph_chunk_
     }
 
     uint32_t local_min_with_tag = (local_min << 16) | ((uint32_t)local_idx);
-    uint32_t warp_min_with_tag  = __reduce_min_sync(warp_mask, local_min_with_tag);
+    uint32_t warp_min_with_tag  = cg::reduce(warp, local_min_with_tag, cg::less<uint32_t>());
     uint32_t warp_min_count     = warp_min_with_tag >> 16;
     uint32_t warp_local_idx     = warp_min_with_tag & 0xffff;
 
@@ -294,7 +297,7 @@ __global__ void kern_fused_prune(const IdxT* const knn_graph,   // [graph_chunk_
     for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) {
       if (smem_indices[k] == selected_node) { smem_num_detour[k] = maxval16; }
     }
-    __syncwarp(warp_mask);
+    __syncwarp();
 
     if (lane_id == 0) { output_graph_ptr[nid_batch * output_graph_degree + i] = selected_node; }
   }
@@ -312,7 +315,10 @@ __device__ unsigned int warp_pos_in_array(T val, const T* array, uint64_t num)
       break;
     }
   }
-  ret = __reduce_min_sync(0xffffffff, ret);
+
+  cg::thread_block block         = cg::this_thread_block();
+  cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
+  ret                            = cg::reduce(warp, ret, cg::less<unsigned int>());
   return ret;
 }
 

From ab01bab594e4337a9b6530a686e0d8642ce61866 Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Fri, 13 Mar 2026 18:43:55 +0000
Subject: [PATCH 081/119] back to column wise reverse graph creation to boost
 closer connections

---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 124 +++++++++++-------
 1 file changed, 78 insertions(+), 46 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 88c13c139e..5d43da851b 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -171,24 +171,38 @@ __global__ void kern_sort(const DATA_T* const dataset,  // [dataset_chunk_size,
   }
 }
 
+template <class IdxT>
+__global__ void kern_make_rev_graph(const IdxT* const dest_nodes,     // [graph_size]
+                                    IdxT* const rev_graph,            // [size, degree]
+                                    uint32_t* const rev_graph_count,  // [graph_size]
+                                    const uint32_t graph_size,
+                                    const uint32_t degree)
+{
+  const uint32_t tid  = threadIdx.x + (blockDim.x * blockIdx.x);
+  const uint32_t tnum = blockDim.x * gridDim.x;
+
+  for (uint32_t src_id = tid; src_id < graph_size; src_id += tnum) {
+    const IdxT dest_id = dest_nodes[src_id];
+    if (dest_id >= graph_size) continue;
+
+    const uint32_t pos = atomicAdd(rev_graph_count + dest_id, 1);
+    if (pos < degree) { rev_graph[pos + ((uint64_t)degree * dest_id)] = src_id; }
+  }
+}
+
 template <typename IdxT>
-__global__ void kern_rev_graph_batched(const IdxT* const dest_nodes,     // [batch_size, degree]
-                                       IdxT* const rev_graph,            // [graph_size, degree]
-                                       uint32_t* const rev_graph_count,  // [graph_size]
-                                       const uint32_t graph_size,
-                                       const uint32_t degree,
-                                       const uint32_t batch_size,
-                                       const uint32_t batch_id)
+__global__ void kern_make_rev_graph_k(const IdxT* const output_graph,   // [graph_size, degree]
+                                      IdxT* const rev_graph,            // [graph_size, degree]
+                                      uint32_t* const rev_graph_count,  // [graph_size]
+                                      const uint32_t graph_size,
+                                      const uint32_t degree,
+                                      uint64_t k)
 {
   const uint64_t tid  = threadIdx.x + (blockDim.x * blockIdx.x);
   const uint64_t tnum = blockDim.x * gridDim.x;
 
-  const uint64_t block_batch_size = min(batch_size, graph_size - batch_id * batch_size);
-
-  for (uint64_t idx = tid; idx < block_batch_size * degree; idx += tnum) {
-    const IdxT dest_id    = dest_nodes[idx];
-    const uint32_t src_id = idx / degree;
-
+  for (uint64_t src_id = tid; src_id < graph_size; src_id += tnum) {
+    IdxT dest_id = output_graph[k + (degree * src_id)];
     if (dest_id >= graph_size) continue;
 
     const uint32_t pos = atomicAdd(rev_graph_count + dest_id, 1);
@@ -840,50 +854,67 @@ void make_reverse_graph_gpu(raft::resources const& res,
                             uint64_t output_graph_degree)
 {
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
-    "cagra::graph::optimize/reverse");
+    "cagra::graph::optimize/reverse2");
 
-  auto default_ws_mr = raft::resource::get_workspace_resource(res);
+  auto d_rev_graph =
+    raft::make_device_vector_view<IdxT, int64_t>(d_rev_graph_ptr, graph_size * output_graph_degree);
+  auto d_rev_graph_count =
+    raft::make_device_vector_view<uint32_t, int64_t>(d_rev_graph_count_ptr, graph_size);
 
-  raft::matrix::fill(
-    res,
-    raft::make_device_vector_view<IdxT, int64_t>(d_rev_graph_ptr, graph_size * output_graph_degree),
-    IdxT(-1));
+  //
+  // Make reverse graph
+  //
+  const double time_make_start = cur_time();
 
-  raft::matrix::fill(
-    res,
-    raft::make_device_vector_view<IdxT, int64_t>(d_rev_graph_count_ptr, graph_size),
-    uint32_t(0));
+  raft::matrix::fill(res, d_rev_graph, IdxT(-1));
+  raft::matrix::fill(res, d_rev_graph_count, uint32_t(0));
 
-  const uint32_t batch_size =
-    std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
-  const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
+  if (is_ptr_host_accessible(output_graph_ptr)) {
+    auto d_dest_nodes =
+      raft::make_device_mdarray<IdxT>(res, raft::make_extents<int64_t>(graph_size));
 
-  batched_device_view_from_host<IdxT, int64_t> d_output_graph(
-    res,
-    raft::make_host_matrix_view<IdxT, int64_t>(output_graph_ptr, graph_size, output_graph_degree),
-    /*batch_size*/ batch_size,
-    /*host_writeback*/ false,
-    /*initialize*/ true);
-
-  for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
+    for (uint64_t k = 0; k < output_graph_degree; k++) {
+      RAFT_CUDA_TRY(cudaMemcpy2DAsync(d_dest_nodes.data_handle(),
+                                      sizeof(IdxT),
+                                      output_graph_ptr + k,
+                                      output_graph_degree * sizeof(IdxT),
+                                      1 * sizeof(IdxT),
+                                      graph_size,
+                                      cudaMemcpyHostToDevice,
+                                      raft::resource::get_cuda_stream(res)));
+
+      dim3 threads(256, 1, 1);
+      dim3 blocks(1024, 1, 1);
+      kern_make_rev_graph<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
+        d_dest_nodes.data_handle(),
+        d_rev_graph.data_handle(),
+        d_rev_graph_count.data_handle(),
+        graph_size,
+        output_graph_degree);
+      RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u    \r", k, output_graph_degree);
+    }
+  } else {
+    // output graph is fully device accessible, so we need no copy to device
     dim3 threads(256, 1, 1);
     dim3 blocks(1024, 1, 1);
-    auto output_view = d_output_graph.next_view();
-
-    kern_rev_graph_batched<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
-      output_view.data_handle(),
-      d_rev_graph_ptr,
-      d_rev_graph_count_ptr,
-      static_cast<uint32_t>(graph_size),
-      static_cast<uint32_t>(output_graph_degree),
-      static_cast<uint32_t>(batch_size),
-      static_cast<uint32_t>(i_batch));
+    for (uint64_t k = 0; k < output_graph_degree; k++) {
+      kern_make_rev_graph_k<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
+        output_graph_ptr,
+        d_rev_graph.data_handle(),
+        d_rev_graph_count.data_handle(),
+        graph_size,
+        output_graph_degree,
+        k);
+    }
   }
 
   raft::resource::sync_stream(res);
   RAFT_LOG_DEBUG("\n");
+
+  const double time_make_end = cur_time();
+  RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms",
+                 (time_make_end - time_make_start) * 1000.0);
 }
-}  // namespace
 
 template <typename DataT,
           typename IdxT       = uint32_t,
@@ -1585,7 +1616,8 @@ void prune_graph_gpu(raft::resources const& res,
     (double)num_full / graph_size * 100);
 }
 
-// TODO allow pinned input for both knn_graph and new_graph
+}  // namespace
+
 template <typename IdxT = uint32_t, typename InputMatrixView, typename OutputMatrixView>
 void optimize(raft::resources const& res,
               InputMatrixView knn_graph,

From 68f78839a5437d48f54d876b484efded20e8448d Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Fri, 13 Mar 2026 20:00:55 +0000
Subject: [PATCH 082/119] fix signedness

---
 cpp/src/neighbors/detail/cagra/utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp
index 44d87d2993..7dae487863 100644
--- a/cpp/src/neighbors/detail/cagra/utils.hpp
+++ b/cpp/src/neighbors/detail/cagra/utils.hpp
@@ -561,7 +561,7 @@ class batched_device_view_from_host {
 
     // if data is on device only this is almost a noop, just prepping the pointers
 
-    RAFT_EXPECTS(offset_ <= host_view_.extent(0), "Offset out of bounds");
+    RAFT_EXPECTS(static_cast<uint64_t>(offset_) <= host_view_.extent(0), "Offset out of bounds");
 
     bool next_batch_exists = offset_ < static_cast<uint64_t>(host_view_.extent(0));
 

From add206a7697aaf019543a43e39f763858992c5a2 Mon Sep 17 00:00:00 2001
From: Malte Foerster <mfoerster@nvidia.com>
Date: Fri, 13 Mar 2026 22:51:05 +0000
Subject: [PATCH 083/119] fix signedness: cast offset_ to int64_t in bounds check

---
 cpp/src/neighbors/detail/cagra/utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp
index 7dae487863..79d1ed1cae 100644
--- a/cpp/src/neighbors/detail/cagra/utils.hpp
+++ b/cpp/src/neighbors/detail/cagra/utils.hpp
@@ -561,7 +561,7 @@ class batched_device_view_from_host {
 
     // if data is on device only this is almost a noop, just prepping the pointers
 
-    RAFT_EXPECTS(static_cast<uint64_t>(offset_) <= host_view_.extent(0), "Offset out of bounds");
+    RAFT_EXPECTS(static_cast<int64_t>(offset_) <= host_view_.extent(0), "Offset out of bounds");
 
     bool next_batch_exists = offset_ < static_cast<uint64_t>(host_view_.extent(0));
 

From ab21766205138c20603ca1abfe7832f20bb37cda Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Mon, 16 Mar 2026 08:23:42 -0700
Subject: [PATCH 084/119] leftover files

---
 .gitignore                                     | 3 +++
 cpp/src/neighbors/detail/cagra/cagra_build.cuh | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 317c28d997..0066d2b89a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -88,5 +88,8 @@ ivf_pq_index
 /datasets/
 /*.json
 
+# clangd
+*/.clangd
+
 # java
 .classpath
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 58c48e4023..31797acb5b 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -2254,7 +2254,7 @@ auto iterative_build_graph(
     idx_opt->update_dataset(
       res,
       // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later
-      cuvs::neighbors::vpq_build<decltype(dev_dataset), half, int64_t>(
+      cuvs::preprocessing::quantize::pq::vpq_build(
         res, *params.compression, dev_dataset));
     auto end        = std::chrono::high_resolution_clock::now();
     auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

From 6f450cdd066dc71ec44da92304915fc182e154d5 Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Mon, 16 Mar 2026 09:44:03 -0700
Subject: [PATCH 085/119] Revert graph_core.cuh to merge base before merging PR
 1830

Discarding local GPU reverse graph / edge selection changes in
graph_core.cuh to cleanly accept upstream PR 1830 optimizations.

Made-with: Cursor
---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 341 +++++-------------
 1 file changed, 92 insertions(+), 249 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index b088c41e49..d94e279829 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -31,7 +31,6 @@
 #include <climits>
 #include <iostream>
 #include <memory>
-#include <optional>
 #include <random>
 
 namespace cuvs::neighbors::cagra::detail::graph {
@@ -250,67 +249,6 @@ __global__ void kern_make_rev_graph(const IdxT* const dest_nodes,     // [graph_
   }
 }
 
-template <class IdxT>
-__global__ void kern_extract_column(const IdxT* const d_matrix,
-                                    IdxT* const d_column,
-                                    const uint32_t n_rows,
-                                    const uint32_t n_cols,
-                                    const uint32_t col_idx)
-{
-  const uint32_t tid  = threadIdx.x + (blockDim.x * blockIdx.x);
-  const uint32_t tnum = blockDim.x * gridDim.x;
-  for (uint32_t i = tid; i < n_rows; i += tnum) {
-    d_column[i] = d_matrix[col_idx + (static_cast<uint64_t>(n_cols) * i)];
-  }
-}
-
-template <class IdxT>
-__global__ void kern_select_edges(const uint8_t* const d_detour_count,
-                                  const IdxT* const d_knn_graph,
-                                  IdxT* const d_output_graph,
-                                  const uint32_t graph_size,
-                                  const uint32_t knn_graph_degree,
-                                  const uint32_t output_graph_degree,
-                                  uint32_t* const d_invalid_count)
-{
-  const uint64_t i = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
-  if (i >= graph_size) return;
-
-  const uint8_t* my_detour = d_detour_count + i * knn_graph_degree;
-  const IdxT* my_knn       = d_knn_graph + i * knn_graph_degree;
-  IdxT* my_output          = d_output_graph + i * output_graph_degree;
-
-  uint32_t pk         = 0;
-  uint32_t num_detour = 0;
-  for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) {
-    uint32_t next_num_detour = 0xFFFFFFFFu;
-    for (uint32_t k = 0; k < knn_graph_degree; k++) {
-      const uint32_t d = my_detour[k];
-      if (d > num_detour) { next_num_detour = min(next_num_detour, d); }
-      if (d != num_detour) { continue; }
-
-      const IdxT candidate = my_knn[k];
-      bool dup              = false;
-      for (uint32_t dk = 0; dk < pk; dk++) {
-        if (candidate == my_output[dk]) {
-          dup = true;
-          break;
-        }
-      }
-      if (!dup && candidate < static_cast<IdxT>(graph_size)) {
-        my_output[pk] = candidate;
-        pk++;
-      }
-      if (pk >= output_graph_degree) break;
-    }
-    if (pk >= output_graph_degree) break;
-    if (next_num_detour == 0xFFFFFFFFu) break;
-    num_detour = next_num_detour;
-  }
-
-  if (pk != output_graph_degree) { atomicAdd(d_invalid_count, 1); }
-}
-
 template <class IdxT, class LabelT>
 __device__ __host__ LabelT get_root_label(IdxT i, const LabelT* label)
 {
@@ -1214,8 +1152,7 @@ void optimize(
   raft::mdspan<IdxT, raft::matrix_extent<int64_t>, raft::row_major, g_accessor> knn_graph,
   raft::host_matrix_view<IdxT, int64_t, raft::row_major> new_graph,
   const bool guarantee_connectivity = true,
-  const bool use_gpu                = true,
-  const IdxT* d_knn_graph_ptr       = nullptr)
+  const bool use_gpu                = true)
 {
   RAFT_LOG_DEBUG(
     "# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));
@@ -1257,10 +1194,6 @@ void optimize(
     }
   }
 
-  // Device pruned graph: populated by GPU edge selection, reused by GPU reverse graph.
-  auto d_pruned_graph = raft::make_device_mdarray<IdxT>(
-    res, large_tmp_mr, raft::make_extents<int64_t>(0, 0));
-
   {
     raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
       "cagra::graph::optimize/prune");
@@ -1324,18 +1257,11 @@ void optimize(
 
       RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");
 
-      // Use device knn_graph directly if provided; otherwise copy from host.
-      std::optional<device_matrix_view_from_host<IdxT, int64_t>> d_input_graph_copy;
-      const IdxT* d_input_graph_handle;
-      if (d_knn_graph_ptr != nullptr) {
-        d_input_graph_handle = d_knn_graph_ptr;
-      } else {
-        d_input_graph_copy.emplace(
-          res,
-          raft::make_host_matrix_view<IdxT, int64_t>(
-            knn_graph.data_handle(), graph_size, knn_graph_degree));
-        d_input_graph_handle = d_input_graph_copy->data_handle();
-      }
+      // Copy knn_graph over to device if necessary
+      device_matrix_view_from_host d_input_graph(
+        res,
+        raft::make_host_matrix_view<IdxT, int64_t>(
+          knn_graph.data_handle(), graph_size, knn_graph_degree));
 
       constexpr int MAX_DEGREE = 1024;
       if (knn_graph_degree > MAX_DEGREE) {
@@ -1356,7 +1282,7 @@ void optimize(
       for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
         kern_prune<MAX_DEGREE, IdxT>
           <<<blocks_prune, threads_prune, 0, raft::resource::get_cuda_stream(res)>>>(
-            d_input_graph_handle,
+            d_input_graph.data_handle(),
             graph_size,
             knn_graph_degree,
             output_graph_degree,
@@ -1373,6 +1299,8 @@ void optimize(
       raft::resource::sync_stream(res);
       RAFT_LOG_DEBUG("\n");
 
+      raft::copy(res, detour_count.view(), raft::make_const_mdspan(d_detour_count.view()));
+
       raft::copy(res, host_stats.view(), raft::make_const_mdspan(dev_stats.view()));
       num_keep = host_stats.data_handle()[0];
       num_full = host_stats.data_handle()[1];
@@ -1386,45 +1314,6 @@ void optimize(
         (double)num_keep / graph_size,
         output_graph_degree,
         (double)num_full / graph_size * 100);
-
-      // GPU edge selection: pick output_graph_degree edges per node with lowest detour counts.
-      d_pruned_graph = raft::make_device_mdarray<IdxT>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
-      {
-        raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> select_scope(
-          "cagra::graph::optimize/prune/edge-selection-by-GPU");
-        auto d_invalid_count = raft::make_device_mdarray<uint32_t>(
-          res, large_tmp_mr, raft::make_extents<int64_t>(1));
-        raft::matrix::fill(res, d_invalid_count.view(), uint32_t(0));
-
-        const uint32_t select_threads = 256;
-        const uint32_t select_blocks  = (graph_size + select_threads - 1) / select_threads;
-        kern_select_edges<IdxT>
-          <<<select_blocks, select_threads, 0, raft::resource::get_cuda_stream(res)>>>(
-            d_detour_count.data_handle(),
-            d_input_graph_handle,
-            d_pruned_graph.data_handle(),
-            graph_size,
-            knn_graph_degree,
-            output_graph_degree,
-            d_invalid_count.data_handle());
-        raft::resource::sync_stream(res);
-
-        auto h_invalid_count = raft::make_host_vector<uint32_t, int64_t>(1);
-        raft::copy(res, h_invalid_count.view(), raft::make_const_mdspan(d_invalid_count.view()));
-        raft::resource::sync_stream(res);
-        RAFT_EXPECTS(
-          h_invalid_count.data_handle()[0] == 0,
-          "Could not generate an intermediate CAGRA graph because the initial kNN graph "
-          "contains too many invalid or duplicated neighbor nodes. (%u nodes failed)",
-          h_invalid_count.data_handle()[0]);
-
-        raft::copy(output_graph_ptr,
-                   d_pruned_graph.data_handle(),
-                   graph_size * output_graph_degree,
-                   raft::resource::get_cuda_stream(res));
-        raft::resource::sync_stream(res);
-      }
     } else {
       // Count 2-hop detours on CPU
       raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
@@ -1436,66 +1325,66 @@ void optimize(
       const double time_2hop_count_end = cur_time();
       RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec",
                      time_2hop_count_end - time_2hop_count_start);
+    }
 
-      // Create pruned kNN graph
-      bool invalid_neighbor_list = false;
+    // Create pruned kNN graph
+    bool invalid_neighbor_list = false;
 #pragma omp parallel for
-      for (uint64_t i = 0; i < graph_size; i++) {
-        // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable
-        // count of the neighbors while increasing the target detourable count from zero.
-        uint64_t pk         = 0;
-        uint32_t num_detour = 0;
-        for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) {
-          uint32_t next_num_detour = std::numeric_limits<uint32_t>::max();
-          for (uint64_t k = 0; k < knn_graph_degree; k++) {
-            const auto num_detour_k = detour_count(i, k);
-            // Find the detourable count to check in the next iteration
-            if (num_detour_k > num_detour) {
-              next_num_detour = std::min(static_cast<uint32_t>(num_detour_k), next_num_detour);
-            }
+    for (uint64_t i = 0; i < graph_size; i++) {
+      // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable
+      // count of the neighbors while increasing the target detourable count from zero.
+      uint64_t pk         = 0;
+      uint32_t num_detour = 0;
+      for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) {
+        uint32_t next_num_detour = std::numeric_limits<uint32_t>::max();
+        for (uint64_t k = 0; k < knn_graph_degree; k++) {
+          const auto num_detour_k = detour_count(i, k);
+          // Find the detourable count to check in the next iteration
+          if (num_detour_k > num_detour) {
+            next_num_detour = std::min(static_cast<uint32_t>(num_detour_k), next_num_detour);
+          }
 
-            // Store the neighbor index if its detourable count is equal to `num_detour`.
-            if (num_detour_k != num_detour) { continue; }
-
-            // Check duplication and append
-            const auto candidate_node = knn_graph(i, k);
-            bool dup                  = false;
-            for (uint32_t dk = 0; dk < pk; dk++) {
-              if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) {
-                dup = true;
-                break;
-              }
-            }
-            if (!dup && candidate_node < graph_size) {
-              output_graph_ptr[i * output_graph_degree + pk] = candidate_node;
-              pk += 1;
+          // Store the neighbor index if its detourable count is equal to `num_detour`.
+          if (num_detour_k != num_detour) { continue; }
+
+          // Check duplication and append
+          const auto candidate_node = knn_graph(i, k);
+          bool dup                  = false;
+          for (uint32_t dk = 0; dk < pk; dk++) {
+            if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) {
+              dup = true;
+              break;
             }
-            if (pk >= output_graph_degree) break;
           }
-          if (pk >= output_graph_degree) break;
-
-          if (next_num_detour == std::numeric_limits<uint32_t>::max()) {
-            // There are no valid edges enough in the initial kNN graph. Break the loop here and
-            // catch the error at the next validation (pk != output_graph_degree).
-            break;
+          if (!dup && candidate_node < graph_size) {
+            output_graph_ptr[i * output_graph_degree + pk] = candidate_node;
+            pk += 1;
           }
-          num_detour = next_num_detour;
+          if (pk >= output_graph_degree) break;
         }
-        if (pk != output_graph_degree) {
-          RAFT_LOG_DEBUG(
-            "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for "
-            "node %lu in the rank-based node reranking process",
-            output_graph_degree,
-            i);
-          invalid_neighbor_list = true;
+        if (pk >= output_graph_degree) break;
+
+        if (next_num_detour == std::numeric_limits<uint32_t>::max()) {
+          // There are no valid edges enough in the initial kNN graph. Break the loop here and catch
+          // the error at the next validation (pk != output_graph_degree).
+          break;
         }
+        num_detour = next_num_detour;
+      }
+      if (pk != output_graph_degree) {
+        RAFT_LOG_DEBUG(
+          "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for "
+          "node %lu in the rank-based node reranking process",
+          output_graph_degree,
+          i);
+        invalid_neighbor_list = true;
       }
-      RAFT_EXPECTS(
-        !invalid_neighbor_list,
-        "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too "
-        "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many "
-        "overflows occur during the norm computation between the dataset vectors.");
     }
+    RAFT_EXPECTS(
+      !invalid_neighbor_list,
+      "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too "
+      "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many "
+      "overflows occur during the norm computation between the dataset vectors.");
 
     const double time_prune_end = cur_time();
     RAFT_LOG_DEBUG("# Pruning time: %.1lf ms", (time_prune_end - time_prune_start) * 1000.0);
@@ -1512,94 +1401,48 @@ void optimize(
     //
     const double time_make_start = cur_time();
 
-    if (d_pruned_graph.extent(0) > 0) {
-      // GPU path: d_pruned_graph is on device; extract columns on device to preserve
-      // column-priority ordering (earlier columns get priority in the reverse graph).
-      auto d_rev_graph = raft::make_device_mdarray<IdxT>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
-      raft::matrix::fill(res,
-                         raft::make_device_vector_view<IdxT, int64_t>(
-                           d_rev_graph.data_handle(), graph_size * output_graph_degree),
-                         IdxT(-1));
-
-      auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
-      raft::matrix::fill(res, d_rev_graph_count.view(), uint32_t(0));
+    device_matrix_view_from_host<IdxT, int64_t> d_rev_graph(res, rev_graph.view());
+    raft::matrix::fill(res,
+                       raft::make_device_vector_view<IdxT, int64_t>(
+                         d_rev_graph.data_handle(), graph_size * output_graph_degree),
+                       IdxT(-1));
 
-      auto d_dest_nodes =
-        raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
+    auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
+      res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
+    raft::matrix::fill(res, d_rev_graph_count.view(), uint32_t(0));
 
-      for (uint64_t k = 0; k < output_graph_degree; k++) {
-        dim3 ext_threads(256, 1, 1);
-        dim3 ext_blocks(std::min(static_cast<uint32_t>((graph_size + 255) / 256), 65535u), 1, 1);
-        kern_extract_column<IdxT>
-          <<<ext_blocks, ext_threads, 0, raft::resource::get_cuda_stream(res)>>>(
-            d_pruned_graph.data_handle(),
-            d_dest_nodes.data_handle(),
-            graph_size,
-            output_graph_degree,
-            k);
-
-        dim3 threads(256, 1, 1);
-        dim3 blocks(1024, 1, 1);
-        kern_make_rev_graph<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
-          d_dest_nodes.data_handle(),
-          d_rev_graph.data_handle(),
-          d_rev_graph_count.data_handle(),
-          graph_size,
-          output_graph_degree);
+    auto dest_nodes = raft::make_host_vector<IdxT, int64_t>(graph_size);
+    auto d_dest_nodes =
+      raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
+
+    for (uint64_t k = 0; k < output_graph_degree; k++) {
+#pragma omp parallel for
+      for (uint64_t i = 0; i < graph_size; i++) {
+        // dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)];
+        dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)];
       }
       raft::resource::sync_stream(res);
 
-      d_pruned_graph = raft::make_device_mdarray<IdxT>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(0, 0));
-
-      raft::copy(res, rev_graph.view(), raft::make_const_mdspan(d_rev_graph.view()));
-      raft::copy(res, rev_graph_count.view(), raft::make_const_mdspan(d_rev_graph_count.view()));
-    } else {
-      // CPU fallback: per-column H-to-D copy approach.
-      device_matrix_view_from_host<IdxT, int64_t> d_rev_graph(res, rev_graph.view());
-      raft::matrix::fill(res,
-                         raft::make_device_vector_view<IdxT, int64_t>(
-                           d_rev_graph.data_handle(), graph_size * output_graph_degree),
-                         IdxT(-1));
-
-      auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
-        res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
-      raft::matrix::fill(res, d_rev_graph_count.view(), uint32_t(0));
-
-      auto dest_nodes = raft::make_host_vector<IdxT, int64_t>(graph_size);
-      auto d_dest_nodes =
-        raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
-
-      for (uint64_t k = 0; k < output_graph_degree; k++) {
-#pragma omp parallel for
-        for (uint64_t i = 0; i < graph_size; i++) {
-          dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)];
-        }
-        raft::resource::sync_stream(res);
+      raft::copy(res, d_dest_nodes.view(), raft::make_const_mdspan(dest_nodes.view()));
 
-        raft::copy(res, d_dest_nodes.view(), raft::make_const_mdspan(dest_nodes.view()));
-
-        dim3 threads(256, 1, 1);
-        dim3 blocks(1024, 1, 1);
-        kern_make_rev_graph<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
-          d_dest_nodes.data_handle(),
-          d_rev_graph.data_handle(),
-          d_rev_graph_count.data_handle(),
-          graph_size,
-          output_graph_degree);
-        RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u    \r", k, output_graph_degree);
-      }
+      dim3 threads(256, 1, 1);
+      dim3 blocks(1024, 1, 1);
+      kern_make_rev_graph<<<blocks, threads, 0, raft::resource::get_cuda_stream(res)>>>(
+        d_dest_nodes.data_handle(),
+        d_rev_graph.data_handle(),
+        d_rev_graph_count.data_handle(),
+        graph_size,
+        output_graph_degree);
+      RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u    \r", k, output_graph_degree);
+    }
 
-      raft::resource::sync_stream(res);
-      RAFT_LOG_DEBUG("\n");
+    raft::resource::sync_stream(res);
+    RAFT_LOG_DEBUG("\n");
 
-      if (d_rev_graph.allocated_memory()) {
-        raft::copy(res, rev_graph.view(), raft::make_const_mdspan(d_rev_graph.view()));
-      }
-      raft::copy(res, rev_graph_count.view(), raft::make_const_mdspan(d_rev_graph_count.view()));
+    if (d_rev_graph.allocated_memory()) {
+      raft::copy(res, rev_graph.view(), raft::make_const_mdspan(d_rev_graph.view()));
     }
+    raft::copy(res, rev_graph_count.view(), raft::make_const_mdspan(d_rev_graph_count.view()));
 
     const double time_make_end = cur_time();
     RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms",

From d2195fd5fb20209205a46ea17e3c35349450875f Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Mon, 16 Mar 2026 10:18:26 -0700
Subject: [PATCH 086/119] older api artefact

---
 .../neighbors/detail/cagra/cagra_build.cuh    | 27 +++----------------
 1 file changed, 3 insertions(+), 24 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 5931bfa156..690a706619 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -1903,8 +1903,7 @@ void optimize(
   raft::resources const& res,
   raft::mdspan<IdxT, raft::matrix_extent<int64_t>, raft::row_major, g_accessor> knn_graph,
   raft::host_matrix_view<IdxT, int64_t, raft::row_major> new_graph,
-  const bool guarantee_connectivity = false,
-  const IdxT* d_knn_graph_ptr       = nullptr)
+  const bool guarantee_connectivity = false)
 {
   using internal_IdxT = typename std::make_unsigned<IdxT>::type;
 
@@ -1922,12 +1921,7 @@ void optimize(
       knn_graph.extent(1));
 
   cagra::detail::graph::optimize(
-    res,
-    knn_graph_internal,
-    new_graph_internal,
-    guarantee_connectivity,
-    true,
-    reinterpret_cast<const internal_IdxT*>(d_knn_graph_ptr));
+    res, knn_graph_internal, new_graph_internal, guarantee_connectivity);
 }
 
 // RAII wrapper for allocating memory with Transparent HugePage
@@ -2093,12 +2087,6 @@ void search_and_optimize(raft::resources const& res,
 {
   auto stream = raft::resource::get_cuda_stream(res);
 
-  // Accumulate search results on device to avoid D-to-H + H-to-D round-trip.
-  auto dev_knn_graph =
-    raft::make_device_matrix<IdxT, int64_t>(res, curr_query_size, curr_topk);
-
-  // Search in batches, accumulate results on both device and host.
-  // Host copy is needed by optimize Phase 3 (edge selection) which currently runs on CPU.
   cuvs::spatial::knn::detail::utils::batch_load_iterator<T> query_batch(
     dev_query_view.data_handle(),
     curr_query_size,
@@ -2121,28 +2109,19 @@ void search_and_optimize(raft::resources const& res,
                                    batch_dev_neighbors_view,
                                    batch_dev_distances_view);
 
-    // D-to-D: accumulate into device knn_graph
-    raft::copy(dev_knn_graph.data_handle() + batch.offset() * curr_topk,
-               batch_dev_neighbors_view.data_handle(),
-               batch.size() * curr_topk,
-               stream);
-
-    // D-to-H: still needed for optimize Phase 3 (host edge selection)
     raft::copy(neighbors_view.data_handle() + batch.offset() * curr_topk,
                batch_dev_neighbors_view.data_handle(),
                batch.size() * curr_topk,
                stream);
   }
 
-  // Optimize graph, passing device knn_graph to skip H-to-D copy inside optimize Phase 2.
   auto next_graph_size = curr_query_size;
   cagra_graph          = raft::make_host_matrix<IdxT, int64_t>(0, 0);
   cagra_graph          = raft::make_host_matrix<IdxT, int64_t>(next_graph_size, next_graph_degree);
   optimize<IdxT>(res,
                  neighbors_view,
                  cagra_graph.view(),
-                 flag_last ? params.guarantee_connectivity : 0,
-                 dev_knn_graph.data_handle());
+                 flag_last ? params.guarantee_connectivity : false);
 }
 
 template <typename T,

From 9f315cebfb9718b0c68a210ae0cfa7fe7f8b5478 Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Tue, 17 Mar 2026 04:33:55 -0700
Subject: [PATCH 087/119] put all optimize() steps onto device, no more extra
 copies d->h; also no connectivity guarantee

---
 .../neighbors/detail/cagra/cagra_build.cuh    | 147 ++++--------------
 cpp/src/neighbors/detail/cagra/graph_core.cuh |   2 +-
 2 files changed, 35 insertions(+), 114 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 690a706619..8c1fb2b0d7 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -2022,53 +2022,6 @@ void reconstruct_vpq_queries(raft::resources const& res,
       output.data_handle());
 }
 
-template <typename T, typename IdxT>
-void search_to_device_graph(raft::resources const& res,
-                            const cuvs::neighbors::cagra::search_params& search_params,
-                            const index<T, IdxT>& idx,
-                            raft::device_matrix_view<const T, int64_t> dev_query_view,
-                            raft::device_matrix_view<IdxT, int64_t> dev_neighbors,
-                            raft::device_matrix_view<float, int64_t> dev_distances,
-                            raft::device_matrix_view<IdxT, int64_t> dev_graph_output,
-                            size_t curr_query_size,
-                            size_t next_graph_degree,
-                            size_t curr_topk,
-                            uint64_t max_chunk_size)
-{
-  cuvs::spatial::knn::detail::utils::batch_load_iterator<T> query_batch(
-    dev_query_view.data_handle(),
-    curr_query_size,
-    dev_query_view.extent(1),
-    max_chunk_size,
-    raft::resource::get_cuda_stream(res),
-    raft::resource::get_workspace_resource(res));
-
-  for (const auto& batch : query_batch) {
-    auto batch_dev_query_view = raft::make_device_matrix_view<const T, int64_t>(
-      batch.data(), batch.size(), dev_query_view.extent(1));
-    auto batch_dev_neighbors_view = raft::make_device_matrix_view<IdxT, int64_t>(
-      dev_neighbors.data_handle(), batch.size(), curr_topk);
-    auto batch_dev_distances_view = raft::make_device_matrix_view<float, int64_t>(
-      dev_distances.data_handle(), batch.size(), curr_topk);
-
-    cuvs::neighbors::cagra::search(
-      res, search_params, idx, batch_dev_query_view, batch_dev_neighbors_view,
-      batch_dev_distances_view);
-
-    RAFT_CUDA_TRY(cudaMemcpy2DAsync(
-      dev_graph_output.data_handle() + batch.offset() * next_graph_degree,
-      next_graph_degree * sizeof(IdxT),
-      dev_neighbors.data_handle(),
-      curr_topk * sizeof(IdxT),
-      next_graph_degree * sizeof(IdxT),
-      batch.size(),
-      cudaMemcpyDeviceToDevice,
-      raft::resource::get_cuda_stream(res)));
-  }
-
-  raft::resource::sync_stream(res);
-}
-
 template <typename T, typename IdxT>
 void search_and_optimize(raft::resources const& res,
                          const cuvs::neighbors::cagra::search_params& search_params,
@@ -2076,17 +2029,16 @@ void search_and_optimize(raft::resources const& res,
                          raft::device_matrix_view<const T, int64_t> dev_query_view,
                          raft::device_matrix_view<IdxT, int64_t> dev_neighbors,
                          raft::device_matrix_view<float, int64_t> dev_distances,
-                         raft::host_matrix_view<IdxT, int64_t> neighbors_view,
-                         raft::host_matrix<IdxT, int64_t>& cagra_graph,
+                         raft::device_matrix<IdxT, int64_t>& dev_output_graph,
                          size_t curr_query_size,
                          size_t next_graph_degree,
                          size_t curr_topk,
-                         uint64_t max_chunk_size,
-                         bool flag_last,
-                         const index_params& params)
+                         uint64_t max_chunk_size)
 {
   auto stream = raft::resource::get_cuda_stream(res);
 
+  auto dev_knn_graph = raft::make_device_matrix<IdxT, int64_t>(res, curr_query_size, curr_topk);
+
   cuvs::spatial::knn::detail::utils::batch_load_iterator<T> query_batch(
     dev_query_view.data_handle(),
     curr_query_size,
@@ -2109,19 +2061,16 @@ void search_and_optimize(raft::resources const& res,
                                    batch_dev_neighbors_view,
                                    batch_dev_distances_view);
 
-    raft::copy(neighbors_view.data_handle() + batch.offset() * curr_topk,
+    raft::copy(dev_knn_graph.data_handle() + batch.offset() * curr_topk,
                batch_dev_neighbors_view.data_handle(),
                batch.size() * curr_topk,
                stream);
   }
 
-  auto next_graph_size = curr_query_size;
-  cagra_graph          = raft::make_host_matrix<IdxT, int64_t>(0, 0);
-  cagra_graph          = raft::make_host_matrix<IdxT, int64_t>(next_graph_size, next_graph_degree);
-  optimize<IdxT>(res,
-                 neighbors_view,
-                 cagra_graph.view(),
-                 flag_last ? params.guarantee_connectivity : false);
+  dev_output_graph =
+    raft::make_device_matrix<IdxT, int64_t>(res, curr_query_size, next_graph_degree);
+
+  graph::optimize<IdxT>(res, dev_knn_graph.view(), dev_output_graph.view(), false);
 }
 
 template <typename T,
@@ -2206,18 +2155,9 @@ auto iterative_build_graph(
     }
   }
 
-  // Allocate memory for neighbors list using Transparent HugePage
-  constexpr size_t thp_size = 2 * 1024 * 1024;
-  size_t byte_size          = sizeof(IdxT) * final_graph_size * topk;
-  if (byte_size % thp_size) { byte_size += thp_size - (byte_size % thp_size); }
-  mmap_owner neighbors_list(byte_size);
-  IdxT* neighbors_ptr = (IdxT*)neighbors_list.data();
-  memset(neighbors_ptr, 0, byte_size);
-
   bool flag_last       = false;
   auto curr_graph_size = initial_graph_size;
 
-  // Device graph for skip_graph_optimization: keeps the graph on device between iterations.
   auto dev_graph        = raft::make_device_matrix<IdxT, int64_t>(res, 0, 0);
   bool use_device_graph = false;
 
@@ -2244,13 +2184,13 @@ auto iterative_build_graph(
       "# Freed original dataset from device (%.1f MiB); queries will use VPQ reconstruction",
       to_mib(final_graph_size * dataset_dim * sizeof(T)));
   }
-  bool do_skip = true;
   while (true) {
     auto start           = std::chrono::high_resolution_clock::now();
     auto curr_query_size = std::min(2 * curr_graph_size, final_graph_size);
 
     auto next_graph_degree = small_graph_degree;
     if (curr_graph_size == final_graph_size) { next_graph_degree = graph_degree; }
+    RAFT_LOG_INFO("Current graph size %lu: # current graph degree = %lu", (uint64_t)curr_graph_size, (uint64_t)next_graph_degree);
 
     // The search count (topk) is set to the next graph degree + 1, because
     // pruning is not used except in the last iteration.
@@ -2262,9 +2202,6 @@ auto iterative_build_graph(
       curr_itopk_size = curr_topk + 32;
     }
 
-    do_skip = false;//params.skip_graph_optimization && !flag_last;
-    RAFT_LOG_INFO("# do_skip = %s", do_skip ? "true" : "false");
-
     cuvs::neighbors::cagra::search_params search_params;
     search_params.algo           = cuvs::neighbors::cagra::search_algo::AUTO;
     search_params.max_queries    = max_chunk_size;
@@ -2311,46 +2248,22 @@ auto iterative_build_graph(
       : raft::make_device_matrix_view<const T, int64_t>(
           dev_dataset.data_handle(), (int64_t)curr_query_size, dev_dataset.extent(1));
 
-    if (do_skip) {
-      auto dev_graph_next =
-        raft::make_device_matrix<IdxT, int64_t>(res, curr_query_size, next_graph_degree);
-
-      search_to_device_graph(res,
-                             search_params,
-                             idx,
-                             dev_query_view,
-                             dev_neighbors.view(),
-                             dev_distances.view(),
-                             dev_graph_next.view(),
-                             curr_query_size,
-                             next_graph_degree,
-                             curr_topk,
-                             max_chunk_size);
-
-      dev_graph        = std::move(dev_graph_next);
-      use_device_graph = true;
-    } else {
-      auto neighbors_view =
-        raft::make_host_matrix_view<IdxT, int64_t>(neighbors_ptr, curr_query_size, curr_topk);
-
-      search_and_optimize(res,
-                          search_params,
-                          idx,
-                          dev_query_view,
-                          dev_neighbors.view(),
-                          dev_distances.view(),
-                          neighbors_view,
-                          cagra_graph,
-                          curr_query_size,
-                          next_graph_degree,
-                          curr_topk,
-                          max_chunk_size,
-                          flag_last,
-                          params);
-
-      dev_graph        = raft::make_device_matrix<IdxT, int64_t>(res, 0, 0);
-      use_device_graph = false;
-    }
+    auto dev_optimized_graph = raft::make_device_matrix<IdxT, int64_t>(res, 0, 0);
+
+    search_and_optimize(res,
+                        search_params,
+                        idx,
+                        dev_query_view,
+                        dev_neighbors.view(),
+                        dev_distances.view(),
+                        dev_optimized_graph,
+                        curr_query_size,
+                        next_graph_degree,
+                        curr_topk,
+                        max_chunk_size);
+
+    dev_graph        = std::move(dev_optimized_graph);
+    use_device_graph = true;
 
     auto end        = std::chrono::high_resolution_clock::now();
     auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
@@ -2362,6 +2275,14 @@ auto iterative_build_graph(
     curr_graph_size      = next_graph_size;
   }
 
+  auto stream = raft::resource::get_cuda_stream(res);
+  cagra_graph = raft::make_host_matrix<IdxT, int64_t>(dev_graph.extent(0), dev_graph.extent(1));
+  raft::copy(cagra_graph.data_handle(),
+             dev_graph.data_handle(),
+             dev_graph.extent(0) * dev_graph.extent(1),
+             stream);
+  raft::resource::sync_stream(res);
+
   return cagra_graph;
 }
 
diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 5d43da851b..d5647dac00 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -1683,7 +1683,7 @@ void optimize(raft::resources const& res,
 
   // reverse graph creation will always use the GPU
   auto d_rev_graph = raft::make_device_mdarray<IdxT>(
-    res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
+    res, default_ws_mr, raft::make_extents<int64_t>(graph_size, output_graph_degree));
 
   // This should use the default workspace resource for random access / atomics
   auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(

From 1fc5acbc16294290cb6b637d9e712d723e1398e1 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Tue, 31 Mar 2026 15:55:29 +0900
Subject: [PATCH 088/119] Fix copyright

---
 cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh | 2 +-
 cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp      | 2 +-
 cpp/tests/neighbors/ann_utils.cuh                            | 2 +-
 cpp/tests/neighbors/vpq_utils.cuh                            | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 5de2478702..0cd3bd03a5 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
index 0f55b3efb2..06bcff2072 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
diff --git a/cpp/tests/neighbors/ann_utils.cuh b/cpp/tests/neighbors/ann_utils.cuh
index a5bb7c5268..dc4a335e3c 100644
--- a/cpp/tests/neighbors/ann_utils.cuh
+++ b/cpp/tests/neighbors/ann_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
diff --git a/cpp/tests/neighbors/vpq_utils.cuh b/cpp/tests/neighbors/vpq_utils.cuh
index 8ceb371413..35a14edc76 100644
--- a/cpp/tests/neighbors/vpq_utils.cuh
+++ b/cpp/tests/neighbors/vpq_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #include <cuvs/neighbors/common.hpp>

From 9fb763a2da2672e97dd8c2b85a8ec728e9236297 Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Wed, 8 Apr 2026 07:17:58 -0700
Subject: [PATCH 089/119] cmake fix

---
 cpp/cmake/modules/generate_jit_lto_kernels.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/cmake/modules/generate_jit_lto_kernels.cmake b/cpp/cmake/modules/generate_jit_lto_kernels.cmake
index e27f432b76..69356095c6 100644
--- a/cpp/cmake/modules/generate_jit_lto_kernels.cmake
+++ b/cpp/cmake/modules/generate_jit_lto_kernels.cmake
@@ -75,7 +75,7 @@ function(add_jit_lto_kernel kernel_target)
     OUTPUT "${_JIT_LTO_EMBEDDED_HEADER_FILE}"
     COMMAND "${bin_to_c}" --const --name embedded_fatbin --static $<TARGET_OBJECTS:${kernel_target}>
             > "${_JIT_LTO_EMBEDDED_HEADER_FILE}"
-    DEPENDS $<TARGET_OBJECTS:${kernel_target}>
+    DEPENDS ${kernel_target} $<TARGET_OBJECTS:${kernel_target}>
   )
 endfunction()
 

From c91011ff426a59a8a8c01ed89d826e176c1e9b1f Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Wed, 8 Apr 2026 08:56:17 -0700
Subject: [PATCH 090/119] Decoupled compression parameters used during
 iterative graph construction from the target compression

---
 .../src/cuvs/cuvs_ann_bench_param_parser.h    |  8 ++++
 cpp/include/cuvs/neighbors/cagra.hpp          | 21 ++++++++-
 cpp/include/cuvs/neighbors/common.hpp         | 12 +++++
 .../neighbors/detail/cagra/cagra_build.cuh    | 46 ++++++++++++++++---
 4 files changed, 79 insertions(+), 8 deletions(-)

diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
index 2eaf3123a0..9b6acaaf10 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
@@ -295,6 +295,7 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index
   nlohmann::json ivf_pq_search_conf = collect_conf_with_prefix(conf, "ivf_pq_search_");
   nlohmann::json nn_descent_conf    = collect_conf_with_prefix(conf, "nn_descent_");
   nlohmann::json ace_conf           = collect_conf_with_prefix(conf, "ace_");
+  nlohmann::json build_compression_conf  = collect_conf_with_prefix(conf, "build_compression_");
 
   // When graph_build_algo is not specified, leave graph_build_params as monostate so the
   // CAGRA build uses AUTO selection (NN_DESCENT or IVF_PQ based on dataset/heuristics).
@@ -325,6 +326,13 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index
       } else if constexpr (std::is_same_v<U,
                                           cuvs::neighbors::graph_build_params::nn_descent_params>) {
         parse_build_param<T, IdxT>(nn_descent_conf, arg);
+      } else if constexpr (std::is_same_v<
+                             U, cuvs::neighbors::graph_build_params::iterative_search_params>) {
+        if (!build_compression_conf.empty()) {
+          auto vpq_pams = arg.build_compression.value_or(cuvs::neighbors::vpq_params{});
+          parse_build_param(build_compression_conf, vpq_pams);
+          arg.build_compression.emplace(vpq_pams);
+        }
       }
     },
     params.graph_build_params);
diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index 2c32a6c08e..0b0fb90d0e 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -31,7 +31,26 @@
 #include <variant>
 
 namespace cuvs::neighbors::graph_build_params {
-using iterative_search_params = cuvs::neighbors::search_params;
+
+/**
+ * Parameters for the iterative CAGRA graph build algorithm.
+ *
+ * The iterative builder repeatedly runs CAGRA search() and optimize() to
+ * refine the graph.  When compression is used during graph construction,
+ * `build_compression` controls the VPQ parameters applied to the dataset
+ * *while building the graph*.  This is independent of `index_params::compression`,
+ * which controls the compression of the dataset stored in the final index.
+ */
+struct iterative_search_params : cuvs::neighbors::search_params {
+  /**
+   * Optional VPQ compression parameters used during iterative graph construction.
+   *
+   * When set, the dataset is compressed with these parameters for the
+   * search-and-optimize loop.  When std::nullopt (default), the builder
+   * falls back to `index_params::compression` (original behaviour).
+   */
+  std::optional<cuvs::neighbors::vpq_params> build_compression = std::nullopt;
+};
 
 /** Specialized parameters for ACE (Augmented Core Extraction) graph build */
 struct ace_params {
diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp
index c7111aaf4a..2c12f6ef34 100644
--- a/cpp/include/cuvs/neighbors/common.hpp
+++ b/cpp/include/cuvs/neighbors/common.hpp
@@ -94,6 +94,18 @@ struct vpq_params {
    * The max number of data points to use per VQ cluster during training.
    */
   uint32_t max_train_points_per_vq_cluster = 1024;
+
+  friend bool operator==(const vpq_params& a, const vpq_params& b)
+  {
+    return a.pq_bits == b.pq_bits && a.pq_dim == b.pq_dim && a.vq_n_centers == b.vq_n_centers &&
+           a.kmeans_n_iters == b.kmeans_n_iters &&
+           a.vq_kmeans_trainset_fraction == b.vq_kmeans_trainset_fraction &&
+           a.pq_kmeans_trainset_fraction == b.pq_kmeans_trainset_fraction &&
+           a.pq_kmeans_type == b.pq_kmeans_type &&
+           a.max_train_points_per_pq_code == b.max_train_points_per_pq_code &&
+           a.max_train_points_per_vq_cluster == b.max_train_points_per_vq_cluster;
+  }
+  friend bool operator!=(const vpq_params& a, const vpq_params& b) { return !(a == b); }
 };
 
 /** @} */  // end group cagra_cpp_index_params
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index eaa4411c52..13e91868d5 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -2085,6 +2085,34 @@ auto iterative_build_graph(
   size_t intermediate_degree = params.intermediate_graph_degree;
   size_t graph_degree        = params.graph_degree;
 
+  // Resolve the compression parameters for the build loop.
+  // `build_compression` (from iterative_search_params) takes priority;
+  // if unset, fall back to `params.compression` (original behaviour).
+  const auto& iter_params =
+    std::get<cagra::graph_build_params::iterative_search_params>(params.graph_build_params);
+  const auto& build_compression =
+    iter_params.build_compression.has_value() ? iter_params.build_compression : params.compression;
+
+  if (build_compression.has_value()) {
+    const auto& bc = *build_compression;
+    RAFT_LOG_INFO(
+      "Build compression params: pq_bits=%u, pq_dim=%u, vq_n_centers=%u, kmeans_n_iters=%u, "
+      "vq_kmeans_trainset_fraction=%.4f, pq_kmeans_trainset_fraction=%.4f, "
+      "max_train_points_per_pq_code=%u, max_train_points_per_vq_cluster=%u%s",
+      bc.pq_bits,
+      bc.pq_dim,
+      bc.vq_n_centers,
+      bc.kmeans_n_iters,
+      bc.vq_kmeans_trainset_fraction,
+      bc.pq_kmeans_trainset_fraction,
+      bc.max_train_points_per_pq_code,
+      bc.max_train_points_per_vq_cluster,
+      iter_params.build_compression.has_value() ? " (from build_compression)"
+                                                : " (from compression)");
+  } else {
+    RAFT_LOG_INFO("Build compression: disabled (uncompressed build)");
+  }
+
   auto cagra_graph = raft::make_host_matrix<IdxT, int64_t>(0, 0);
 
   // Iteratively improve the accuracy of the graph by repeatedly running
@@ -2164,7 +2192,7 @@ auto iterative_build_graph(
   // Generate the compressed index once if compression is enabled
   const uint64_t dataset_dim = dev_dataset.extent(1);
   std::optional<index<T, IdxT>> idx_opt;
-  if (params.compression.has_value()) {
+  if (build_compression.has_value()) {
     auto start = std::chrono::high_resolution_clock::now();
     RAFT_EXPECTS(params.metric == cuvs::distance::DistanceType::L2Expanded,
                  "VPQ compression is only supported with L2Expanded distance mertric");
@@ -2173,7 +2201,7 @@ auto iterative_build_graph(
       res,
       // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later
       cuvs::preprocessing::quantize::pq::vpq_build(
-        res, *params.compression, dev_dataset));
+        res, *build_compression, dev_dataset));
     auto end        = std::chrono::high_resolution_clock::now();
     auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
     RAFT_LOG_INFO("# VPQ compression time: %.3lf sec", (double)elapsed_ms / 1000);
@@ -2219,7 +2247,7 @@ auto iterative_build_graph(
     search_params.search_width   = 1;
 
     // Create index and query views.
-    if (!params.compression.has_value()) {
+    if (!build_compression.has_value()) {
       auto dev_dataset_view = raft::make_device_matrix_view<const T, int64_t>(
         dev_dataset.data_handle(), (int64_t)curr_graph_size, dev_dataset.extent(1));
       if (use_device_graph) {
@@ -2241,17 +2269,17 @@ auto iterative_build_graph(
     // When compression is enabled, reconstruct queries from VPQ codes instead of
     // reading from the (freed) original dataset.
     auto dev_reconstructed_queries =
-      params.compression.has_value()
+      build_compression.has_value()
         ? raft::make_device_matrix<T, int64_t>(res, curr_query_size, dataset_dim)
         : raft::make_device_matrix<T, int64_t>(res, 0, 0);
-    if (params.compression.has_value()) {
+    if (build_compression.has_value()) {
       auto* vpq_dset =
         dynamic_cast<const vpq_dataset<half, int64_t>*>(&idx.data());
       RAFT_EXPECTS(vpq_dset != nullptr, "Expected VPQ dataset in compressed index");
       reconstruct_vpq_queries<T, half, int64_t>(
         res, *vpq_dset, 0, curr_query_size, dev_reconstructed_queries.view());
     }
-    auto dev_query_view = params.compression.has_value()
+    auto dev_query_view = build_compression.has_value()
       ? raft::make_device_matrix_view<const T, int64_t>(
           dev_reconstructed_queries.data_handle(), (int64_t)curr_query_size, dataset_dim)
       : raft::make_device_matrix_view<const T, int64_t>(
@@ -2284,6 +2312,9 @@ auto iterative_build_graph(
     curr_graph_size      = next_graph_size;
   }
 
+  // TODO: when build_compression matches params.compression, the dataset is compressed twice
+  // (once for the build loop and once in build()'s shared tail). We could avoid this by returning
+  // the index directly (with its VPQ dataset and device-side graph) instead of just the host graph.
   auto stream = raft::resource::get_cuda_stream(res);
   cagra_graph = raft::make_host_matrix<IdxT, int64_t>(dev_graph.extent(0), dev_graph.extent(1));
   raft::copy(cagra_graph.data_handle(),
@@ -2384,7 +2415,8 @@ index<T, IdxT> build(
       }
       if (nn_descent_params.graph_degree != intermediate_degree) {
         RAFT_LOG_WARN(
-          "Graph degree (%lu) for nn-descent needs to match cagra intermediate graph degree (%lu), "
+          "Graph degree (%lu) for nn-descent needs to match cagra intermediate graph degree "
+          "(%lu), "
           "aligning "
           "nn-descent graph_degree.",
           nn_descent_params.graph_degree,

From fbba6b259e37bf3b3d47255cb94c13422863dddf Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 8 Apr 2026 18:28:38 +0900
Subject: [PATCH 091/119] Add enable_fp8

---
 cpp/CMakeLists.txt                                          | 6 +++---
 .../neighbors/detail/cagra/compute_distance_vpq_inst.cu.in  | 4 +++-
 .../neighbors/detail/cagra/compute_distance_vpq_matrix.json | 3 ++-
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 72a4d65a5a..aa2b92d825 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -268,12 +268,12 @@ if(NOT BUILD_CPU_ONLY)
     INPUT_FILE
       "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in"
     OUTPUT_FILE_FORMAT
-      "${CMAKE_CURRENT_BINARY_DIR}/src/neighbors/detail/cagra/compute_distance_vpq_inst_data_@data_abbrev@_index_@index_abbrev@_distance_@distance_abbrev@_codebook_@codebook_abbrev@_metric_@metric@_team_@team_size@_dim_@dim@_pq_bits_@pq_bits@_pq_len_@pq_len@.cu"
+      "${CMAKE_CURRENT_BINARY_DIR}/src/neighbors/detail/cagra/compute_distance_vpq_inst_data_@data_abbrev@_index_@index_abbrev@_distance_@distance_abbrev@_codebook_@codebook_abbrev@_metric_@metric@_team_@team_size@_dim_@dim@_pq_bits_@pq_bits@_pq_len_@pq_len@_fp8@enable_fp8@.cu"
   )
   generate_string_matrix(
     cagra_compute_distance_vpq_selector_template_params
     ITEM_FORMAT
-    "\nvpq_descriptor_spec<DistanceType::@metric@, @team_size@, @dim@, @pq_bits@, @pq_len@, @codebook_type@, @data_type@, @index_type@, @distance_type@>"
+    "\nvpq_descriptor_spec<DistanceType::@metric@, @team_size@, @dim@, @pq_bits@, @pq_len@, @codebook_type@, @data_type@, @index_type@, @distance_type@, @enable_fp8@>"
     GLUE
     ","
     MATRIX_JSON_FILE
@@ -282,7 +282,7 @@ if(NOT BUILD_CPU_ONLY)
   generate_string_matrix(
     cagra_compute_distance_vpq_template_inst
     ITEM_FORMAT
-    "extern template struct vpq_descriptor_spec<DistanceType::@metric@, @team_size@, @dim@, @pq_bits@, @pq_len@, @codebook_type@, @data_type@, @index_type@, @distance_type@>@semicolon@"
+    "extern template struct vpq_descriptor_spec<DistanceType::@metric@, @team_size@, @dim@, @pq_bits@, @pq_len@, @codebook_type@, @data_type@, @index_type@, @distance_type@, @enable_fp8@>@semicolon@"
     GLUE
     "\n"
     MATRIX_JSON_FILE
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in
index c159da3229..676f25c9fd 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in
@@ -13,6 +13,7 @@ constexpr uint32_t team_size = @team_size@;
 constexpr uint32_t dim       = @dim@;
 constexpr uint32_t pq_bits   = @pq_bits@;
 constexpr uint32_t pq_len    = @pq_len@;
+constexpr bool enable_fp8    = @enable_fp8@;
 using codebook_t             = @codebook_type@;
 using data_t                 = @data_type@;
 using index_t                = @index_type@;
@@ -30,6 +31,7 @@ template struct vpq_descriptor_spec<metric,
                                     codebook_t,
                                     data_t,
                                     index_t,
-                                    distance_t>;
+                                    distance_t,
+                                    enable_fp8>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json
index cf6e060d33..76c0f888eb 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json
@@ -55,5 +55,6 @@
   ],
   "pq_bits": ["8"],
   "pq_len": ["2", "4"],
-  "metric": ["L2Expanded"]
+  "metric": ["L2Expanded"],
+  "enable_fp8": ["true", "false"]
 }

From 7ed0fe7e9a15ddc826940c146da8b93791188de1 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Thu, 9 Apr 2026 12:41:46 +0900
Subject: [PATCH 092/119] Fix smem_dtype validation

---
 cpp/src/neighbors/detail/cagra/cagra_search.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
index f49e1a4d4c..7c6687efbb 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
@@ -152,7 +152,7 @@ void search_main(raft::resources const& res,
   // Dispatch search parameters based on the dataset kind.
   if (auto* strided_dset = dynamic_cast<const strided_dataset<T, ds_idx_type>*>(&index.data());
       strided_dset != nullptr) {
-    if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::AUTO ||
+    if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::AUTO &&
         params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::F16) {
       RAFT_LOG_WARN("In this search mode, only AUTO or F16 are supported as the smem_dtype.");
     }

From b7f52101a10ed5a1bbf97dab370aeb8e666c6282 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Thu, 9 Apr 2026 12:44:58 +0900
Subject: [PATCH 093/119] Reset unsupported params.smem_dtype to AUTO

---
 cpp/src/neighbors/detail/cagra/cagra_search.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
index 7c6687efbb..1445676631 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
@@ -154,7 +154,8 @@ void search_main(raft::resources const& res,
       strided_dset != nullptr) {
     if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::AUTO &&
         params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::F16) {
-      RAFT_LOG_WARN("In this search mode, only AUTO or F16 are supported as the smem_dtype.");
+      RAFT_LOG_WARN("In this search mode, smem_dtype supports only AUTO or F16. Set it to AUTO.");
+      params.smem_dtype = cuvs::neighbors::cagra::internal_dtype::AUTO;
     }
     // Search using a plain (strided) row-major dataset
     RAFT_EXPECTS(index.metric() != cuvs::distance::DistanceType::CosineExpanded ||

From beb0a47fffa4722b76f5d86ddabdad7ba716f864 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Thu, 9 Apr 2026 13:51:42 +0900
Subject: [PATCH 094/119] Fix CAGRA VPQ instance list

---
 .../cagra/compute_distance_vpq_matrix.json    | 55 +++++++++++++++++--
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json
index 76c0f888eb..06aea3f10d 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json
@@ -34,17 +34,65 @@
     }
   ],
   "_mxdim_team": [
+    {
+      "dim": "64",
+      "team_size": "4",
+      "pq_len": "2"
+    },
+    {
+      "dim": "128",
+      "team_size": "8",
+      "pq_len": "2"
+    },
+    {
+      "dim": "256",
+      "team_size": "16",
+      "pq_len": "2"
+    },
+    {
+      "dim": "512",
+      "team_size": "32",
+      "pq_len": "2"
+    },
+    {
+      "dim": "64",
+      "team_size": "4",
+      "pq_len": "4"
+    },
+    {
+      "dim": "128",
+      "team_size": "8",
+      "pq_len": "4"
+    },
+    {
+      "dim": "256",
+      "team_size": "16",
+      "pq_len": "4"
+    },
+    {
+      "dim": "512",
+      "team_size": "32",
+      "pq_len": "4"
+    },
     {
       "dim": "128",
-      "team_size": "8"
+      "team_size": "4",
+      "pq_len": "8"
     },
     {
       "dim": "256",
-      "team_size": "16"
+      "team_size": "8",
+      "pq_len": "8"
     },
     {
       "dim": "512",
-      "team_size": "32"
+      "team_size": "16",
+      "pq_len": "8"
+    },
+    {
+      "dim": "1024",
+      "team_size": "32",
+      "pq_len": "8"
     }
   ],
   "_codebook": [
@@ -54,7 +102,6 @@
     }
   ],
   "pq_bits": ["8"],
-  "pq_len": ["2", "4"],
   "metric": ["L2Expanded"],
   "enable_fp8": ["true", "false"]
 }

From d430c39a15eb9c0eda4b6a2d536b83c029d8fdfd Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Thu, 9 Apr 2026 00:50:47 -0700
Subject: [PATCH 095/119] Fix the spurious "only AUTO or F16 are supported as
 the smem_dtype" warning

---
 cpp/src/neighbors/detail/cagra/cagra_build.cuh  | 3 +--
 cpp/src/neighbors/detail/cagra/cagra_search.cuh | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 13e91868d5..1866a0ab7d 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -2415,8 +2415,7 @@ index<T, IdxT> build(
       }
       if (nn_descent_params.graph_degree != intermediate_degree) {
         RAFT_LOG_WARN(
-          "Graph degree (%lu) for nn-descent needs to match cagra intermediate graph degree "
-          "(%lu), "
+          "Graph degree (%lu) for nn-descent needs to match cagra intermediate graph degree (%lu), "
           "aligning "
           "nn-descent graph_degree.",
           nn_descent_params.graph_degree,
diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
index 3917b3160b..63743a8197 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
@@ -154,7 +154,7 @@ void search_main(raft::resources const& res,
   // Dispatch search parameters based on the dataset kind.
   if (auto* strided_dset = dynamic_cast<const strided_dataset<T, ds_idx_type>*>(&index.data());
       strided_dset != nullptr) {
-    if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::AUTO ||
+    if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::AUTO &&
         params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::F16) {
       RAFT_LOG_WARN("In this search mode, only AUTO or F16 are supported as the smem_dtype.");
     }

From 471069b221adc284ec6f91c0ef9b77b85aaea5f4 Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Thu, 9 Apr 2026 01:04:28 -0700
Subject: [PATCH 096/119] Fix structured binding mismatch in calc_recall and
 add explicit return types

Made-with: Cursor
---
 cpp/tests/neighbors/ann_utils.cuh | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/cpp/tests/neighbors/ann_utils.cuh b/cpp/tests/neighbors/ann_utils.cuh
index dc4a335e3c..5fd010806e 100644
--- a/cpp/tests/neighbors/ann_utils.cuh
+++ b/cpp/tests/neighbors/ann_utils.cuh
@@ -128,10 +128,10 @@ struct idx_dist_pair {
 /** Calculate recall value using only neighbor indices
  */
 template <typename T>
-auto calc_recall(const std::vector<T>& expected_idx,
-                 const std::vector<T>& actual_idx,
-                 size_t rows,
-                 size_t cols)
+std::tuple<double, size_t, size_t> calc_recall(const std::vector<T>& expected_idx,
+                                               const std::vector<T>& actual_idx,
+                                               size_t rows,
+                                               size_t cols)
 {
   size_t match_count = 0;
   size_t total_count = static_cast<size_t>(rows) * static_cast<size_t>(cols);
@@ -196,7 +196,7 @@ auto eval_recall(const std::vector<T>& expected_idx,
                  double min_recall,
                  bool test_unique = true) -> testing::AssertionResult
 {
-  auto [actual_recall, index_based_actual_recall, match_count, total_count] =
+  auto [actual_recall, match_count, total_count] =
     calc_recall(expected_idx, actual_idx, rows, cols);
   double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps);
   RAFT_LOG_INFO("Recall = %f (%zu/%zu), the error is %2.1f%% %s the threshold (eps = %f).",
@@ -220,13 +220,13 @@ auto eval_recall(const std::vector<T>& expected_idx,
 /** Overload of calc_recall to account for distances
  */
 template <typename T, typename DistT>
-auto calc_recall(const std::vector<T>& expected_idx,
-                 const std::vector<T>& actual_idx,
-                 const std::vector<DistT>& expected_dist,
-                 const std::vector<DistT>& actual_dist,
-                 size_t rows,
-                 size_t cols,
-                 double eps)
+std::tuple<double, double, size_t, size_t> calc_recall(const std::vector<T>& expected_idx,
+                                                       const std::vector<T>& actual_idx,
+                                                       const std::vector<DistT>& expected_dist,
+                                                       const std::vector<DistT>& actual_dist,
+                                                       size_t rows,
+                                                       size_t cols,
+                                                       double eps)
 {
   size_t match_count       = 0;
   size_t index_match_count = 0;

From 648ade4f55713af43928bb021bbba6ec96302f2b Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Thu, 9 Apr 2026 02:09:29 -0700
Subject: [PATCH 097/119] Search parameters used during iterative cagra graph
 construction are now also configurable

---
 .../src/cuvs/cuvs_ann_bench_param_parser.h    |  62 +++++
 cpp/include/cuvs/neighbors/cagra.hpp          | 220 +++++++++---------
 .../neighbors/detail/cagra/cagra_build.cuh    |  12 +-
 3 files changed, 184 insertions(+), 110 deletions(-)

diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
index 9b6acaaf10..137732ca92 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
@@ -296,6 +296,7 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index
   nlohmann::json nn_descent_conf    = collect_conf_with_prefix(conf, "nn_descent_");
   nlohmann::json ace_conf           = collect_conf_with_prefix(conf, "ace_");
   nlohmann::json build_compression_conf  = collect_conf_with_prefix(conf, "build_compression_");
+  nlohmann::json build_search_conf       = collect_conf_with_prefix(conf, "build_search_");
 
   // When graph_build_algo is not specified, leave graph_build_params as monostate so the
   // CAGRA build uses AUTO selection (NN_DESCENT or IVF_PQ based on dataset/heuristics).
@@ -333,6 +334,67 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index
           parse_build_param(build_compression_conf, vpq_pams);
           arg.build_compression.emplace(vpq_pams);
         }
+        if (build_search_conf.contains("width")) {
+          arg.search_width = build_search_conf.at("width");
+        }
+        if (build_search_conf.contains("max_iterations")) {
+          arg.max_iterations = build_search_conf.at("max_iterations");
+        }
+        if (build_search_conf.contains("min_iterations")) {
+          arg.min_iterations = build_search_conf.at("min_iterations");
+        }
+        if (build_search_conf.contains("itopk")) {
+          arg.itopk_size = build_search_conf.at("itopk");
+        }
+        if (build_search_conf.contains("max_queries")) {
+          arg.max_queries = build_search_conf.at("max_queries");
+        }
+        if (build_search_conf.contains("team_size")) {
+          arg.team_size = build_search_conf.at("team_size");
+        }
+        if (build_search_conf.contains("thread_block_size")) {
+          arg.thread_block_size = build_search_conf.at("thread_block_size");
+        }
+        if (build_search_conf.contains("hashmap_min_bitlen")) {
+          arg.hashmap_min_bitlen = build_search_conf.at("hashmap_min_bitlen");
+        }
+        if (build_search_conf.contains("hashmap_max_fill_rate")) {
+          arg.hashmap_max_fill_rate = build_search_conf.at("hashmap_max_fill_rate");
+        }
+        if (build_search_conf.contains("num_random_samplings")) {
+          arg.num_random_samplings = build_search_conf.at("num_random_samplings");
+        }
+        if (build_search_conf.contains("persistent")) {
+          arg.persistent = build_search_conf.at("persistent");
+        }
+        if (build_search_conf.contains("persistent_lifetime")) {
+          arg.persistent_lifetime = build_search_conf.at("persistent_lifetime");
+        }
+        if (build_search_conf.contains("persistent_device_usage")) {
+          arg.persistent_device_usage = build_search_conf.at("persistent_device_usage");
+        }
+        if (build_search_conf.contains("algo")) {
+          std::string algo = build_search_conf.at("algo");
+          if (algo == "single_cta") {
+            arg.algo = cuvs::neighbors::cagra::search_algo::SINGLE_CTA;
+          } else if (algo == "multi_cta") {
+            arg.algo = cuvs::neighbors::cagra::search_algo::MULTI_CTA;
+          } else if (algo == "multi_kernel") {
+            arg.algo = cuvs::neighbors::cagra::search_algo::MULTI_KERNEL;
+          } else if (algo == "auto") {
+            arg.algo = cuvs::neighbors::cagra::search_algo::AUTO;
+          }
+        }
+        if (build_search_conf.contains("hashmap_mode")) {
+          std::string mode = build_search_conf.at("hashmap_mode");
+          if (mode == "hash") {
+            arg.hashmap_mode = cuvs::neighbors::cagra::hash_mode::HASH;
+          } else if (mode == "small") {
+            arg.hashmap_mode = cuvs::neighbors::cagra::hash_mode::SMALL;
+          } else if (mode == "auto") {
+            arg.hashmap_mode = cuvs::neighbors::cagra::hash_mode::AUTO;
+          }
+        }
       }
     },
     params.graph_build_params);
diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index 0b0fb90d0e..d7f8f54afc 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -30,18 +30,125 @@
 #include <string>
 #include <variant>
 
+namespace cuvs::neighbors::cagra {
+
+/**
+ * @defgroup cagra_cpp_search_params CAGRA index search parameters
+ * @{
+ */
+
+enum class search_algo {
+  /** For large batch sizes. */
+  SINGLE_CTA = 0,
+  /** For small batch sizes. */
+  MULTI_CTA    = 1,
+  MULTI_KERNEL = 2,
+  AUTO         = 100
+};
+
+enum class hash_mode { HASH = 0, SMALL = 1, AUTO = 100 };
+
+enum class internal_dtype { F16 = 0, E5M2 = 1, AUTO = 100 };
+
+struct search_params : cuvs::neighbors::search_params {
+  /** Maximum number of queries to search at the same time (batch size). Auto select when 0.*/
+  size_t max_queries = 0;
+
+  /** Number of intermediate search results retained during the search.
+   *
+   *  This is the main knob to adjust trade off between accuracy and search speed.
+   *  Higher values improve the search accuracy.
+   */
+  size_t itopk_size = 64;
+
+  /** Upper limit of search iterations. Auto select when 0.*/
+  size_t max_iterations = 0;
+
+  // In the following we list additional search parameters for fine tuning.
+  // Reasonable default values are automatically chosen.
+
+  /** Which search implementation to use. */
+  search_algo algo = search_algo::AUTO;
+
+  /** Number of threads used to calculate a single distance. 4, 8, 16, or 32. */
+  size_t team_size = 0;
+
+  /** Number of graph nodes to select as the starting point for the search in each iteration
+   * (a.k.a. the search width). */
+  size_t search_width = 1;
+  /** Lower limit of search iterations. */
+  size_t min_iterations = 0;
+
+  /** Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0. */
+  size_t thread_block_size = 0;
+  /** Hashmap type. Auto selection when AUTO. */
+  hash_mode hashmap_mode = hash_mode::AUTO;
+  /** Lower limit of hashmap bit length. More than 8. */
+  size_t hashmap_min_bitlen = 0;
+  /** Upper limit of hashmap fill rate. More than 0.1, less than 0.9.*/
+  float hashmap_max_fill_rate = 0.5;
+
+  /** Number of iterations of initial random seed node selection. 1 or more. */
+  uint32_t num_random_samplings = 1;
+  /** Bit mask used for initial random seed node selection. */
+  uint64_t rand_xor_mask = 0x128394;
+
+  /** Whether to use the persistent version of the kernel (only SINGLE_CTA is supported a.t.m.) */
+  bool persistent = false;
+  /** Persistent kernel: time in seconds before the kernel stops if no requests received. */
+  float persistent_lifetime = 2;
+  /**
+   * Set the fraction of maximum grid size used by persistent kernel.
+   * Value 1.0 means the kernel grid size is maximum possible for the selected device.
+   * The value must be greater than 0.0 and not greater than 1.0.
+   *
+   * One may need to run other kernels alongside this persistent kernel. This parameter can
+   * be used to reduce the grid size of the persistent kernel to leave a few SMs idle.
+   * Note: running any other work on the GPU alongside the persistent kernel makes the setup
+   * fragile.
+   *   - Running another kernel in another thread usually works, but no progress guaranteed
+   *   - Any CUDA allocations block the context (this issue may be obscured by using pools)
+   *   - Memory copies to not-pinned host memory may block the context
+   *
+   * Even when we know there are no other kernels working at the same time, setting
+   * persistent_device_usage to 1.0 surprisingly sometimes hurts performance. Proceed with care.
+   * If you suspect this is an issue, you can reduce this number to ~0.9 without a significant
+   * impact on the throughput.
+   */
+  float persistent_device_usage = 1.0;
+
+  /**
+   * A parameter indicating the rate of nodes to be filtered-out, when filtering is used.
+   * The value must be equal to or greater than 0.0 and less than 1.0. Default value is
+   * negative, in which case the filtering rate is automatically calculated.
+   */
+  float filtering_rate = -1.0;
+
+  /** Data type of the query vector and codebook table on shared memory. Currently, only VPQ
+   * supports FP8. **/
+  internal_dtype smem_dtype = internal_dtype::AUTO;
+};
+
+/** @} */
+
+}  // namespace cuvs::neighbors::cagra
+
 namespace cuvs::neighbors::graph_build_params {
 
 /**
  * Parameters for the iterative CAGRA graph build algorithm.
  *
- * The iterative builder repeatedly runs CAGRA search() and optimize() to
- * refine the graph.  When compression is used during graph construction,
+ * Inherits from cagra::search_params so that all search tuning knobs
+ * (search_width, max_iterations, itopk_size, etc.) are available for
+ * controlling the search-and-optimize loop during graph construction.
+ * The defaults are tuned for the build loop (e.g. search_width=1,
+ * max_iterations=8) and may differ from the regular search defaults.
+ *
  * `build_compression` controls the VPQ parameters applied to the dataset
  * *while building the graph*.  This is independent of `index_params::compression`,
  * which controls the compression of the dataset stored in the final index.
  */
-struct iterative_search_params : cuvs::neighbors::search_params {
+struct iterative_search_params : cuvs::neighbors::cagra::search_params {
   /**
    * Optional VPQ compression parameters used during iterative graph construction.
    *
@@ -50,6 +157,12 @@ struct iterative_search_params : cuvs::neighbors::search_params {
    * falls back to `index_params::compression` (original behaviour).
    */
   std::optional<cuvs::neighbors::vpq_params> build_compression = std::nullopt;
+
+  iterative_search_params()
+  {
+    this->search_width   = 1;
+    this->max_iterations = 8;
+  }
 };
 
 /** Specialized parameters for ACE (Augmented Core Extraction) graph build */
@@ -277,107 +390,6 @@ struct index_params : cuvs::neighbors::index_params {
     cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded);
 };
 
-/**
- * @}
- */
-
-/**
- * @defgroup cagra_cpp_search_params CAGRA index search parameters
- * @{
- */
-
-enum class search_algo {
-  /** For large batch sizes. */
-  SINGLE_CTA = 0,
-  /** For small batch sizes. */
-  MULTI_CTA    = 1,
-  MULTI_KERNEL = 2,
-  AUTO         = 100
-};
-
-enum class hash_mode { HASH = 0, SMALL = 1, AUTO = 100 };
-
-enum class internal_dtype { F16 = 0, E5M2 = 1, AUTO = 100 };
-
-struct search_params : cuvs::neighbors::search_params {
-  /** Maximum number of queries to search at the same time (batch size). Auto select when 0.*/
-  size_t max_queries = 0;
-
-  /** Number of intermediate search results retained during the search.
-   *
-   *  This is the main knob to adjust trade off between accuracy and search speed.
-   *  Higher values improve the search accuracy.
-   */
-  size_t itopk_size = 64;
-
-  /** Upper limit of search iterations. Auto select when 0.*/
-  size_t max_iterations = 0;
-
-  // In the following we list additional search parameters for fine tuning.
-  // Reasonable default values are automatically chosen.
-
-  /** Which search implementation to use. */
-  search_algo algo = search_algo::AUTO;
-
-  /** Number of threads used to calculate a single distance. 4, 8, 16, or 32. */
-  size_t team_size = 0;
-
-  /** Number of graph nodes to select as the starting point for the search in each iteration. aka
-   * search width?*/
-  size_t search_width = 1;
-  /** Lower limit of search iterations. */
-  size_t min_iterations = 0;
-
-  /** Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0. */
-  size_t thread_block_size = 0;
-  /** Hashmap type. Auto selection when AUTO. */
-  hash_mode hashmap_mode = hash_mode::AUTO;
-  /** Lower limit of hashmap bit length. More than 8. */
-  size_t hashmap_min_bitlen = 0;
-  /** Upper limit of hashmap fill rate. More than 0.1, less than 0.9.*/
-  float hashmap_max_fill_rate = 0.5;
-
-  /** Number of iterations of initial random seed node selection. 1 or more. */
-  uint32_t num_random_samplings = 1;
-  /** Bit mask used for initial random seed node selection. */
-  uint64_t rand_xor_mask = 0x128394;
-
-  /** Whether to use the persistent version of the kernel (only SINGLE_CTA is supported a.t.m.) */
-  bool persistent = false;
-  /** Persistent kernel: time in seconds before the kernel stops if no requests received. */
-  float persistent_lifetime = 2;
-  /**
-   * Set the fraction of maximum grid size used by persistent kernel.
-   * Value 1.0 means the kernel grid size is maximum possible for the selected device.
-   * The value must be greater than 0.0 and not greater than 1.0.
-   *
-   * One may need to run other kernels alongside this persistent kernel. This parameter can
-   * be used to reduce the grid size of the persistent kernel to leave a few SMs idle.
-   * Note: running any other work on GPU alongside with the persistent kernel makes the setup
-   * fragile.
-   *   - Running another kernel in another thread usually works, but no progress guaranteed
-   *   - Any CUDA allocations block the context (this issue may be obscured by using pools)
-   *   - Memory copies to not-pinned host memory may block the context
-   *
-   * Even when we know there are no other kernels working at the same time, setting
-   * kDeviceUsage to 1.0 surprisingly sometimes hurts performance. Proceed with care.
-   * If you suspect this is an issue, you can reduce this number to ~0.9 without a significant
-   * impact on the throughput.
-   */
-  float persistent_device_usage = 1.0;
-
-  /**
-   * A parameter indicating the rate of nodes to be filtered-out, when filtering is used.
-   * The value must be equal to or greater than 0.0 and less than 1.0. Default value is
-   * negative, in which case the filtering rate is automatically calculated.
-   */
-  float filtering_rate = -1.0;
-
-  /** Data type of the query vector and codebook table on shared memory. Currently, only VPQ
-   * supports FP8. **/
-  internal_dtype smem_dtype = internal_dtype::AUTO;
-};
-
 /**
  * @}
  */
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 1866a0ab7d..03bbd45dae 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -2112,6 +2112,9 @@ auto iterative_build_graph(
   } else {
     RAFT_LOG_INFO("Build compression: disabled (uncompressed build)");
   }
+  RAFT_LOG_INFO("Build search params: search_width=%zu, max_iterations=%zu",
+                iter_params.search_width,
+                iter_params.max_iterations);
 
   auto cagra_graph = raft::make_host_matrix<IdxT, int64_t>(0, 0);
 
@@ -2239,12 +2242,9 @@ auto iterative_build_graph(
       (uint64_t)curr_itopk_size,
       (uint64_t)curr_topk);
 
-    cuvs::neighbors::cagra::search_params search_params;
-    search_params.algo           = cuvs::neighbors::cagra::search_algo::AUTO;
-    search_params.max_queries    = max_chunk_size;
-    search_params.itopk_size     = curr_itopk_size;
-    search_params.max_iterations = 8;
-    search_params.search_width   = 1;
+    cuvs::neighbors::cagra::search_params search_params = iter_params;
+    search_params.max_queries = max_chunk_size;
+    search_params.itopk_size  = curr_itopk_size;
 
     // Create index and query views.
     if (!build_compression.has_value()) {

From d780bc7d9618b66c2e3d5b16d385db217ba7c3e7 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Thu, 9 Apr 2026 22:34:11 +0900
Subject: [PATCH 098/119] Remove unnecessary files

---
 .../detail/cagra/compute_distance.cu          | 386 ++++++++++++++++++
 1 file changed, 386 insertions(+)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance.cu

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu
new file mode 100644
index 0000000000..4e1625fdd7
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu
@@ -0,0 +1,386 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance-ext.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+
+template struct instance_selector<
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L1, 8, 128, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::CosineExpanded, 16, 256, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L1, 16, 256, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L1, 32, 512, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, float, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, float, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      float,
+                      uint32_t,
+                      float,
+                      false>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L1, 8, 128, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::CosineExpanded, 16, 256, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L1, 16, 256, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L1, 32, 512, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, half, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, half, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, half, uint32_t, float, false>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L1, 8, 128, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::CosineExpanded, 16, 256, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L1, 16, 256, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L1, 32, 512, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      2,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      2,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      4,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      4,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, int8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, int8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      512,
+                      8,
+                      8,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      int8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L1, 8, 128, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::CosineExpanded, 16, 256, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L1, 16, 256, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L1, 32, 512, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, uint8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      8,
+                      128,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      2,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, uint8_t, uint32_t, float, false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      8,
+                      128,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      256,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      512,
+                      8,
+                      4,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      4,
+                      128,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, uint8_t, uint32_t, float, true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      8,
+                      256,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      512,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      16,
+                      512,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      true>,
+  vpq_descriptor_spec<DistanceType::L2Expanded,
+                      32,
+                      1024,
+                      8,
+                      8,
+                      half,
+                      uint8_t,
+                      uint32_t,
+                      float,
+                      false>,
+  standard_descriptor_spec<DistanceType::BitwiseHamming, 8, 128, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::BitwiseHamming, 16, 256, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::BitwiseHamming, 32, 512, uint8_t, uint32_t, float>>;
+
+}  // namespace cuvs::neighbors::cagra::detail

From 9d19f74f227ada9c48579586f8d8ab3cc94fd968 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Thu, 9 Apr 2026 22:34:54 +0900
Subject: [PATCH 099/119] Remove unnecessary files (2)

---
 .../detail/cagra/compute_distance.cu          | 386 ------------------
 1 file changed, 386 deletions(-)
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance.cu

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu
deleted file mode 100644
index 4e1625fdd7..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance.cu
+++ /dev/null
@@ -1,386 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance-ext.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-
-template struct instance_selector<
-  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, float, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, float, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, float, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L1, 8, 128, float, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, float, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, float, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::CosineExpanded, 16, 256, float, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L1, 16, 256, float, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, float, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L1, 32, 512, float, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, float, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, float, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, float, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, float, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, float, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, float, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, float, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, float, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, float, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, float, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, float, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      32,
-                      1024,
-                      8,
-                      8,
-                      half,
-                      float,
-                      uint32_t,
-                      float,
-                      false>,
-  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, half, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L1, 8, 128, half, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, half, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, half, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::CosineExpanded, 16, 256, half, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L1, 16, 256, half, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, half, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L1, 32, 512, half, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, half, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, half, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, half, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, half, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, half, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, half, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, half, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, half, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, half, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, half, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, half, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 1024, 8, 8, half, half, uint32_t, float, false>,
-  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, int8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L1, 8, 128, int8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, int8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, int8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::CosineExpanded, 16, 256, int8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L1, 16, 256, int8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, int8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L1, 32, 512, int8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, int8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, int8_t, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      16,
-                      256,
-                      8,
-                      2,
-                      half,
-                      int8_t,
-                      uint32_t,
-                      float,
-                      false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      32,
-                      512,
-                      8,
-                      2,
-                      half,
-                      int8_t,
-                      uint32_t,
-                      float,
-                      false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, int8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, int8_t, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      16,
-                      256,
-                      8,
-                      4,
-                      half,
-                      int8_t,
-                      uint32_t,
-                      float,
-                      false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      32,
-                      512,
-                      8,
-                      4,
-                      half,
-                      int8_t,
-                      uint32_t,
-                      float,
-                      false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, int8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, int8_t, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, int8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, int8_t, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 512, 8, 8, half, int8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      16,
-                      512,
-                      8,
-                      8,
-                      half,
-                      int8_t,
-                      uint32_t,
-                      float,
-                      false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      32,
-                      1024,
-                      8,
-                      8,
-                      half,
-                      int8_t,
-                      uint32_t,
-                      float,
-                      true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      32,
-                      1024,
-                      8,
-                      8,
-                      half,
-                      int8_t,
-                      uint32_t,
-                      float,
-                      false>,
-  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::CosineExpanded, 8, 128, uint8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L1, 8, 128, uint8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, uint8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, uint8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::CosineExpanded, 16, 256, uint8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L1, 16, 256, uint8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::CosineExpanded, 32, 512, uint8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::L1, 32, 512, uint8_t, uint32_t, float>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, uint8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 2, half, uint8_t, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      8,
-                      128,
-                      8,
-                      2,
-                      half,
-                      uint8_t,
-                      uint32_t,
-                      float,
-                      false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      16,
-                      256,
-                      8,
-                      2,
-                      half,
-                      uint8_t,
-                      uint32_t,
-                      float,
-                      true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      16,
-                      256,
-                      8,
-                      2,
-                      half,
-                      uint8_t,
-                      uint32_t,
-                      float,
-                      false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      32,
-                      512,
-                      8,
-                      2,
-                      half,
-                      uint8_t,
-                      uint32_t,
-                      float,
-                      true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      32,
-                      512,
-                      8,
-                      2,
-                      half,
-                      uint8_t,
-                      uint32_t,
-                      float,
-                      false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, uint8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 64, 8, 4, half, uint8_t, uint32_t, float, false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      8,
-                      128,
-                      8,
-                      4,
-                      half,
-                      uint8_t,
-                      uint32_t,
-                      float,
-                      false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      16,
-                      256,
-                      8,
-                      4,
-                      half,
-                      uint8_t,
-                      uint32_t,
-                      float,
-                      true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      16,
-                      256,
-                      8,
-                      4,
-                      half,
-                      uint8_t,
-                      uint32_t,
-                      float,
-                      false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      32,
-                      512,
-                      8,
-                      4,
-                      half,
-                      uint8_t,
-                      uint32_t,
-                      float,
-                      true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      32,
-                      512,
-                      8,
-                      4,
-                      half,
-                      uint8_t,
-                      uint32_t,
-                      float,
-                      false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 4, 128, 8, 8, half, uint8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      4,
-                      128,
-                      8,
-                      8,
-                      half,
-                      uint8_t,
-                      uint32_t,
-                      float,
-                      false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 256, 8, 8, half, uint8_t, uint32_t, float, true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      8,
-                      256,
-                      8,
-                      8,
-                      half,
-                      uint8_t,
-                      uint32_t,
-                      float,
-                      false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      16,
-                      512,
-                      8,
-                      8,
-                      half,
-                      uint8_t,
-                      uint32_t,
-                      float,
-                      true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      16,
-                      512,
-                      8,
-                      8,
-                      half,
-                      uint8_t,
-                      uint32_t,
-                      float,
-                      false>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      32,
-                      1024,
-                      8,
-                      8,
-                      half,
-                      uint8_t,
-                      uint32_t,
-                      float,
-                      true>,
-  vpq_descriptor_spec<DistanceType::L2Expanded,
-                      32,
-                      1024,
-                      8,
-                      8,
-                      half,
-                      uint8_t,
-                      uint32_t,
-                      float,
-                      false>,
-  standard_descriptor_spec<DistanceType::BitwiseHamming, 8, 128, uint8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::BitwiseHamming, 16, 256, uint8_t, uint32_t, float>,
-  standard_descriptor_spec<DistanceType::BitwiseHamming, 32, 512, uint8_t, uint32_t, float>>;
-
-}  // namespace cuvs::neighbors::cagra::detail

From c3a3cd968b5461cf8887ef4d91c914f13cb22f86 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Thu, 9 Apr 2026 22:45:56 +0900
Subject: [PATCH 100/119] Remove unnecessary files (3)

---
 ...t32_dim1024_t32_8pq_8subd_half_fp8false.cu | 31 -------------------
 ...nt32_dim1024_t32_8pq_8subd_half_fp8true.cu | 31 -------------------
 ...int32_dim128_t4_8pq_8subd_half_fp8false.cu | 31 -------------------
 ...uint32_dim128_t4_8pq_8subd_half_fp8true.cu | 31 -------------------
 ...int32_dim128_t8_8pq_2subd_half_fp8false.cu | 31 -------------------
 ...uint32_dim128_t8_8pq_2subd_half_fp8true.cu | 31 -------------------
 ...int32_dim128_t8_8pq_4subd_half_fp8false.cu | 31 -------------------
 ...uint32_dim128_t8_8pq_4subd_half_fp8true.cu | 31 -------------------
 ...nt32_dim256_t16_8pq_2subd_half_fp8false.cu | 31 -------------------
 ...int32_dim256_t16_8pq_2subd_half_fp8true.cu | 31 -------------------
 ...nt32_dim256_t16_8pq_4subd_half_fp8false.cu | 31 -------------------
 ...int32_dim256_t16_8pq_4subd_half_fp8true.cu | 31 -------------------
 ...int32_dim256_t8_8pq_8subd_half_fp8false.cu | 31 -------------------
 ...uint32_dim256_t8_8pq_8subd_half_fp8true.cu | 31 -------------------
 ...nt32_dim512_t16_8pq_8subd_half_fp8false.cu | 31 -------------------
 ...int32_dim512_t16_8pq_8subd_half_fp8true.cu | 31 -------------------
 ...nt32_dim512_t32_8pq_2subd_half_fp8false.cu | 31 -------------------
 ...int32_dim512_t32_8pq_2subd_half_fp8true.cu | 31 -------------------
 ...nt32_dim512_t32_8pq_4subd_half_fp8false.cu | 31 -------------------
 ...int32_dim512_t32_8pq_4subd_half_fp8true.cu | 31 -------------------
 ...uint32_dim64_t4_8pq_2subd_half_fp8false.cu | 31 -------------------
 ..._uint32_dim64_t4_8pq_2subd_half_fp8true.cu | 31 -------------------
 ...uint32_dim64_t4_8pq_4subd_half_fp8false.cu | 31 -------------------
 ..._uint32_dim64_t4_8pq_4subd_half_fp8true.cu | 31 -------------------
 ...t32_dim1024_t32_8pq_8subd_half_fp8false.cu | 31 -------------------
 ...nt32_dim1024_t32_8pq_8subd_half_fp8true.cu | 31 -------------------
 ...int32_dim128_t4_8pq_8subd_half_fp8false.cu | 31 -------------------
 ...int32_dim128_t8_8pq_4subd_half_fp8false.cu | 31 -------------------
 ...int32_dim256_t8_8pq_8subd_half_fp8false.cu | 31 -------------------
 ...uint32_dim256_t8_8pq_8subd_half_fp8true.cu | 31 -------------------
 ...nt32_dim512_t16_8pq_8subd_half_fp8false.cu | 31 -------------------
 ...int32_dim512_t16_8pq_8subd_half_fp8true.cu | 31 -------------------
 ...uint32_dim64_t4_8pq_2subd_half_fp8false.cu | 31 -------------------
 ..._uint32_dim64_t4_8pq_2subd_half_fp8true.cu | 31 -------------------
 ...uint32_dim64_t4_8pq_4subd_half_fp8false.cu | 31 -------------------
 ..._uint32_dim64_t4_8pq_4subd_half_fp8true.cu | 31 -------------------
 ...t32_dim1024_t32_8pq_8subd_half_fp8false.cu | 31 -------------------
 ...nt32_dim1024_t32_8pq_8subd_half_fp8true.cu | 31 -------------------
 ...int32_dim128_t4_8pq_8subd_half_fp8false.cu | 31 -------------------
 ...int32_dim128_t8_8pq_2subd_half_fp8false.cu | 31 -------------------
 ...int32_dim128_t8_8pq_4subd_half_fp8false.cu | 31 -------------------
 ...uint32_dim128_t8_8pq_4subd_half_fp8true.cu | 31 -------------------
 ...nt32_dim256_t16_8pq_2subd_half_fp8false.cu | 31 -------------------
 ...nt32_dim256_t16_8pq_4subd_half_fp8false.cu | 31 -------------------
 ...int32_dim256_t8_8pq_8subd_half_fp8false.cu | 31 -------------------
 ...uint32_dim256_t8_8pq_8subd_half_fp8true.cu | 31 -------------------
 ...nt32_dim512_t16_8pq_8subd_half_fp8false.cu | 31 -------------------
 ...int32_dim512_t16_8pq_8subd_half_fp8true.cu | 31 -------------------
 ...nt32_dim512_t32_8pq_2subd_half_fp8false.cu | 31 -------------------
 ...nt32_dim512_t32_8pq_4subd_half_fp8false.cu | 31 -------------------
 ...uint32_dim64_t4_8pq_2subd_half_fp8false.cu | 31 -------------------
 ..._uint32_dim64_t4_8pq_2subd_half_fp8true.cu | 31 -------------------
 ...uint32_dim64_t4_8pq_4subd_half_fp8false.cu | 31 -------------------
 ..._uint32_dim64_t4_8pq_4subd_half_fp8true.cu | 31 -------------------
 ...t32_dim1024_t32_8pq_8subd_half_fp8false.cu | 31 -------------------
 ...nt32_dim1024_t32_8pq_8subd_half_fp8true.cu | 31 -------------------
 ...int32_dim128_t4_8pq_8subd_half_fp8false.cu | 31 -------------------
 ...int32_dim128_t8_8pq_2subd_half_fp8false.cu | 31 -------------------
 ...int32_dim128_t8_8pq_4subd_half_fp8false.cu | 31 -------------------
 ...uint32_dim128_t8_8pq_4subd_half_fp8true.cu | 31 -------------------
 ...nt32_dim256_t16_8pq_2subd_half_fp8false.cu | 31 -------------------
 ...nt32_dim256_t16_8pq_4subd_half_fp8false.cu | 31 -------------------
 ...int32_dim256_t8_8pq_8subd_half_fp8false.cu | 31 -------------------
 ...uint32_dim256_t8_8pq_8subd_half_fp8true.cu | 31 -------------------
 ...nt32_dim512_t16_8pq_8subd_half_fp8false.cu | 31 -------------------
 ...int32_dim512_t16_8pq_8subd_half_fp8true.cu | 31 -------------------
 ...nt32_dim512_t32_8pq_2subd_half_fp8false.cu | 31 -------------------
 ...nt32_dim512_t32_8pq_4subd_half_fp8false.cu | 31 -------------------
 ...uint32_dim64_t4_8pq_2subd_half_fp8false.cu | 31 -------------------
 ..._uint32_dim64_t4_8pq_2subd_half_fp8true.cu | 31 -------------------
 ...uint32_dim64_t4_8pq_4subd_half_fp8false.cu | 31 -------------------
 ..._uint32_dim64_t4_8pq_4subd_half_fp8true.cu | 31 -------------------
 72 files changed, 2232 deletions(-)
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
deleted file mode 100644
index 2070e7d8f2..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    1024,
-                                    8,
-                                    8,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
deleted file mode 100644
index f97bd67591..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    1024,
-                                    8,
-                                    8,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
deleted file mode 100644
index b039619c1c..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    128,
-                                    8,
-                                    8,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
deleted file mode 100644
index f280b96812..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    128,
-                                    8,
-                                    8,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
deleted file mode 100644
index fd9d5223da..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    128,
-                                    8,
-                                    2,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
deleted file mode 100644
index 5ffda49346..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    128,
-                                    8,
-                                    2,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
deleted file mode 100644
index 135144698c..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    128,
-                                    8,
-                                    4,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
deleted file mode 100644
index ec79a11832..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    128,
-                                    8,
-                                    4,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
deleted file mode 100644
index 458e056b0e..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    256,
-                                    8,
-                                    2,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
deleted file mode 100644
index c13492a8c7..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    256,
-                                    8,
-                                    2,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
deleted file mode 100644
index 461af81977..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    256,
-                                    8,
-                                    4,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
deleted file mode 100644
index 30083be1fa..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    256,
-                                    8,
-                                    4,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
deleted file mode 100644
index 972e309ce7..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    256,
-                                    8,
-                                    8,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
deleted file mode 100644
index 23f0fdb48c..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    256,
-                                    8,
-                                    8,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
deleted file mode 100644
index dbb86642ce..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    512,
-                                    8,
-                                    8,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
deleted file mode 100644
index 9aeb7dbf26..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    512,
-                                    8,
-                                    8,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
deleted file mode 100644
index 7c95cfb5eb..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    512,
-                                    8,
-                                    2,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
deleted file mode 100644
index efdec2d449..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    512,
-                                    8,
-                                    2,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
deleted file mode 100644
index 5788588ecd..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    512,
-                                    8,
-                                    4,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
deleted file mode 100644
index 4b8c9067e8..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    512,
-                                    8,
-                                    4,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
deleted file mode 100644
index fd4ccd8b25..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    2,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
deleted file mode 100644
index 148244bc70..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    2,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
deleted file mode 100644
index 072bc59621..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    4,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
deleted file mode 100644
index 012dbef416..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    4,
-                                    half,
-                                    float,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
deleted file mode 100644
index 7763ce7d65..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    1024,
-                                    8,
-                                    8,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
deleted file mode 100644
index 66a4c66046..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    1024,
-                                    8,
-                                    8,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
deleted file mode 100644
index 279ee6f07f..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    128,
-                                    8,
-                                    8,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
deleted file mode 100644
index 587dc25379..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    128,
-                                    8,
-                                    4,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
deleted file mode 100644
index 628a18c8d1..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    256,
-                                    8,
-                                    8,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
deleted file mode 100644
index 8a485c8306..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    256,
-                                    8,
-                                    8,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
deleted file mode 100644
index 5482a736c0..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    512,
-                                    8,
-                                    8,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
deleted file mode 100644
index e0d191d086..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    512,
-                                    8,
-                                    8,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
deleted file mode 100644
index 785f75ed31..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    2,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
deleted file mode 100644
index 35ca051988..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    2,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
deleted file mode 100644
index 61af7cdc03..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    4,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
deleted file mode 100644
index 3683fe112b..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    4,
-                                    half,
-                                    half,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
deleted file mode 100644
index 3f8378d722..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    1024,
-                                    8,
-                                    8,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
deleted file mode 100644
index 3496957041..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    1024,
-                                    8,
-                                    8,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
deleted file mode 100644
index 8e1ff80ddb..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    128,
-                                    8,
-                                    8,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
deleted file mode 100644
index ed5bf3fe56..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    128,
-                                    8,
-                                    2,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
deleted file mode 100644
index 4246762dbb..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    128,
-                                    8,
-                                    4,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
deleted file mode 100644
index 53e132c40d..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    128,
-                                    8,
-                                    4,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
deleted file mode 100644
index fc6496dc35..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    256,
-                                    8,
-                                    2,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
deleted file mode 100644
index 8298fa1460..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    256,
-                                    8,
-                                    4,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
deleted file mode 100644
index a1f1b5d019..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    256,
-                                    8,
-                                    8,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
deleted file mode 100644
index 9c5d18ca61..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    256,
-                                    8,
-                                    8,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
deleted file mode 100644
index 9d9ce9a74b..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    512,
-                                    8,
-                                    8,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
deleted file mode 100644
index 319803418d..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    512,
-                                    8,
-                                    8,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
deleted file mode 100644
index e9e5602c41..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    512,
-                                    8,
-                                    2,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
deleted file mode 100644
index 3372a83874..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    512,
-                                    8,
-                                    4,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
deleted file mode 100644
index ada26482fc..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    2,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
deleted file mode 100644
index 7abcf8596e..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    2,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
deleted file mode 100644
index 0ee39456de..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    4,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
deleted file mode 100644
index 47a7e2913b..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    4,
-                                    half,
-                                    int8_t,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
deleted file mode 100644
index a1114b3c8c..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    1024,
-                                    8,
-                                    8,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
deleted file mode 100644
index 920d6ff0d4..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    1024,
-                                    8,
-                                    8,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
deleted file mode 100644
index 7c5d5f7a7e..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    128,
-                                    8,
-                                    8,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
deleted file mode 100644
index 3b36686271..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    128,
-                                    8,
-                                    2,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
deleted file mode 100644
index 50d0570acf..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    128,
-                                    8,
-                                    4,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
deleted file mode 100644
index ed67e72ee9..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    128,
-                                    8,
-                                    4,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
deleted file mode 100644
index f2bb24b6af..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    256,
-                                    8,
-                                    2,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
deleted file mode 100644
index 24a301551a..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    256,
-                                    8,
-                                    4,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
deleted file mode 100644
index 20a4bf007e..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    256,
-                                    8,
-                                    8,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
deleted file mode 100644
index 684930622e..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    8,
-                                    256,
-                                    8,
-                                    8,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
deleted file mode 100644
index a66e1aef14..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    512,
-                                    8,
-                                    8,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
deleted file mode 100644
index 7ba74089f7..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    16,
-                                    512,
-                                    8,
-                                    8,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
deleted file mode 100644
index 8b36b01c48..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    512,
-                                    8,
-                                    2,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
deleted file mode 100644
index 77ee8a588f..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    32,
-                                    512,
-                                    8,
-                                    4,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
deleted file mode 100644
index bd91a70a63..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    2,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
deleted file mode 100644
index e3faa06b80..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    2,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
deleted file mode 100644
index 2426bcd2c9..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    4,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    false>;
-
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
deleted file mode 100644
index f0f99f2e73..0000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * NOTE: this file is generated by compute_distance_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python compute_distance_00_generate.py
- *
- */
-
-#include "compute_distance_vpq-impl.cuh"
-
-namespace cuvs::neighbors::cagra::detail {
-
-using namespace cuvs::distance;
-template struct vpq_descriptor_spec<DistanceType::L2Expanded,
-                                    4,
-                                    64,
-                                    8,
-                                    4,
-                                    half,
-                                    uint8_t,
-                                    uint32_t,
-                                    float,
-                                    true>;
-
-}  // namespace cuvs::neighbors::cagra::detail

From ec349597e081170d19925aaf8e5f5c208ec52c97 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Fri, 10 Apr 2026 15:09:26 +0900
Subject: [PATCH 101/119] Fix a compilation error

---
 cpp/tests/neighbors/ann_utils.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/tests/neighbors/ann_utils.cuh b/cpp/tests/neighbors/ann_utils.cuh
index dc4a335e3c..8c95d6aa81 100644
--- a/cpp/tests/neighbors/ann_utils.cuh
+++ b/cpp/tests/neighbors/ann_utils.cuh
@@ -196,7 +196,7 @@ auto eval_recall(const std::vector<T>& expected_idx,
                  double min_recall,
                  bool test_unique = true) -> testing::AssertionResult
 {
-  auto [actual_recall, index_based_actual_recall, match_count, total_count] =
+  auto [actual_recall, match_count, total_count] =
     calc_recall(expected_idx, actual_idx, rows, cols);
   double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps);
   RAFT_LOG_INFO("Recall = %f (%zu/%zu), the error is %2.1f%% %s the threshold (eps = %f).",

From 25398339b0e235d3fa252602b987a13167a83ae6 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Fri, 29 May 2026 17:50:22 +0900
Subject: [PATCH 102/119] Add pq_len=8

---
 .../cagra/compute_distance_vpq_matrix.json    | 53 ++++++++++++--
 .../jit_lto_kernels/cagra_planner_base.hpp    | 23 ++++--
 .../compute_distance_matrix.json              | 70 ++++++++++++++++++
 .../setup_workspace_matrix.json               | 70 ++++++++++++++++++
 cpp/tests/neighbors/ann_cagra.cuh             | 46 +++++++++++-
 cpp/tests/neighbors/vpq_utils.cuh             | 73 +++++++++++++++++++
 6 files changed, 321 insertions(+), 14 deletions(-)
 create mode 100644 cpp/tests/neighbors/vpq_utils.cuh

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json
index cf6e060d33..c6e2ae319c 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json
@@ -36,15 +36,53 @@
   "_mxdim_team": [
     {
       "dim": "128",
-      "team_size": "8"
+      "team_size": "8",
+      "pq_len": "2"
     },
     {
       "dim": "256",
-      "team_size": "16"
+      "team_size": "16",
+      "pq_len": "2"
     },
     {
       "dim": "512",
-      "team_size": "32"
+      "team_size": "32",
+      "pq_len": "2"
+    },
+    {
+      "dim": "128",
+      "team_size": "8",
+      "pq_len": "4"
+    },
+    {
+      "dim": "256",
+      "team_size": "16",
+      "pq_len": "4"
+    },
+    {
+      "dim": "512",
+      "team_size": "32",
+      "pq_len": "4"
+    },
+    {
+      "dim": "128",
+      "team_size": "4",
+      "pq_len": "8"
+    },
+    {
+      "dim": "256",
+      "team_size": "8",
+      "pq_len": "8"
+    },
+    {
+      "dim": "512",
+      "team_size": "16",
+      "pq_len": "8"
+    },
+    {
+      "dim": "1024",
+      "team_size": "32",
+      "pq_len": "8"
     }
   ],
   "_codebook": [
@@ -53,7 +91,10 @@
       "codebook_abbrev": "h"
     }
   ],
-  "pq_bits": ["8"],
-  "pq_len": ["2", "4"],
-  "metric": ["L2Expanded"]
+  "pq_bits": [
+    "8"
+  ],
+  "metric": [
+    "L2Expanded"
+  ]
 }
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp
index 317ca1a1b6..b9e7891723 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp
@@ -64,8 +64,8 @@ struct CagraPlannerBase : AlgorithmPlanner {
                                            uint32_t dataset_block_dim,
                                            uint32_t pq_len)
   {
-    if (pq_len != 2 && pq_len != 4) {
-      RAFT_FAIL("CAGRA JIT VPQ setup_workspace expects pq_len in {2,4} (matrix uses pq_bits=8)");
+    if (pq_len != 2 && pq_len != 4 && pq_len != 8) {
+      RAFT_FAIL("CAGRA JIT VPQ setup_workspace expects pq_len in {2,4,8} (matrix uses pq_bits=8)");
     }
     auto add = [&]<uint32_t TeamSz, uint32_t Dim, uint32_t PqBitsV, uint32_t PqLenV>() {
       this->add_static_fragment<fragment_tag_setup_workspace<DataTag,
@@ -82,8 +82,10 @@ struct CagraPlannerBase : AlgorithmPlanner {
       team_size, dataset_block_dim, [&add, pq_len]<uint32_t TeamSz, uint32_t Dim>() {
         if (pq_len == 2) {
           add.template operator()<TeamSz, Dim, 8u, 2u>();
-        } else {
+        } else if (pq_len == 4) {
           add.template operator()<TeamSz, Dim, 8u, 4u>();
+        } else {
+          add.template operator()<TeamSz, Dim, 8u, 8u>();
         }
       });
   }
@@ -120,8 +122,8 @@ struct CagraPlannerBase : AlgorithmPlanner {
                                             uint32_t dataset_block_dim,
                                             uint32_t pq_len)
   {
-    if (pq_len != 2 && pq_len != 4) {
-      RAFT_FAIL("CAGRA JIT VPQ compute_distance expects pq_len in {2,4} (matrix uses pq_bits=8)");
+    if (pq_len != 2 && pq_len != 4 && pq_len != 8) {
+      RAFT_FAIL("CAGRA JIT VPQ compute_distance expects pq_len in {2,4,8} (matrix uses pq_bits=8)");
     }
     auto add = [&]<uint32_t TeamSz, uint32_t Dim, uint32_t PqBitsV, uint32_t PqLenV>() {
       this->add_static_fragment<fragment_tag_compute_distance<DataTag,
@@ -138,8 +140,10 @@ struct CagraPlannerBase : AlgorithmPlanner {
       team_size, dataset_block_dim, [&add, pq_len]<uint32_t TeamSz, uint32_t Dim>() {
         if (pq_len == 2) {
           add.template operator()<TeamSz, Dim, 8u, 2u>();
-        } else {
+        } else if (pq_len == 4) {
           add.template operator()<TeamSz, Dim, 8u, 4u>();
+        } else {
+          add.template operator()<TeamSz, Dim, 8u, 8u>();
         }
       });
   }
@@ -219,6 +223,12 @@ struct CagraPlannerBase : AlgorithmPlanner {
   static void dispatch_cagra_team_dim(uint32_t team_size, uint32_t dataset_block_dim, Lambda&& l)
   {
     switch (team_size) {
+      case 4:
+        switch (dataset_block_dim) {
+          case 128: std::forward<Lambda>(l).template operator()<4u, 128u>(); return;
+          default: break;
+        }
+        break;
       case 8:
         switch (dataset_block_dim) {
           case 128: std::forward<Lambda>(l).template operator()<8u, 128u>(); return;
@@ -240,6 +250,7 @@ struct CagraPlannerBase : AlgorithmPlanner {
           case 128: std::forward<Lambda>(l).template operator()<32u, 128u>(); return;
           case 256: std::forward<Lambda>(l).template operator()<32u, 256u>(); return;
           case 512: std::forward<Lambda>(l).template operator()<32u, 512u>(); return;
+          case 1024: std::forward<Lambda>(l).template operator()<32u, 1024u>(); return;
           default: break;
         }
         break;
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json
index 82b8dbdf4e..2e64ee2ce1 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json
@@ -150,5 +150,75 @@
         "codebook_abbrev": "half"
       }
     ]
+  },
+  {
+    "_data": [
+      {
+        "data_type": "float",
+        "data_abbrev": "f"
+      },
+      {
+        "data_type": "__half",
+        "data_abbrev": "h"
+      },
+      {
+        "data_type": "uint8_t",
+        "data_abbrev": "u8"
+      },
+      {
+        "data_type": "int8_t",
+        "data_abbrev": "i8"
+      }
+    ],
+    "_query": [
+      {
+        "query_type": "half",
+        "query_abbrev": "h"
+      }
+    ],
+    "_index": [
+      {
+        "index_type": "uint32_t",
+        "index_abbrev": "u32"
+      }
+    ],
+    "_distance": [
+      {
+        "distance_type": "float",
+        "distance_abbrev": "f"
+      }
+    ],
+    "_pq": [
+      {
+        "pq_len": "8",
+        "pq_bits": "8",
+        "pq_prefix": "_vpq",
+        "pq_suffix": "_8pq_8subd"
+      }
+    ],
+    "_codebook": [
+      {
+        "codebook_type": "half",
+        "codebook_abbrev": "half"
+      }
+    ],
+    "_mxdim_team": [
+      {
+        "dataset_block_dim": "128",
+        "team_size": "4"
+      },
+      {
+        "dataset_block_dim": "256",
+        "team_size": "8"
+      },
+      {
+        "dataset_block_dim": "512",
+        "team_size": "16"
+      },
+      {
+        "dataset_block_dim": "1024",
+        "team_size": "32"
+      }
+    ]
   }
 ]
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json
index 83aa8764bc..64c82ce13a 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json
@@ -150,5 +150,75 @@
         "codebook_abbrev": "half"
       }
     ]
+  },
+  {
+    "_data": [
+      {
+        "data_type": "float",
+        "data_abbrev": "f"
+      },
+      {
+        "data_type": "__half",
+        "data_abbrev": "h"
+      },
+      {
+        "data_type": "uint8_t",
+        "data_abbrev": "u8"
+      },
+      {
+        "data_type": "int8_t",
+        "data_abbrev": "i8"
+      }
+    ],
+    "_query": [
+      {
+        "query_type": "half",
+        "query_abbrev": "h"
+      }
+    ],
+    "_index": [
+      {
+        "index_type": "uint32_t",
+        "index_abbrev": "u32"
+      }
+    ],
+    "_distance": [
+      {
+        "distance_type": "float",
+        "distance_abbrev": "f"
+      }
+    ],
+    "_pq": [
+      {
+        "pq_len": "8",
+        "pq_bits": "8",
+        "pq_prefix": "_vpq",
+        "pq_suffix": "_8pq_8subd"
+      }
+    ],
+    "_codebook": [
+      {
+        "codebook_type": "half",
+        "codebook_abbrev": "half"
+      }
+    ],
+    "_mxdim_team": [
+      {
+        "dataset_block_dim": "128",
+        "team_size": "4"
+      },
+      {
+        "dataset_block_dim": "256",
+        "team_size": "8"
+      },
+      {
+        "dataset_block_dim": "512",
+        "team_size": "16"
+      },
+      {
+        "dataset_block_dim": "1024",
+        "team_size": "32"
+      }
+    ]
   }
 ]
diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh
index a6704f892a..826b8d1a3a 100644
--- a/cpp/tests/neighbors/ann_cagra.cuh
+++ b/cpp/tests/neighbors/ann_cagra.cuh
@@ -6,6 +6,7 @@
 
 #include "../test_utils.cuh"
 #include "ann_utils.cuh"
+#include "vpq_utils.cuh"
 #include <raft/core/resource/cuda_stream.hpp>
 
 #include "naive_knn.cuh"
@@ -461,6 +462,46 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
         raft::update_host(indices_Cagra.data(), indices_dev.data(), queries_size, stream_);
 
         raft::resource::sync_stream(handle_);
+
+        reference_recall = 1;
+        if (ps.compression.has_value()) {
+          auto decoded_dataset =
+            raft::make_device_matrix<DataT, int64_t>(handle_, ps.n_rows, ps.dim);
+          cuvs::neighbors::decode_vpq_dataset<DataT, half>(
+            decoded_dataset.view(),
+            dynamic_cast<const cuvs::neighbors::vpq_dataset<half, int64_t>&>(index.data()),
+            raft::resource::get_cuda_stream(handle_));
+          auto indices_out_view = raft::make_device_matrix_view<SearchIdxT, int64_t>(
+            indices_dev.data(), ps.n_queries, ps.k);
+          auto dists_out_view = raft::make_device_matrix_view<DistanceT, int64_t>(
+            distances_dev.data(), ps.n_queries, ps.k);
+
+          cuvs::neighbors::naive_knn<DistanceT, DataT, SearchIdxT>(handle_,
+                                                                   dists_out_view.data_handle(),
+                                                                   indices_out_view.data_handle(),
+                                                                   search_queries.data(),
+                                                                   decoded_dataset.data_handle(),
+                                                                   ps.n_queries,
+                                                                   ps.n_rows,
+                                                                   ps.dim,
+                                                                   ps.k,
+                                                                   ps.metric);
+          std::vector<SearchIdxT> indices_vpq_dataset(queries_size);
+          std::vector<DistanceT> distances_vpq_dataset(queries_size);
+          raft::update_host(
+            distances_vpq_dataset.data(), dists_out_view.data_handle(), queries_size, stream_);
+          raft::update_host(
+            indices_vpq_dataset.data(), indices_out_view.data_handle(), queries_size, stream_);
+
+          reference_recall = std::get<1>(calc_recall(indices_naive,
+                                                     indices_vpq_dataset,
+                                                     distances_naive,
+                                                     distances_vpq_dataset,
+                                                     ps.n_queries,
+                                                     ps.k,
+                                                     0));
+          printf("reference_recall = %e\n", reference_recall);
+        }
       }
 
       // for (int i = 0; i < min(ps.n_queries, 10); i++) {
@@ -470,7 +511,7 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
       //   print_vector("T", distances_naive.data() + i * ps.k, ps.k, std::cout);
       //   print_vector("C", distances_Cagra.data() + i * ps.k, ps.k, std::cout);
       // }
-      double min_recall = ps.min_recall;
+      double min_recall = ps.min_recall * reference_recall;
       EXPECT_TRUE(eval_neighbours(indices_naive,
                                   indices_Cagra,
                                   distances_naive,
@@ -519,6 +560,7 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
   AnnCagraInputs ps;
   rmm::device_uvector<DataT> database;
   rmm::device_uvector<DataT> search_queries;
+  double reference_recall;
 };
 
 template <typename DistanceT, typename DataT, typename IdxT>
@@ -1652,7 +1694,7 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL,
      cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL});  // don't demand high recall
                                                                 // without refinement
-  for (uint32_t pq_len : {2}) {  // for now, only pq_len = 2 is supported, more options coming  soon
+  for (uint32_t pq_len : {2, 4, 8}) {
     for (uint32_t vq_n_centers : {100, 1000}) {
       for (auto input : inputs2) {
         vpq_params ps{};
diff --git a/cpp/tests/neighbors/vpq_utils.cuh b/cpp/tests/neighbors/vpq_utils.cuh
new file mode 100644
index 0000000000..613b5fe1d5
--- /dev/null
+++ b/cpp/tests/neighbors/vpq_utils.cuh
@@ -0,0 +1,125 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <cuvs/neighbors/common.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/util/cudart_utils.hpp>
+#include <raft/util/integer_utils.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+namespace cuvs::neighbors {
+/**
+ * @brief Reconstruct VPQ-encoded rows as `vq_centroid + pq_codeword`, one warp per row.
+ *
+ * Each encoded row is read as a 32-bit VQ code followed by one 8-bit PQ code per subspace.
+ * The same PQ codebook is indexed for every subspace.
+ *
+ * NOTE(review): the VQ code is read through a `uint32_t` pointer, so every encoded row is
+ * assumed to start 4-byte aligned (i.e. `ldi` is a multiple of 4) - confirm against the encoder.
+ *
+ * @param[out] decoded_dataset_ptr destination rows, `ldd` elements apart
+ * @param[in]  ldd                 row stride of the destination, in elements
+ * @param[in]  vq_codebook_ptr     VQ centroids, `ldv` elements apart
+ * @param[in]  ldv                 row stride of the VQ codebook, in elements
+ * @param[in]  pq_codebook_ptr     PQ codewords, `pq_subspace_dim` elements per code
+ * @param[in]  pq_subspace_dim     number of dimensions covered by one PQ code
+ * @param[in]  pq_table_size       number of PQ codewords (not read by the kernel)
+ * @param[in]  dataset_dim         number of dimensions reconstructed per row
+ * @param[in]  dataset_size        number of rows
+ * @param[in]  data_ptr            encoded rows, `ldi` bytes apart
+ * @param[in]  ldi                 row stride of the encoded data, in bytes
+ */
+template <class data_t, class math_t>
+__global__ void decode_vpq_dataset_kernel(data_t* const decoded_dataset_ptr,
+                                          const uint32_t ldd,
+                                          const math_t* const vq_codebook_ptr,
+                                          const uint32_t ldv,
+                                          const math_t* const pq_codebook_ptr,
+                                          const uint32_t pq_subspace_dim,
+                                          const uint32_t pq_table_size,
+                                          const uint32_t dataset_dim,
+                                          const size_t dataset_size,
+                                          const uint8_t* const data_ptr,
+                                          const uint32_t ldi)
+{
+  constexpr uint32_t warp_size = 32;
+  // Widen before multiplying so the global thread index cannot wrap at 2^32.
+  const size_t batch_id =
+    (static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x) / warp_size;
+  if (batch_id >= dataset_size) { return; }
+
+  const auto local_data_ptr = data_ptr + ldi * batch_id;
+  const auto vq_code        = *reinterpret_cast<const uint32_t*>(local_data_ptr);
+  const auto pq_code_ptr    = local_data_ptr + sizeof(uint32_t);
+  const auto vq_vec_ptr     = vq_codebook_ptr + static_cast<size_t>(vq_code) * ldv;
+  auto local_dst_ptr        = decoded_dataset_ptr + batch_id * ldd;
+
+  const auto lane_id = threadIdx.x % warp_size;
+  for (uint32_t i = lane_id; i < dataset_dim; i += warp_size) {
+    const auto pq_code = pq_code_ptr[i / pq_subspace_dim];
+    const auto pq_v    = pq_codebook_ptr[pq_code * pq_subspace_dim + (i % pq_subspace_dim)];
+
+    // Add in float and narrow once. Casting the two terms to data_t separately truncates
+    // twice and converts negative PQ residuals to unsigned types, which is undefined; integer
+    // outputs are rounded and clamped to the representable range instead.
+    const float sum = static_cast<float>(vq_vec_ptr[i]) + static_cast<float>(pq_v);
+    if constexpr (std::is_integral_v<data_t>) {
+      constexpr float lo = static_cast<float>(std::numeric_limits<data_t>::lowest());
+      constexpr float hi = static_cast<float>(std::numeric_limits<data_t>::max());
+      local_dst_ptr[i]   = static_cast<data_t>(fminf(fmaxf(rintf(sum), lo), hi));
+    } else {
+      local_dst_ptr[i] = static_cast<data_t>(sum);
+    }
+  }
+}
+
+/**
+ * @brief Decode a VPQ-compressed dataset into a dense device matrix.
+ *
+ * The kernel is only enqueued on `cuda_stream`; synchronize the stream (or order later work on
+ * it) before reading `decoded_dataset`.
+ *
+ * @param[out] decoded_dataset destination matrix; must match the row count and `dim()` of the input
+ * @param[in]  vpq_dataset     compressed dataset to decode
+ * @param[in]  cuda_stream     stream the decode kernel is launched on
+ */
+template <class data_t, class math_t>
+void decode_vpq_dataset(raft::device_matrix_view<data_t, int64_t> decoded_dataset,
+                        const cuvs::neighbors::vpq_dataset<math_t, int64_t>& vpq_dataset,
+                        cudaStream_t cuda_stream)
+{
+  const auto dataset_size = decoded_dataset.extent(0);
+  RAFT_EXPECTS(vpq_dataset.data.extent(0) == dataset_size, "Dataset sizes mismatch");
+  RAFT_EXPECTS(decoded_dataset.extent(1) == static_cast<int64_t>(vpq_dataset.dim()),
+               "Dataset dimensions mismatch");
+  // A zero-sized grid is an invalid launch configuration, and there is nothing to decode.
+  if (dataset_size == 0) { return; }
+
+  constexpr uint32_t block_size  = 256;
+  constexpr uint32_t warp_size   = 32;
+  constexpr int64_t vecs_per_cta = block_size / warp_size;
+  const auto grid_size = raft::div_rounding_up_safe(decoded_dataset.extent(0), vecs_per_cta);
+
+  decode_vpq_dataset_kernel<data_t, math_t>
+    <<<grid_size, block_size, 0, cuda_stream>>>(decoded_dataset.data_handle(),
+                                                decoded_dataset.stride(0),
+                                                vpq_dataset.vq_code_book.data_handle(),
+                                                vpq_dataset.vq_code_book.stride(0),
+                                                vpq_dataset.pq_code_book.data_handle(),
+                                                vpq_dataset.pq_len(),
+                                                1u << vpq_dataset.pq_bits(),
+                                                vpq_dataset.dim(),
+                                                dataset_size,
+                                                vpq_dataset.data.data_handle(),
+                                                vpq_dataset.data.stride(0));
+  // Surface launch failures here rather than silently producing a garbage reference dataset.
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+}  // namespace cuvs::neighbors

From 19d5a0a788119c40f5e2041db50a90f947dd4c3b Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Thu, 4 Jun 2026 12:01:19 +0900
Subject: [PATCH 103/119] Update cagra-q test

---
 cpp/tests/neighbors/ann_utils.cuh | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/cpp/tests/neighbors/ann_utils.cuh b/cpp/tests/neighbors/ann_utils.cuh
index cbc95d7bb7..64732a8a3a 100644
--- a/cpp/tests/neighbors/ann_utils.cuh
+++ b/cpp/tests/neighbors/ann_utils.cuh
@@ -227,8 +227,9 @@ auto calc_recall(const std::vector<T>& expected_idx,
                  size_t cols,
                  double eps)
 {
-  size_t match_count = 0;
-  size_t total_count = static_cast<size_t>(rows) * static_cast<size_t>(cols);
+  size_t match_count       = 0;
+  size_t index_match_count = 0;
+  size_t total_count       = static_cast<size_t>(rows) * static_cast<size_t>(cols);
   for (size_t i = 0; i < rows; ++i) {
     for (size_t k = 0; k < cols; ++k) {
       size_t idx_k  = i * cols + k;  // row major assumption!
@@ -247,8 +248,28 @@ auto calc_recall(const std::vector<T>& expected_idx,
       }
     }
   }
-  return std::make_tuple(
-    static_cast<double>(match_count) / static_cast<double>(total_count), match_count, total_count);
+
+  // Index based recall
+  for (size_t i = 0; i < rows; ++i) {
+    for (size_t k = 0; k < cols; ++k) {
+      size_t idx_k = i * cols + k;  // row major assumption!
+      auto act_idx = actual_idx[idx_k];
+      for (size_t j = 0; j < cols; ++j) {
+        size_t idx   = i * cols + j;  // row major assumption!
+        auto exp_idx = expected_idx[idx];
+
+        if (act_idx == exp_idx) {
+          index_match_count++;
+          break;
+        }
+      }
+    }
+  }
+
+  return std::make_tuple(static_cast<double>(match_count) / static_cast<double>(total_count),
+                         static_cast<double>(index_match_count) / static_cast<double>(total_count),
+                         match_count,
+                         total_count);
 }
 
 /** same as eval_recall, but in case indices do not match,
@@ -265,7 +286,7 @@ auto eval_neighbours(const std::vector<T>& expected_idx,
                      bool test_unique      = true,
                      size_t max_duplicates = 0) -> testing::AssertionResult
 {
-  auto [actual_recall, match_count, total_count] =
+  auto [actual_recall, index_based_actual_recall, match_count, total_count] =
     calc_recall(expected_idx, actual_idx, expected_dist, actual_dist, rows, cols, eps);
   double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps);
 

From 09deae59dfa21902bd23a83f671c405c6f9fc759 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Thu, 4 Jun 2026 12:05:17 +0900
Subject: [PATCH 104/119] Update the compute distance kernel

---
 .../cagra/compute_distance_vpq-impl.cuh       | 17 ++++-
 .../detail/cagra/compute_distance_vpq.hpp     |  8 ++-
 .../jit_lto_kernels/compute_distance_impl.cuh | 46 +++++++++----
 .../jit_lto_kernels/setup_workspace_impl.cuh  | 68 +++++++++++--------
 4 files changed, 93 insertions(+), 46 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 6992ae979a..d0f12a20fd 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -14,6 +14,14 @@
 
 namespace cuvs::neighbors::cagra::detail {
 
+template <uint32_t PQ_LEN>  // PQ_LEN is not used below: every pq_len gets the same half2 layout
+struct vpq_smem_value_config {
+  using smem_val_pack_t                         = half2;     // packed value type (two halves)
+  using smem_val_t                              = half;      // scalar type of one packed element
+  using smem_val_pack_uint_t                    = uint32_t;  // integer type used to size one pack
+  static constexpr uint32_t num_packed_elements = 2;         // elements per smem_val_pack_t
+};
+
 template <uint32_t TeamSize,
           uint32_t DatasetBlockDim,
           uint32_t PQ_BITS,
@@ -80,8 +88,11 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
     return args.extra_word1;
   }
 
+  using smem_val_config = vpq_smem_value_config<PQ_LEN>;
+
   static constexpr std::uint32_t kSMemCodeBookSizeInBytes =
-    (1 << PQ_BITS) * PQ_LEN * utils::size_of<CODE_BOOK_T>();
+    (1 << PQ_BITS) * PQ_LEN * utils::size_of<typename smem_val_config::smem_val_pack_uint_t>() /
+    smem_val_config::num_packed_elements;
 
   _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(const std::uint8_t* encoded_dataset_ptr,
                                                  std::uint32_t encoded_dataset_dim,
@@ -108,7 +119,9 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
       3. Queries (smem_query_buffer_length elems)
     */
     return sizeof(cagra_q_dataset_descriptor_t) + kSMemCodeBookSizeInBytes +
-           raft::round_up_safe<uint32_t>(dim, DatasetBlockDim) * sizeof(QUERY_T);
+           raft::round_up_safe<uint32_t>(dim, DatasetBlockDim) *
+             utils::size_of<typename smem_val_config::smem_val_pack_uint_t>() /
+             smem_val_config::num_packed_elements;
   }
 
  private:
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
index 2b69a1cef4..299916c6c7 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -69,6 +69,12 @@ struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
     // Match codebook params
     if (dataset.pq_bits() != PqBits) { return -1.0; }
     if (dataset.pq_len() != PqLen) { return -1.0; }
+    // Keep auto-selection on the tuned VPQ diagonal while allowing explicit team_size requests to
+    // use the expanded team_size / dataset_block_dim grid.
+    constexpr std::uint32_t auto_dataset_block_dim_per_team = PqLen == 8 ? 32 : 16;
+    if (params.team_size == 0 && DatasetBlockDim != TeamSize * auto_dataset_block_dim_per_team) {
+      return -1.0;
+    }
     // Otherwise, favor the closest dataset dimensionality.
     constexpr std::uint32_t preferred_load_elmes_per_thread =
       16; /*magic number that is good based on experiments.*/
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh
index 92a014bd2f..08f44c171e 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh
@@ -100,10 +100,16 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker_impl(
   constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim;
   constexpr auto PQ_BITS         = DescriptorT::kPqBits;
   constexpr auto PQ_LEN          = DescriptorT::kPqLen;
+  using PQ_CODEBOOK_LOAD_T       = uint32_t;
+
+  using smem_val_config                  = vpq_smem_value_config<PQ_LEN>;
+  using smem_val_pack_t                  = typename smem_val_config::smem_val_pack_t;
+  using smem_val_pack_uint_t             = typename smem_val_config::smem_val_pack_uint_t;
+  constexpr uint32_t num_packed_elements = smem_val_config::num_packed_elements;
 
   const uint32_t query_ptr = pq_codebook_ptr + DescriptorT::kSMemCodeBookSizeInBytes;
   static_assert(PQ_BITS == 8, "Only pq_bits == 8 is supported at the moment.");
-  constexpr uint32_t vlen = 4;  // **** DO NOT CHANGE ****
+  constexpr uint32_t vlen = utils::size_of<PQ_CODEBOOK_LOAD_T>() / utils::size_of<uint8_t>();
   constexpr uint32_t nelem =
     raft::div_rounding_up_unsafe<uint32_t>(DatasetBlockDim / PQ_LEN, TeamSize * vlen);
 
@@ -115,12 +121,17 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker_impl(
   DISTANCE_T norm       = 0;
   for (uint32_t elem_offset = 0; elem_offset * PQ_LEN < dim;
        elem_offset += DatasetBlockDim / PQ_LEN) {
-    uint32_t pq_codes[nelem];
+    PQ_CODEBOOK_LOAD_T pq_codes[nelem];
 #pragma unroll
     for (std::uint32_t e = 0; e < nelem; e++) {
       const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen;
       if (k >= n_subspace) break;
-      device::ldg_cg(pq_codes[e], reinterpret_cast<const std::uint32_t*>(dataset_ptr + 4 + k));
+      if constexpr (std::is_same_v<PQ_CODEBOOK_LOAD_T, uint32_t>) {
+        device::ldg_cg(pq_codes[e],
+                       reinterpret_cast<const PQ_CODEBOOK_LOAD_T*>(dataset_ptr + 4 + k));
+      } else {
+        pq_codes[e] = *reinterpret_cast<const PQ_CODEBOOK_LOAD_T*>(dataset_ptr + 4 + k);
+      }
     }
     //
     if constexpr (PQ_LEN % 2 == 0) {
@@ -135,23 +146,30 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker_impl(
           if (d >= dim) break;
           device::ldg_ca(vq_vals[m], vq_code_book_ptr + d);
         }
-        std::uint32_t pq_code = pq_codes[e];
+        PQ_CODEBOOK_LOAD_T pq_code = pq_codes[e];
 #pragma unroll
         for (std::uint32_t v = 0; v < vlen; v++) {
           if (PQ_LEN * (v + k) >= dim) break;
 #pragma unroll
-          for (std::uint32_t m = 0; m < PQ_LEN / 2; m++) {
-            constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN);
-            const std::uint32_t d1     = m + (PQ_LEN / 2) * v;
-            const std::uint32_t d =
-              d1 * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
-            half2 q2, c2;
-            device::lds(q2, query_ptr + sizeof(half2) * d);
+          for (std::uint32_t m = 0; m < PQ_LEN / num_packed_elements; m++) {
+            constexpr uint32_t vq_val_pack_num_elements = 2;
+            constexpr auto kQueryBlock                  = DatasetBlockDim / (vlen * PQ_LEN);
+            const std::uint32_t vq_half2_index =
+              m * (num_packed_elements / vq_val_pack_num_elements) + (PQ_LEN / 2) * v;
+
+            static_assert(num_packed_elements == 2,
+                          "CAGRA JIT VPQ currently stores pq_len=8 in half2 shared-memory packs");
+            const uint32_t query_val_index =
+              vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
+
+            smem_val_pack_t q2, c2;
+            device::lds(q2, query_ptr + sizeof(smem_val_pack_t) * query_val_index);
             device::lds(c2,
                         pq_codebook_ptr +
-                          sizeof(CODE_BOOK_T) * ((1 << PQ_BITS) * 2 * m + (2 * (pq_code & 0xff))));
-            auto dist = q2 - c2 - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
-            dist      = dist * dist;
+                          sizeof(smem_val_pack_uint_t) * ((1 << PQ_BITS) * m + (pq_code & 0xff)));
+            auto dist =
+              q2 - c2 - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
+            dist = dist * dist;
             norm += static_cast<DISTANCE_T>(dist.x + dist.y);
           }
           pq_code >>= 8;
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh
index 8cdd7febd5..ed83c181fe 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh
@@ -79,12 +79,16 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq_impl(
   const typename DescriptorT::DATA_T* queries_ptr,
   uint32_t query_id) -> const DescriptorT*
 {
-  using QUERY_T                   = typename DescriptorT::QUERY_T;
-  using CODE_BOOK_T               = typename DescriptorT::CODE_BOOK_T;
-  using word_type                 = uint32_t;
-  constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim;
-  constexpr auto PQ_BITS          = DescriptorT::kPqBits;
-  constexpr auto PQ_LEN           = DescriptorT::kPqLen;
+  using QUERY_T                      = typename DescriptorT::QUERY_T;
+  using word_type                    = uint32_t;
+  constexpr auto kDatasetBlockDim    = DescriptorT::kDatasetBlockDim;
+  constexpr auto PQ_BITS             = DescriptorT::kPqBits;
+  constexpr auto PQ_LEN              = DescriptorT::kPqLen;
+  using smem_val_config              = vpq_smem_value_config<PQ_LEN>;
+  using smem_val_t                   = typename smem_val_config::smem_val_t;
+  using smem_val_pack_t              = typename smem_val_config::smem_val_pack_t;
+  using smem_val_pack_uint_t         = typename smem_val_config::smem_val_pack_uint_t;
+  constexpr auto num_packed_elements = smem_val_config::num_packed_elements;
 
   auto* r = reinterpret_cast<DescriptorT*>(smem_ptr);
 
@@ -105,18 +109,22 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq_impl(
     }
     __syncthreads();
 
-    for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) {
-      half2 buf2;
-      buf2.x = r->pq_code_book_ptr()[i];
-      buf2.y = r->pq_code_book_ptr()[i + 1];
-
-      constexpr auto num_elements_per_bank  = 4 / utils::size_of<CODE_BOOK_T>();
-      constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank;
-      const auto j                          = i / num_elements_per_bank;
-      const auto smem_index =
-        (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
-
-      device::sts(codebook_buf + smem_index * sizeof(half2), buf2);
+    for (unsigned i = threadIdx.x * num_packed_elements; i < (1 << PQ_BITS) * PQ_LEN;
+         i += blockDim.x * num_packed_elements) {
+      constexpr auto num_elements_per_bank =
+        num_packed_elements / (utils::size_of<smem_val_pack_uint_t>() / utils::size_of<uint32_t>());
+
+      if constexpr (PQ_LEN >= num_elements_per_bank) {
+        constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank;
+        const auto j                          = i / num_elements_per_bank;
+        const auto smem_index =
+          (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
+
+        smem_val_pack_t buf;
+        buf.x = r->pq_code_book_ptr()[i];
+        buf.y = r->pq_code_book_ptr()[i + 1];
+        device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_t), buf);
+      }
     }
   }
 
@@ -125,19 +133,21 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq_impl(
 
   constexpr cuvs::spatial::knn::detail::utils::mapping<QUERY_T> mapping{};
   auto smem_query_ptr =
-    reinterpret_cast<QUERY_T*>(reinterpret_cast<uint8_t*>(smem_ptr) + sizeof(DescriptorT) +
-                               DescriptorT::kSMemCodeBookSizeInBytes);
-  for (unsigned i = threadIdx.x * 2; i < dim; i += blockDim.x * 2) {
-    half2 buf2{0, 0};
-    if (i < dim) { buf2.x = mapping(queries_ptr[i]); }
-    if (i + 1 < dim) { buf2.y = mapping(queries_ptr[i + 1]); }
-    if constexpr ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) {
+    reinterpret_cast<smem_val_t*>(reinterpret_cast<uint8_t*>(smem_ptr) + sizeof(DescriptorT) +
+                                  DescriptorT::kSMemCodeBookSizeInBytes);
+  for (unsigned i = threadIdx.x * num_packed_elements; i < dim;
+       i += blockDim.x * num_packed_elements) {
+    smem_val_pack_t buf{0, 0};
+    if (i < dim) { buf.x = mapping(queries_ptr[i]); }
+    if (i + 1 < dim) { buf.y = mapping(queries_ptr[i + 1]); }
+    if constexpr ((PQ_BITS == 8) && (PQ_LEN % num_packed_elements == 0)) {
       constexpr uint32_t vlen = 4;  // **** DO NOT CHANGE ****
-      constexpr auto kStride  = vlen * PQ_LEN / 2;
-      reinterpret_cast<half2*>(smem_query_ptr)[transpose<kDatasetBlockDim / 2, kStride>(i / 2)] =
-        buf2;
+      constexpr auto kStride  = vlen * PQ_LEN / num_packed_elements;
+      reinterpret_cast<smem_val_pack_t*>(
+        smem_query_ptr)[transpose<kDatasetBlockDim / num_packed_elements, kStride>(
+        i / num_packed_elements)] = buf;
     } else {
-      (reinterpret_cast<half2*>(smem_query_ptr + i))[0] = buf2;
+      (reinterpret_cast<smem_val_pack_t*>(smem_query_ptr + i))[0] = buf;
     }
   }
 

From 5fa53216cf33996f772ce3e10de4e5a545be3528 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Thu, 4 Jun 2026 18:10:24 +0900
Subject: [PATCH 105/119] Add FP8 support

---
 .../neighbors/detail/cagra/cagra_search.cuh   |   5 +
 .../detail/cagra/compute_distance.hpp         |   7 +-
 .../cagra/compute_distance_vpq-impl.cuh       |  49 +++-
 .../detail/cagra/compute_distance_vpq.hpp     |  10 +-
 .../cagra/compute_distance_vpq_inst.cu.in     |   4 +-
 .../cagra/compute_distance_vpq_matrix.json    |   4 +
 .../detail/cagra/device_memory_ops.hpp        |  15 +
 cpp/src/neighbors/detail/cagra/factory.cuh    |  12 +-
 .../cagra_jit_launcher_factory.hpp            |  36 ++-
 .../jit_lto_kernels/cagra_planner_base.hpp    | 274 ++++++++++++++----
 .../jit_lto_kernels/compute_distance_impl.cuh |  69 +++--
 .../compute_distance_kernel.cu.in             |   7 +-
 .../compute_distance_matrix.json              |  26 ++
 .../jit_lto_kernels/setup_workspace_impl.cuh  |  45 ++-
 .../setup_workspace_kernel.cu.in              |   4 +-
 .../setup_workspace_matrix.json               |  26 ++
 .../neighbors/detail/cagra/packed_type.hpp    |  49 ++++
 17 files changed, 520 insertions(+), 122 deletions(-)
 create mode 100644 cpp/src/neighbors/detail/cagra/packed_type.hpp

diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
index bca8d3314d..f199cf7882 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
@@ -153,6 +153,11 @@ void search_main(raft::resources const& res,
   // Dispatch search parameters based on the dataset kind.
   if (auto* strided_dset = dynamic_cast<const strided_dataset<T, ds_idx_type>*>(&index.data());
       strided_dset != nullptr) {
+    if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::AUTO &&
+        params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::F16) {
+      RAFT_LOG_WARN("In this search mode, smem_dtype supports only AUTO or F16. Set it to AUTO.");
+      params.smem_dtype = cuvs::neighbors::cagra::internal_dtype::AUTO;
+    }
     // Search using a plain (strided) row-major dataset
     RAFT_EXPECTS(index.metric() != cuvs::distance::DistanceType::CosineExpanded ||
                    index.dataset_norms().has_value(),
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
index 75b56860bb..7f921ce948 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
@@ -207,6 +207,7 @@ struct dataset_descriptor_host {
   bool is_vpq                         = false;
   uint32_t pq_bits                    = 0;
   uint32_t pq_len                     = 0;
+  bool enable_fp8                     = false;
   // Codebook type is determined by DataT for VPQ (always half for now)
 
   struct state {
@@ -258,7 +259,8 @@ struct dataset_descriptor_host {
                           uint32_t dataset_block_dim_val,
                           bool is_vpq_val      = false,
                           uint32_t pq_bits_val = 0,
-                          uint32_t pq_len_val  = 0)
+                          uint32_t pq_len_val  = 0,
+                          bool enable_fp8_val  = false)
     : value_{std::make_shared<state>(init, sizeof(DescriptorImpl))},
       smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()},
       team_size{dd_host.team_size()},
@@ -266,7 +268,8 @@ struct dataset_descriptor_host {
       dataset_block_dim{dataset_block_dim_val},
       is_vpq{is_vpq_val},
       pq_bits{pq_bits_val},
-      pq_len{pq_len_val}
+      pq_len{pq_len_val},
+      enable_fp8{enable_fp8_val}
   {
   }
 
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index d0f12a20fd..ea994c450a 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -6,6 +6,7 @@
 #pragma once
 
 #include "compute_distance_vpq.hpp"
+#include "packed_type.hpp"
 
 #include <cuvs/distance/distance.hpp>
 #include <raft/util/pow2_utils.cuh>
@@ -14,14 +15,27 @@
 
 namespace cuvs::neighbors::cagra::detail {
 
-template <uint32_t PQ_LEN>
-struct vpq_smem_value_config {
+template <uint32_t PQ_LEN, bool EnableFP8, class Enable = void>
+struct vpq_smem_value_config;
+
+template <uint32_t PQ_LEN, bool EnableFP8>  // half-precision shared-memory value layout
+struct vpq_smem_value_config<PQ_LEN, EnableFP8, std::enable_if_t<PQ_LEN == 2 || !EnableFP8>> {  // selected when PQ_LEN == 2 or FP8 is disabled
   using smem_val_pack_t                         = half2;
   using smem_val_t                              = half;
   using smem_val_pack_uint_t                    = uint32_t;
   static constexpr uint32_t num_packed_elements = 2;
 };
 
+template <uint32_t PQ_LEN, bool EnableFP8>  // FP8 shared-memory value layout
+struct vpq_smem_value_config<PQ_LEN,
+                             EnableFP8,
+                             std::enable_if_t<(PQ_LEN == 4 || PQ_LEN == 8) && EnableFP8>> {  // selected only for PQ_LEN 4 or 8 with FP8 enabled
+  using smem_val_pack_t                         = device::fp8xN<PQ_LEN, 5>;  // NOTE(review): 5 is presumably the exponent width (E5M2) - confirm in packed_type.hpp
+  using smem_val_t                              = typename smem_val_pack_t::unit_t;  // element type exposed by the pack
+  using smem_val_pack_uint_t                    = typename smem_val_pack_t::uint_t;  // integer type exposed by the pack
+  static constexpr uint32_t num_packed_elements = smem_val_pack_t::num_elements;  // elements per pack, taken from the pack type
+};
+
 template <uint32_t TeamSize,
           uint32_t DatasetBlockDim,
           uint32_t PQ_BITS,
@@ -30,7 +44,8 @@ template <uint32_t TeamSize,
           typename DataT,
           typename IndexT,
           typename DistanceT,
-          typename QueryT>
+          typename QueryT,
+          bool EnableFP8>
 struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, IndexT, DistanceT> {
   using base_type   = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
   using CODE_BOOK_T = CodebookT;
@@ -46,6 +61,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
   constexpr static inline auto kDatasetBlockDim = DatasetBlockDim;
   constexpr static inline auto kPqBits          = PQ_BITS;
   constexpr static inline auto kPqLen           = PQ_LEN;
+  constexpr static inline auto kEnableFP8       = EnableFP8;
 
   static_assert(std::is_same_v<CODE_BOOK_T, half>, "Only CODE_BOOK_T = `half` is supported now");
 
@@ -88,7 +104,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
     return args.extra_word1;
   }
 
-  using smem_val_config = vpq_smem_value_config<PQ_LEN>;
+  using smem_val_config = vpq_smem_value_config<PQ_LEN, EnableFP8>;
 
   static constexpr std::uint32_t kSMemCodeBookSizeInBytes =
     (1 << PQ_BITS) * PQ_LEN * utils::size_of<typename smem_val_config::smem_val_pack_uint_t>() /
@@ -135,7 +151,8 @@ template <cuvs::distance::DistanceType Metric,
           typename CodebookT,
           typename DataT,
           typename IndexT,
-          typename DistanceT>
+          typename DistanceT,
+          bool EnableFP8>
 RAFT_KERNEL __launch_bounds__(1, 1)
   vpq_dataset_descriptor_init_kernel(dataset_descriptor_base_t<DataT, IndexT, DistanceT>* out,
                                      const std::uint8_t* encoded_dataset_ptr,
@@ -153,7 +170,8 @@ RAFT_KERNEL __launch_bounds__(1, 1)
                                                  DataT,
                                                  IndexT,
                                                  DistanceT,
-                                                 half>;
+                                                 half,
+                                                 EnableFP8>;
   new (out) desc_type(
     encoded_dataset_ptr, encoded_dataset_dim, vq_code_book_ptr, pq_code_book_ptr, size, dim);
 }
@@ -166,7 +184,8 @@ template <cuvs::distance::DistanceType Metric,
           typename CodebookT,
           typename DataT,
           typename IndexT,
-          typename DistanceT>
+          typename DistanceT,
+          bool EnableFP8>
 dataset_descriptor_host<DataT, IndexT, DistanceT>
 vpq_descriptor_spec<Metric,
                     TeamSize,
@@ -176,7 +195,8 @@ vpq_descriptor_spec<Metric,
                     CodebookT,
                     DataT,
                     IndexT,
-                    DistanceT>::init_(const cagra::search_params& params,
+                    DistanceT,
+                    EnableFP8>::init_(const cagra::search_params& params,
                                       const std::uint8_t* encoded_dataset_ptr,
                                       uint32_t encoded_dataset_dim,
                                       const CodebookT* vq_code_book_ptr,
@@ -192,7 +212,8 @@ vpq_descriptor_spec<Metric,
                                                  DataT,
                                                  IndexT,
                                                  DistanceT,
-                                                 half>;
+                                                 half,
+                                                 EnableFP8>;
 
   return host_type{
     desc_type{
@@ -207,7 +228,8 @@ vpq_descriptor_spec<Metric,
                                          CodebookT,
                                          DataT,
                                          IndexT,
-                                         DistanceT><<<1, 1, 0, stream>>>(dev_ptr,
+                                         DistanceT,
+                                         EnableFP8><<<1, 1, 0, stream>>>(dev_ptr,
                                                                          encoded_dataset_ptr,
                                                                          encoded_dataset_dim,
                                                                          vq_code_book_ptr,
@@ -218,9 +240,10 @@ vpq_descriptor_spec<Metric,
     },
     Metric,
     DatasetBlockDim,
-    true,    // is_vpq
-    PqBits,  // pq_bits
-    PqLen};  // pq_len
+    true,        // is_vpq
+    PqBits,      // pq_bits
+    PqLen,       // pq_len
+    EnableFP8};  // enable_fp8
 }
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
index 299916c6c7..c6e4611ae8 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
@@ -8,6 +8,7 @@
 #include "compute_distance.hpp"
 
 #include <cuvs/distance/distance.hpp>
+#include <raft/util/cudart_utils.hpp>
 
 #include <type_traits>
 
@@ -21,7 +22,8 @@ template <cuvs::distance::DistanceType Metric,
           typename CodebookT,
           typename DataT,
           typename IndexT,
-          typename DistanceT>
+          typename DistanceT,
+          bool EnableFP8>
 struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
   using base_type = instance_spec<DataT, IndexT, DistanceT>;
   using typename base_type::data_type;
@@ -63,12 +65,18 @@ struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
                        const DatasetT& dataset,
                        cuvs::distance::DistanceType metric) -> double
   {
+    const auto fp8_natively_supported = raft::getComputeCapability().first >= 9;
+    const auto use_fp8 =
+      params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::E5M2 ||
+      (params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::AUTO && fp8_natively_supported);
+
     // If explicit team_size is specified and doesn't match the instance, discard it
     if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; }
     if (cuvs::distance::DistanceType::L2Expanded != metric) { return -1.0; }
     // Match codebook params
     if (dataset.pq_bits() != PqBits) { return -1.0; }
     if (dataset.pq_len() != PqLen) { return -1.0; }
+    if (use_fp8 != EnableFP8) { return -1.0; }
     // Keep auto-selection on the tuned VPQ diagonal while allowing explicit team_size requests to
     // use the expanded team_size / dataset_block_dim grid.
     constexpr std::uint32_t auto_dataset_block_dim_per_team = PqLen == 8 ? 32 : 16;
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in
index c159da3229..676f25c9fd 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in
@@ -13,6 +13,7 @@ constexpr uint32_t team_size = @team_size@;
 constexpr uint32_t dim       = @dim@;
 constexpr uint32_t pq_bits   = @pq_bits@;
 constexpr uint32_t pq_len    = @pq_len@;
+constexpr bool enable_fp8    = @enable_fp8@;
 using codebook_t             = @codebook_type@;
 using data_t                 = @data_type@;
 using index_t                = @index_type@;
@@ -30,6 +31,7 @@ template struct vpq_descriptor_spec<metric,
                                     codebook_t,
                                     data_t,
                                     index_t,
-                                    distance_t>;
+                                    distance_t,
+                                    enable_fp8>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json
index c6e2ae319c..7dac07c2a4 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json
@@ -96,5 +96,9 @@
   ],
   "metric": [
     "L2Expanded"
+  ],
+  "enable_fp8": [
+    "true",
+    "false"
   ]
 }
diff --git a/cpp/src/neighbors/detail/cagra/device_memory_ops.hpp b/cpp/src/neighbors/detail/cagra/device_memory_ops.hpp
index cc164994ea..1bcf6f8fbd 100644
--- a/cpp/src/neighbors/detail/cagra/device_memory_ops.hpp
+++ b/cpp/src/neighbors/detail/cagra/device_memory_ops.hpp
@@ -54,6 +54,11 @@ RAFT_DEVICE_INLINE_FUNCTION void lds(uint32_t& x, uint32_t addr)
   asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "r"(addr));
 }
 
+RAFT_DEVICE_INLINE_FUNCTION void lds(uint64_t& x, uint32_t addr)  // load a 64-bit word from shared-memory address `addr` into `x`
+{
+  asm volatile("ld.shared.u64 {%0}, [%1];" : "=l"(x) : "r"(addr));  // "=l": 64-bit output register, "r": 32-bit address register
+}
+
 RAFT_DEVICE_INLINE_FUNCTION void lds(uint32_t& x, const uint32_t* addr)
 {
   lds(x, uint32_t(__cvta_generic_to_shared(addr)));
@@ -71,6 +76,16 @@ RAFT_DEVICE_INLINE_FUNCTION void lds(uint4& x, const uint4* addr)
   lds(x, uint32_t(__cvta_generic_to_shared(addr)));
 }
 
+RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const uint32_t& x)  // store the 32-bit word `x` to shared-memory address `addr`
+{
+  asm volatile("st.shared.u32 [%0], %1;" : : "r"(addr), "r"(x));  // `x` already has the operand type, so no cast is needed
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const uint64_t& x)  // store the 64-bit word `x` to shared-memory address `addr`
+{
+  asm volatile("st.shared.u64 [%0], %1;" : : "r"(addr), "l"(x));  // "l": 64-bit register; `x` already has the operand type
+}
+
 RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const half2& x)
 {
   asm volatile("st.shared.v2.u16 [%0], {%1, %2};"
diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh
index 26cd13bab8..a1e2f6be9c 100644
--- a/cpp/src/neighbors/detail/cagra/factory.cuh
+++ b/cpp/src/neighbors/detail/cagra/factory.cuh
@@ -87,6 +87,7 @@ struct key {
   uint32_t extra_val;  // this one has different meanings for different descriptor types
   uint32_t team_size;
   uint32_t metric;
+  uint32_t smem_dtype;
 };
 
 template <typename DatasetT>
@@ -100,7 +101,8 @@ auto make_key(const cagra::search_params& params,
              dataset.dim(),
              dataset.stride(),
              uint32_t(params.team_size),
-             uint32_t(metric)};
+             uint32_t(metric),
+             uint32_t(params.smem_dtype)};
 }
 
 template <typename DatasetT>
@@ -114,20 +116,22 @@ auto make_key(const cagra::search_params& params,
              dataset.dim(),
              uint32_t(reinterpret_cast<uint64_t>(dataset.pq_code_book.data_handle()) >> 6),
              uint32_t(params.team_size),
-             uint32_t(metric)};
+             uint32_t(metric),
+             uint32_t(params.smem_dtype)};
 }
 
 inline auto operator==(const key& a, const key& b) -> bool
 {
   return a.data_ptr == b.data_ptr && a.n_rows == b.n_rows && a.dim == b.dim &&
-         a.extra_val == b.extra_val && a.team_size == b.team_size && a.metric == b.metric;
+         a.extra_val == b.extra_val && a.team_size == b.team_size && a.metric == b.metric &&
+         a.smem_dtype == b.smem_dtype;  // keys that differ only in smem_dtype compare unequal
 }
 
 struct key_hash {
   inline auto operator()(const key& x) const noexcept -> std::size_t
   {
     return size_t{x.data_ptr} + size_t{x.n_rows} * size_t{x.dim} * size_t{x.extra_val} +
-           (size_t{x.team_size} ^ size_t{x.metric});
+           (size_t{x.team_size} ^ size_t{x.metric}) + size_t{x.smem_dtype};  // smem_dtype also feeds the hash
   }
 };
 
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_jit_launcher_factory.hpp b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_jit_launcher_factory.hpp
index 60d17c5128..60e965796c 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_jit_launcher_factory.hpp
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_jit_launcher_factory.hpp
@@ -57,10 +57,14 @@ std::shared_ptr<AlgorithmLauncher> build_single_cta_launcher(
             persistent);
 
   if constexpr (std::is_same_v<CodebookTag, tag_codebook_half>) {
-    planner.add_setup_workspace_device_function(
-      dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len);
-    planner.add_compute_distance_device_function(
-      dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len);
+    planner.add_setup_workspace_device_function(dataset_desc.team_size,
+                                                dataset_desc.dataset_block_dim,
+                                                dataset_desc.pq_len,
+                                                dataset_desc.enable_fp8);
+    planner.add_compute_distance_device_function(dataset_desc.team_size,
+                                                 dataset_desc.dataset_block_dim,
+                                                 dataset_desc.pq_len,
+                                                 dataset_desc.enable_fp8);
   } else {
     planner.add_setup_workspace_device_function(dataset_desc.team_size,
                                                 dataset_desc.dataset_block_dim);
@@ -102,10 +106,14 @@ std::shared_ptr<AlgorithmLauncher> build_multi_cta_launcher(
             dataset_desc.pq_len);
 
   if constexpr (std::is_same_v<CodebookTag, tag_codebook_half>) {
-    planner.add_setup_workspace_device_function(
-      dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len);
-    planner.add_compute_distance_device_function(
-      dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len);
+    planner.add_setup_workspace_device_function(dataset_desc.team_size,
+                                                dataset_desc.dataset_block_dim,
+                                                dataset_desc.pq_len,
+                                                dataset_desc.enable_fp8);
+    planner.add_compute_distance_device_function(dataset_desc.team_size,
+                                                 dataset_desc.dataset_block_dim,
+                                                 dataset_desc.pq_len,
+                                                 dataset_desc.enable_fp8);
   } else {
     planner.add_setup_workspace_device_function(dataset_desc.team_size,
                                                 dataset_desc.dataset_block_dim);
@@ -147,10 +155,14 @@ std::shared_ptr<AlgorithmLauncher> build_multi_kernel_launcher(
             dataset_desc.pq_bits,
             dataset_desc.pq_len);
   if constexpr (std::is_same_v<CodebookTag, tag_codebook_half>) {
-    planner.add_setup_workspace_device_function(
-      dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len);
-    planner.add_compute_distance_device_function(
-      dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len);
+    planner.add_setup_workspace_device_function(dataset_desc.team_size,
+                                                dataset_desc.dataset_block_dim,
+                                                dataset_desc.pq_len,
+                                                dataset_desc.enable_fp8);
+    planner.add_compute_distance_device_function(dataset_desc.team_size,
+                                                 dataset_desc.dataset_block_dim,
+                                                 dataset_desc.pq_len,
+                                                 dataset_desc.enable_fp8);
   } else {
     planner.add_setup_workspace_device_function(dataset_desc.team_size,
                                                 dataset_desc.dataset_block_dim);
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp
index b9e7891723..14ef271c2a 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp
@@ -50,11 +50,13 @@ struct CagraPlannerBase : AlgorithmPlanner {
                                                              TeamSz,
                                                              Dim,
                                                              PqBitsV,
-                                                             PqLenV>>();
+                                                             PqLenV,
+                                                             tag_smem_f16>>();
     };
-    dispatch_cagra_team_dim(team_size, dataset_block_dim, [&add]<uint32_t TeamSz, uint32_t Dim>() {
-      add.template operator()<TeamSz, Dim, 0u, 0u>();
-    });
+    dispatch_cagra_standard_team_dim(
+      team_size, dataset_block_dim, [&add]<uint32_t TeamSz, uint32_t Dim>() {
+        add.template operator()<TeamSz, Dim, 0u, 0u>();
+      });
   }
 
   /// VPQ (`tag_codebook_half`): JIT matrix fixes `pq_bits=8`; only `pq_len` is selected at runtime.
@@ -62,32 +64,39 @@ struct CagraPlannerBase : AlgorithmPlanner {
             std::enable_if_t<std::is_same_v<CB, tag_codebook_half>, int> = 0>
   void add_setup_workspace_device_function(uint32_t team_size,
                                            uint32_t dataset_block_dim,
-                                           uint32_t pq_len)
+                                           uint32_t pq_len,
+                                           bool enable_fp8)
   {
     if (pq_len != 2 && pq_len != 4 && pq_len != 8) {
       RAFT_FAIL("CAGRA JIT VPQ setup_workspace expects pq_len in {2,4,8} (matrix uses pq_bits=8)");
     }
-    auto add = [&]<uint32_t TeamSz, uint32_t Dim, uint32_t PqBitsV, uint32_t PqLenV>() {
-      this->add_static_fragment<fragment_tag_setup_workspace<DataTag,
-                                                             IndexTag,
-                                                             DistanceTag,
-                                                             QueryTag,
-                                                             CodebookTag,
-                                                             TeamSz,
-                                                             Dim,
-                                                             PqBitsV,
-                                                             PqLenV>>();
+    auto add =
+      [&]<uint32_t TeamSz, uint32_t Dim, uint32_t PqBitsV, uint32_t PqLenV, typename SmemTag>() {
+        this->add_static_fragment<fragment_tag_setup_workspace<DataTag,
+                                                               IndexTag,
+                                                               DistanceTag,
+                                                               QueryTag,
+                                                               CodebookTag,
+                                                               TeamSz,
+                                                               Dim,
+                                                               PqBitsV,
+                                                               PqLenV,
+                                                               SmemTag>>();
+      };
+    auto dispatch_smem = [&]<typename SmemTag>() {
+      dispatch_cagra_vpq_team_dim(
+        team_size,
+        dataset_block_dim,
+        pq_len,
+        [&add]<uint32_t TeamSz, uint32_t Dim, uint32_t PqBitsV, uint32_t PqLenV>() {
+          add.template operator()<TeamSz, Dim, PqBitsV, PqLenV, SmemTag>();
+        });
     };
-    dispatch_cagra_team_dim(
-      team_size, dataset_block_dim, [&add, pq_len]<uint32_t TeamSz, uint32_t Dim>() {
-        if (pq_len == 2) {
-          add.template operator()<TeamSz, Dim, 8u, 2u>();
-        } else if (pq_len == 4) {
-          add.template operator()<TeamSz, Dim, 8u, 4u>();
-        } else {
-          add.template operator()<TeamSz, Dim, 8u, 8u>();
-        }
-      });
+    if (enable_fp8) {
+      dispatch_smem.template operator()<tag_smem_e5m2>();
+    } else {
+      dispatch_smem.template operator()<tag_smem_f16>();
+    }
   }
 
   /// Registers dist_op + normalization + `compute_distance` for standard layout.
@@ -108,11 +117,13 @@ struct CagraPlannerBase : AlgorithmPlanner {
                                                               TeamSz,
                                                               Dim,
                                                               PqBitsV,
-                                                              PqLenV>>();
+                                                              PqLenV,
+                                                              tag_smem_f16>>();
     };
-    dispatch_cagra_team_dim(team_size, dataset_block_dim, [&add]<uint32_t TeamSz, uint32_t Dim>() {
-      add.template operator()<TeamSz, Dim, 0u, 0u>();
-    });
+    dispatch_cagra_standard_team_dim(
+      team_size, dataset_block_dim, [&add]<uint32_t TeamSz, uint32_t Dim>() {
+        add.template operator()<TeamSz, Dim, 0u, 0u>();
+      });
   }
 
   /// VPQ: only the `compute_distance` fragment (no standard dist_op / normalization in this path).
@@ -120,35 +131,179 @@ struct CagraPlannerBase : AlgorithmPlanner {
             std::enable_if_t<std::is_same_v<CB, tag_codebook_half>, int> = 0>
   void add_compute_distance_device_function(uint32_t team_size,
                                             uint32_t dataset_block_dim,
-                                            uint32_t pq_len)
+                                            uint32_t pq_len,
+                                            bool enable_fp8)
   {
     if (pq_len != 2 && pq_len != 4 && pq_len != 8) {
       RAFT_FAIL("CAGRA JIT VPQ compute_distance expects pq_len in {2,4,8} (matrix uses pq_bits=8)");
     }
-    auto add = [&]<uint32_t TeamSz, uint32_t Dim, uint32_t PqBitsV, uint32_t PqLenV>() {
-      this->add_static_fragment<fragment_tag_compute_distance<DataTag,
-                                                              IndexTag,
-                                                              DistanceTag,
-                                                              QueryTag,
-                                                              CodebookTag,
-                                                              TeamSz,
-                                                              Dim,
-                                                              PqBitsV,
-                                                              PqLenV>>();
+    auto add =
+      [&]<uint32_t TeamSz, uint32_t Dim, uint32_t PqBitsV, uint32_t PqLenV, typename SmemTag>() {
+        this->add_static_fragment<fragment_tag_compute_distance<DataTag,
+                                                                IndexTag,
+                                                                DistanceTag,
+                                                                QueryTag,
+                                                                CodebookTag,
+                                                                TeamSz,
+                                                                Dim,
+                                                                PqBitsV,
+                                                                PqLenV,
+                                                                SmemTag>>();
+      };
+    auto dispatch_smem = [&]<typename SmemTag>() {
+      dispatch_cagra_vpq_team_dim(
+        team_size,
+        dataset_block_dim,
+        pq_len,
+        [&add]<uint32_t TeamSz, uint32_t Dim, uint32_t PqBitsV, uint32_t PqLenV>() {
+          add.template operator()<TeamSz, Dim, PqBitsV, PqLenV, SmemTag>();
+        });
     };
-    dispatch_cagra_team_dim(
-      team_size, dataset_block_dim, [&add, pq_len]<uint32_t TeamSz, uint32_t Dim>() {
-        if (pq_len == 2) {
-          add.template operator()<TeamSz, Dim, 8u, 2u>();
-        } else if (pq_len == 4) {
-          add.template operator()<TeamSz, Dim, 8u, 4u>();
-        } else {
-          add.template operator()<TeamSz, Dim, 8u, 8u>();
-        }
-      });
+    if (enable_fp8) {
+      dispatch_smem.template operator()<tag_smem_e5m2>();
+    } else {
+      dispatch_smem.template operator()<tag_smem_f16>();
+    }
   }
 
  private:
+  template <typename Lambda>
+  static void dispatch_cagra_standard_team_dim(uint32_t team_size,
+                                               uint32_t dataset_block_dim,
+                                               Lambda&& l)
+  {
+    switch (team_size) {
+      case 8:
+        switch (dataset_block_dim) {
+          case 128: std::forward<Lambda>(l).template operator()<8u, 128u>(); return;
+          case 256: std::forward<Lambda>(l).template operator()<8u, 256u>(); return;
+          case 512: std::forward<Lambda>(l).template operator()<8u, 512u>(); return;
+          default: break;
+        }
+        break;
+      case 16:
+        switch (dataset_block_dim) {
+          case 128: std::forward<Lambda>(l).template operator()<16u, 128u>(); return;
+          case 256: std::forward<Lambda>(l).template operator()<16u, 256u>(); return;
+          case 512: std::forward<Lambda>(l).template operator()<16u, 512u>(); return;
+          default: break;
+        }
+        break;
+      case 32:
+        switch (dataset_block_dim) {
+          case 128: std::forward<Lambda>(l).template operator()<32u, 128u>(); return;
+          case 256: std::forward<Lambda>(l).template operator()<32u, 256u>(); return;
+          case 512: std::forward<Lambda>(l).template operator()<32u, 512u>(); return;
+          default: break;
+        }
+        break;
+      default: break;
+    }
+    RAFT_FAIL("Unsupported standard team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u",
+              static_cast<unsigned>(team_size),
+              static_cast<unsigned>(dataset_block_dim));
+  }
+
+  template <uint32_t PqLenV, typename Lambda>
+  static void dispatch_cagra_vpq_pq2_4_team_dim(uint32_t team_size,
+                                                uint32_t dataset_block_dim,
+                                                Lambda&& l)
+  {
+    switch (team_size) {
+      case 8:
+        switch (dataset_block_dim) {
+          case 128: std::forward<Lambda>(l).template operator()<8u, 128u, 8u, PqLenV>(); return;
+          case 256: std::forward<Lambda>(l).template operator()<8u, 256u, 8u, PqLenV>(); return;
+          case 512: std::forward<Lambda>(l).template operator()<8u, 512u, 8u, PqLenV>(); return;
+          default: break;
+        }
+        break;
+      case 16:
+        switch (dataset_block_dim) {
+          case 128: std::forward<Lambda>(l).template operator()<16u, 128u, 8u, PqLenV>(); return;
+          case 256: std::forward<Lambda>(l).template operator()<16u, 256u, 8u, PqLenV>(); return;
+          case 512: std::forward<Lambda>(l).template operator()<16u, 512u, 8u, PqLenV>(); return;
+          default: break;
+        }
+        break;
+      case 32:
+        switch (dataset_block_dim) {
+          case 128: std::forward<Lambda>(l).template operator()<32u, 128u, 8u, PqLenV>(); return;
+          case 256: std::forward<Lambda>(l).template operator()<32u, 256u, 8u, PqLenV>(); return;
+          case 512: std::forward<Lambda>(l).template operator()<32u, 512u, 8u, PqLenV>(); return;
+          default: break;
+        }
+        break;
+      default: break;
+    }
+    RAFT_FAIL(
+      "Unsupported VPQ pq_len=%u team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u",
+      static_cast<unsigned>(PqLenV),
+      static_cast<unsigned>(team_size),
+      static_cast<unsigned>(dataset_block_dim));
+  }
+
+  template <typename Lambda>
+  static void dispatch_cagra_vpq_pq8_team_dim(uint32_t team_size,
+                                              uint32_t dataset_block_dim,
+                                              Lambda&& l)
+  {
+    switch (team_size) {
+      case 4:
+        switch (dataset_block_dim) {
+          case 128: std::forward<Lambda>(l).template operator()<4u, 128u, 8u, 8u>(); return;
+          default: break;
+        }
+        break;
+      case 8:
+        switch (dataset_block_dim) {
+          case 256: std::forward<Lambda>(l).template operator()<8u, 256u, 8u, 8u>(); return;
+          default: break;
+        }
+        break;
+      case 16:
+        switch (dataset_block_dim) {
+          case 512: std::forward<Lambda>(l).template operator()<16u, 512u, 8u, 8u>(); return;
+          default: break;
+        }
+        break;
+      case 32:
+        switch (dataset_block_dim) {
+          case 1024: std::forward<Lambda>(l).template operator()<32u, 1024u, 8u, 8u>(); return;
+          default: break;
+        }
+        break;
+      default: break;
+    }
+    RAFT_FAIL(
+      "Unsupported VPQ pq_len=8 team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u",
+      static_cast<unsigned>(team_size),
+      static_cast<unsigned>(dataset_block_dim));
+  }
+
+  template <typename Lambda>
+  static void dispatch_cagra_vpq_team_dim(uint32_t team_size,
+                                          uint32_t dataset_block_dim,
+                                          uint32_t pq_len,
+                                          Lambda&& l)
+  {
+    switch (pq_len) {
+      case 2:
+        dispatch_cagra_vpq_pq2_4_team_dim<2u>(
+          team_size, dataset_block_dim, std::forward<Lambda>(l));
+        return;
+      case 4:
+        dispatch_cagra_vpq_pq2_4_team_dim<4u>(
+          team_size, dataset_block_dim, std::forward<Lambda>(l));
+        return;
+      case 8:
+        dispatch_cagra_vpq_pq8_team_dim(team_size, dataset_block_dim, std::forward<Lambda>(l));
+        return;
+      default: break;
+    }
+    RAFT_FAIL("CAGRA JIT VPQ expects pq_len in {2,4,8}; got %u", static_cast<unsigned>(pq_len));
+  }
+
   void add_dist_op_device_function(cuvs::distance::DistanceType metric)
   {
     // dist_op_matrix.json pairs tag_metric_hamming with uint8 query (tag_u8) only; L2/IP/L1 use
@@ -193,15 +348,16 @@ struct CagraPlannerBase : AlgorithmPlanner {
                                          uint32_t dataset_block_dim)
   {
     auto go = [&]<typename NormT>() {
-      dispatch_cagra_team_dim(team_size, dataset_block_dim, [&]<uint32_t TeamSz, uint32_t Dim>() {
-        this->add_static_fragment<fragment_tag_apply_normalization_standard<DataTag,
-                                                                            IndexTag,
-                                                                            DistanceTag,
-                                                                            QueryTag,
-                                                                            TeamSz,
-                                                                            Dim,
-                                                                            NormT>>();
-      });
+      dispatch_cagra_standard_team_dim(
+        team_size, dataset_block_dim, [&]<uint32_t TeamSz, uint32_t Dim>() {
+          this->add_static_fragment<fragment_tag_apply_normalization_standard<DataTag,
+                                                                              IndexTag,
+                                                                              DistanceTag,
+                                                                              QueryTag,
+                                                                              TeamSz,
+                                                                              Dim,
+                                                                              NormT>>();
+        });
     };
     // tag_u8 is only used for BitwiseHamming query layout; cosine norm fragments are built for
     // float query tag. Use if constexpr so we do not instantiate tag_norm_cosine with tag_u8
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh
index 08f44c171e..59b43bea64 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh
@@ -100,9 +100,10 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker_impl(
   constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim;
   constexpr auto PQ_BITS         = DescriptorT::kPqBits;
   constexpr auto PQ_LEN          = DescriptorT::kPqLen;
+  constexpr auto EnableFP8       = DescriptorT::kEnableFP8;
   using PQ_CODEBOOK_LOAD_T       = uint32_t;
 
-  using smem_val_config                  = vpq_smem_value_config<PQ_LEN>;
+  using smem_val_config                  = vpq_smem_value_config<PQ_LEN, EnableFP8>;
   using smem_val_pack_t                  = typename smem_val_config::smem_val_pack_t;
   using smem_val_pack_uint_t             = typename smem_val_config::smem_val_pack_uint_t;
   constexpr uint32_t num_packed_elements = smem_val_config::num_packed_elements;
@@ -154,23 +155,55 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker_impl(
           for (std::uint32_t m = 0; m < PQ_LEN / num_packed_elements; m++) {
             constexpr uint32_t vq_val_pack_num_elements = 2;
             constexpr auto kQueryBlock                  = DatasetBlockDim / (vlen * PQ_LEN);
-            const std::uint32_t vq_half2_index =
+            std::uint32_t vq_half2_index =
               m * (num_packed_elements / vq_val_pack_num_elements) + (PQ_LEN / 2) * v;
 
-            static_assert(num_packed_elements == 2,
-                          "CAGRA JIT VPQ currently stores pq_len=8 in half2 shared-memory packs");
-            const uint32_t query_val_index =
-              vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
+            uint32_t query_val_index;
+            if constexpr (num_packed_elements == 2) {
+              query_val_index =
+                vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
+            } else if constexpr (PQ_LEN == num_packed_elements) {
+              query_val_index = elem_offset + v * (DatasetBlockDim / (num_packed_elements * vlen)) +
+                                e * TeamSize + laneId;
+            } else {
+              const uint32_t query_vec_element_id =
+                (elem_offset + e * vlen * TeamSize + v + laneId * vlen) * PQ_LEN /
+                num_packed_elements;
+              constexpr auto kStride = vlen * PQ_LEN / num_packed_elements;
+              query_val_index =
+                transpose<DatasetBlockDim / num_packed_elements, kStride>(query_vec_element_id);
+            }
 
-            smem_val_pack_t q2, c2;
-            device::lds(q2, query_ptr + sizeof(smem_val_pack_t) * query_val_index);
-            device::lds(c2,
-                        pq_codebook_ptr +
-                          sizeof(smem_val_pack_uint_t) * ((1 << PQ_BITS) * m + (pq_code & 0xff)));
-            auto dist =
-              q2 - c2 - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
-            dist = dist * dist;
-            norm += static_cast<DISTANCE_T>(dist.x + dist.y);
+            if constexpr (num_packed_elements == 2) {
+              smem_val_pack_t q2, c2;
+              device::lds(q2, query_ptr + sizeof(smem_val_pack_t) * query_val_index);
+              device::lds(c2,
+                          pq_codebook_ptr +
+                            sizeof(smem_val_pack_uint_t) * ((1 << PQ_BITS) * m + (pq_code & 0xff)));
+              auto dist =
+                q2 - c2 - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
+              dist = dist * dist;
+              norm += static_cast<DISTANCE_T>(dist.x + dist.y);
+            } else if constexpr (num_packed_elements == 4 || num_packed_elements == 8) {
+              smem_val_pack_t q_vec, c_vec;
+              device::lds(q_vec.as_uint(),
+                          query_ptr + sizeof(smem_val_pack_uint_t) * query_val_index);
+              device::lds(c_vec.as_uint(),
+                          pq_codebook_ptr +
+                            sizeof(smem_val_pack_uint_t) * ((1 << PQ_BITS) * m + (pq_code & 0xff)));
+
+              half2 q2, c2;
+#pragma unroll
+              for (uint32_t bi = 0; bi < num_packed_elements / 2; bi++) {
+                q2 = q_vec.as_half2(bi);
+                c2 = c_vec.as_half2(bi);
+                auto dist =
+                  q2 - c2 - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[vq_half2_index];
+                dist = dist * dist;
+                norm += static_cast<DISTANCE_T>(dist.x + dist.y);
+                vq_half2_index += 1;
+              }
+            }
           }
           pq_code >>= 8;
         }
@@ -237,7 +270,8 @@ template <uint32_t TeamSize,
           typename DataT,
           typename IndexT,
           typename DistanceT,
-          typename QueryT>
+          typename QueryT,
+          bool EnableFP8>
 __device__ DistanceT compute_distance_impl(
   const typename dataset_descriptor_base_t<DataT, IndexT, DistanceT>::args_t args,
   IndexT dataset_index)
@@ -256,7 +290,8 @@ __device__ DistanceT compute_distance_impl(
                                                 DataT,
                                                 IndexT,
                                                 DistanceT,
-                                                QueryT>;
+                                                QueryT,
+                                                EnableFP8>;
     return compute_distance_vpq_impl<desc_t>(args, dataset_index);
   } else {
     static_assert(sizeof(TeamSize) == 0,
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in
index 13cd022918..130cbf502f 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in
@@ -11,6 +11,7 @@ constexpr uint32_t k_team_size         = @team_size@u;
 constexpr uint32_t k_dataset_block_dim = @dataset_block_dim@u;
 constexpr uint32_t k_pq_bits           = @pq_bits@u;
 constexpr uint32_t k_pq_len            = @pq_len@u;
+constexpr bool k_enable_fp8            = @enable_fp8@;
 
 using data_t     = @data_type@;
 using index_t    = @index_type@;
@@ -38,7 +39,8 @@ __device__ distance_t compute_distance<data_t, index_t, distance_t>(const args_t
                                                   data_t,
                                                   index_t,
                                                   distance_t,
-                                                  query_t>(args, dataset_index)
+                                                  query_t,
+                                                  k_enable_fp8>(args, dataset_index)
                           : distance_t{};
   return device::team_sum(per_thread, team_size_bits);
 }
@@ -55,7 +57,8 @@ compute_distance_per_thread<data_t, index_t, distance_t>(const args_t args, inde
                                data_t,
                                index_t,
                                distance_t,
-                               query_t>(args, dataset_index);
+                               query_t,
+                               k_enable_fp8>(args, dataset_index);
 }
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json
index 2e64ee2ce1..f1ce0daaab 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json
@@ -81,6 +81,12 @@
         "codebook_type": "void",
         "codebook_abbrev": "none"
       }
+    ],
+    "_smem": [
+      {
+        "enable_fp8": "false",
+        "smem_abbrev": "f16"
+      }
     ]
   },
   {
@@ -149,6 +155,16 @@
         "codebook_type": "half",
         "codebook_abbrev": "half"
       }
+    ],
+    "_smem": [
+      {
+        "enable_fp8": "false",
+        "smem_abbrev": "f16"
+      },
+      {
+        "enable_fp8": "true",
+        "smem_abbrev": "e5m2"
+      }
     ]
   },
   {
@@ -219,6 +235,16 @@
         "dataset_block_dim": "1024",
         "team_size": "32"
       }
+    ],
+    "_smem": [
+      {
+        "enable_fp8": "false",
+        "smem_abbrev": "f16"
+      },
+      {
+        "enable_fp8": "true",
+        "smem_abbrev": "e5m2"
+      }
     ]
   }
 ]
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh
index ed83c181fe..220c76ac96 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh
@@ -84,7 +84,8 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq_impl(
   constexpr auto kDatasetBlockDim    = DescriptorT::kDatasetBlockDim;
   constexpr auto PQ_BITS             = DescriptorT::kPqBits;
   constexpr auto PQ_LEN              = DescriptorT::kPqLen;
-  using smem_val_config              = vpq_smem_value_config<PQ_LEN>;
+  constexpr auto EnableFP8           = DescriptorT::kEnableFP8;
+  using smem_val_config              = vpq_smem_value_config<PQ_LEN, EnableFP8>;
   using smem_val_t                   = typename smem_val_config::smem_val_t;
   using smem_val_pack_t              = typename smem_val_config::smem_val_pack_t;
   using smem_val_pack_uint_t         = typename smem_val_config::smem_val_pack_uint_t;
@@ -120,10 +121,20 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq_impl(
         const auto smem_index =
           (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
 
-        smem_val_pack_t buf;
-        buf.x = r->pq_code_book_ptr()[i];
-        buf.y = r->pq_code_book_ptr()[i + 1];
-        device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_t), buf);
+        if constexpr (num_packed_elements == 2) {
+          smem_val_pack_t buf;
+          buf.x = r->pq_code_book_ptr()[i];
+          buf.y = r->pq_code_book_ptr()[i + 1];
+          device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_t), buf);
+        } else if constexpr (num_packed_elements == 4 || num_packed_elements == 8) {
+          smem_val_pack_t buf;
+#pragma unroll
+          for (uint32_t k = 0; k < num_packed_elements; k++) {
+            buf.data.x1[k] =
+              static_cast<smem_val_t>(static_cast<float>(r->pq_code_book_ptr()[i + k]));
+          }
+          device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_uint_t), buf.as_uint());
+        }
       }
     }
   }
@@ -137,9 +148,21 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq_impl(
                                   DescriptorT::kSMemCodeBookSizeInBytes);
   for (unsigned i = threadIdx.x * num_packed_elements; i < dim;
        i += blockDim.x * num_packed_elements) {
-    smem_val_pack_t buf{0, 0};
-    if (i < dim) { buf.x = mapping(queries_ptr[i]); }
-    if (i + 1 < dim) { buf.y = mapping(queries_ptr[i + 1]); }
+    smem_val_pack_t buf;
+    if constexpr (num_packed_elements == 2) {
+      buf.x = 0;
+      buf.y = 0;
+      if (i < dim) { buf.x = mapping(queries_ptr[i]); }
+      if (i + 1 < dim) { buf.y = mapping(queries_ptr[i + 1]); }
+    } else if constexpr (num_packed_elements == 4 || num_packed_elements == 8) {
+#pragma unroll
+      for (uint32_t k = 0; k < num_packed_elements; k++) {
+        buf.data.x1[k] = static_cast<smem_val_t>(0.0f);
+        if (i + k < dim) {
+          buf.data.x1[k] = static_cast<smem_val_t>(static_cast<float>(mapping(queries_ptr[i + k])));
+        }
+      }
+    }
     if constexpr ((PQ_BITS == 8) && (PQ_LEN % num_packed_elements == 0)) {
       constexpr uint32_t vlen = 4;  // **** DO NOT CHANGE ****
       constexpr auto kStride  = vlen * PQ_LEN / num_packed_elements;
@@ -162,7 +185,8 @@ template <uint32_t TeamSize,
           typename DataT,
           typename IndexT,
           typename DistanceT,
-          typename QueryT>
+          typename QueryT,
+          bool EnableFP8>
 __device__ const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* setup_workspace_impl(
   const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* desc_ptr,
   void* smem,
@@ -186,7 +210,8 @@ __device__ const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* setup_work
                                                       DataT,
                                                       IndexT,
                                                       DistanceT,
-                                                      QueryT>;
+                                                      QueryT,
+                                                      EnableFP8>;
     const desc_t* desc = static_cast<const desc_t*>(desc_ptr);
 
     const desc_t* result = setup_workspace_vpq_impl<desc_t>(desc, smem, queries, query_id);
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in
index fa17705250..2177212e36 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in
@@ -12,6 +12,7 @@ constexpr uint32_t k_team_size         = @team_size@u;
 constexpr uint32_t k_dataset_block_dim = @dataset_block_dim@u;
 constexpr uint32_t k_pq_bits           = @pq_bits@u;
 constexpr uint32_t k_pq_len            = @pq_len@u;
+constexpr bool k_enable_fp8            = @enable_fp8@;
 
 using data_t     = @data_type@;
 using index_t    = @index_type@;
@@ -39,7 +40,8 @@ setup_workspace<data_t, index_t, distance_t>(
                               data_t,
                               index_t,
                               distance_t,
-                              query_t>(desc, smem, queries, query_id);
+                              query_t,
+                              k_enable_fp8>(desc, smem, queries, query_id);
 }
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json
index 64c82ce13a..7ee92494e6 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json
@@ -81,6 +81,12 @@
         "codebook_type": "void",
         "codebook_abbrev": "none"
       }
+    ],
+    "_smem": [
+      {
+        "enable_fp8": "false",
+        "smem_abbrev": "f16"
+      }
     ]
   },
   {
@@ -149,6 +155,16 @@
         "codebook_type": "half",
         "codebook_abbrev": "half"
       }
+    ],
+    "_smem": [
+      {
+        "enable_fp8": "false",
+        "smem_abbrev": "f16"
+      },
+      {
+        "enable_fp8": "true",
+        "smem_abbrev": "e5m2"
+      }
     ]
   },
   {
@@ -219,6 +235,16 @@
         "dataset_block_dim": "1024",
         "team_size": "32"
       }
+    ],
+    "_smem": [
+      {
+        "enable_fp8": "false",
+        "smem_abbrev": "f16"
+      },
+      {
+        "enable_fp8": "true",
+        "smem_abbrev": "e5m2"
+      }
     ]
   }
 ]
diff --git a/cpp/src/neighbors/detail/cagra/packed_type.hpp b/cpp/src/neighbors/detail/cagra/packed_type.hpp
new file mode 100644
index 0000000000..f52edc126b
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/packed_type.hpp
@@ -0,0 +1,49 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+#include <cstdint>
+#include <raft/core/detail/macros.hpp>
+
+#include <cuda_fp16.h>
+#include <cuda_fp8.h>
+
+namespace cuvs::neighbors::cagra::detail::device {
+template <uint32_t Bit>
+struct uintN_t {};
+template <>
+struct uintN_t<32> {
+  using type = uint32_t;
+};
+template <>
+struct uintN_t<64> {
+  using type = uint64_t;
+};
+
+template <uint32_t NumPacked, uint32_t ExpBits>
+struct fp8xN {};
+
+template <uint32_t NumPacked>
+struct fp8xN<NumPacked, 5> {
+  using uint_t                           = typename uintN_t<8 * NumPacked>::type;
+  using unit_t                           = __nv_fp8_e5m2;
+  using x2_t                             = __nv_fp8x2_storage_t;
+  static constexpr uint32_t num_elements = NumPacked;
+
+  union {
+    unit_t x1[num_elements];
+    x2_t x2[num_elements / 2];
+    uint_t u;
+  } data;
+
+  HDI fp8xN() { data.u = 0; }
+
+  HDI uint_t& as_uint() { return data.u; }
+  HDI uint_t as_uint() const { return data.u; }
+  HDI half2 as_half2(const uint32_t i) const
+  {
+    return __nv_cvt_fp8x2_to_halfraw2(data.x2[i], __NV_E5M2);
+  }
+};
+}  // namespace cuvs::neighbors::cagra::detail::device

From c323fa17eb8c603bb45a47e7c36c0fe3a6cc9d32 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Thu, 4 Jun 2026 18:40:30 +0900
Subject: [PATCH 106/119] Update EnableFP8

---
 cpp/CMakeLists.txt                            | 14 ++++----
 .../detail/jit_lto/cagra/cagra_fragments.hpp  |  8 +++--
 cpp/include/cuvs/neighbors/cagra.hpp          |  6 ++++
 .../detail/cagra/compute_distance.hpp         | 17 ++++-----
 .../cagra/compute_distance_vpq-impl.cuh       | 35 ++++++++++---------
 .../detail/cagra/compute_distance_vpq.hpp     | 12 ++++---
 .../cagra/compute_distance_vpq_inst.cu.in     |  4 +--
 .../cagra/compute_distance_vpq_matrix.json    | 12 +++++--
 .../cagra_jit_launcher_factory.hpp            | 12 +++----
 .../jit_lto_kernels/cagra_planner_base.hpp    | 32 ++++++++++-------
 .../jit_lto_kernels/compute_distance_impl.cuh |  8 ++---
 .../compute_distance_kernel.cu.in             |  6 ++--
 .../compute_distance_matrix.json              | 10 +++---
 .../jit_lto_kernels/setup_workspace_impl.cuh  |  8 ++---
 .../setup_workspace_kernel.cu.in              |  4 +--
 .../setup_workspace_matrix.json               | 10 +++---
 cpp/tests/neighbors/ann_cagra.cuh             | 22 +++++++++++-
 17 files changed, 135 insertions(+), 85 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 7f9f88695c..a49df49812 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -268,12 +268,12 @@ if(NOT BUILD_CPU_ONLY)
     INPUT_FILE
       "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in"
     OUTPUT_FILE_FORMAT
-      "${CMAKE_CURRENT_BINARY_DIR}/src/neighbors/detail/cagra/compute_distance_vpq_inst_data_@data_abbrev@_index_@index_abbrev@_distance_@distance_abbrev@_codebook_@codebook_abbrev@_metric_@metric@_team_@team_size@_dim_@dim@_pq_bits_@pq_bits@_pq_len_@pq_len@.cu"
+      "${CMAKE_CURRENT_BINARY_DIR}/src/neighbors/detail/cagra/compute_distance_vpq_inst_data_@data_abbrev@_index_@index_abbrev@_distance_@distance_abbrev@_codebook_@codebook_abbrev@_metric_@metric@_team_@team_size@_dim_@dim@_pq_bits_@pq_bits@_pq_len_@pq_len@_smem_@smem_abbrev@.cu"
   )
   generate_string_matrix(
     cagra_compute_distance_vpq_selector_template_params
     ITEM_FORMAT
-    "\nvpq_descriptor_spec<DistanceType::@metric@, @team_size@, @dim@, @pq_bits@, @pq_len@, @codebook_type@, @data_type@, @index_type@, @distance_type@>"
+    "\nvpq_descriptor_spec<DistanceType::@metric@, @team_size@, @dim@, @pq_bits@, @pq_len@, @codebook_type@, @data_type@, @index_type@, @distance_type@, @smem_dtype@>"
     GLUE
     ","
     MATRIX_JSON_FILE
@@ -282,7 +282,7 @@ if(NOT BUILD_CPU_ONLY)
   generate_string_matrix(
     cagra_compute_distance_vpq_template_inst
     ITEM_FORMAT
-    "extern template struct vpq_descriptor_spec<DistanceType::@metric@, @team_size@, @dim@, @pq_bits@, @pq_len@, @codebook_type@, @data_type@, @index_type@, @distance_type@>@semicolon@"
+    "extern template struct vpq_descriptor_spec<DistanceType::@metric@, @team_size@, @dim@, @pq_bits@, @pq_len@, @codebook_type@, @data_type@, @index_type@, @distance_type@, @smem_dtype@>@semicolon@"
     GLUE
     "\n"
     MATRIX_JSON_FILE
@@ -688,13 +688,13 @@ if(NOT BUILD_CPU_ONLY)
   generate_jit_lto_kernels(
     jit_lto_files
     NAME_FORMAT
-      "cagra_setup_workspace@pq_prefix@_team_size_@team_size@_dataset_block_dim_@dataset_block_dim@_@pq_bits@pq_@pq_len@subd_data_@data_abbrev@_query_@query_abbrev@"
+      "cagra_setup_workspace@pq_prefix@_team_size_@team_size@_dataset_block_dim_@dataset_block_dim@_@pq_bits@pq_@pq_len@subd_data_@data_abbrev@_query_@query_abbrev@_smem_@smem_abbrev@"
     MATRIX_JSON_FILE
       "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json"
     KERNEL_INPUT_FILE
       "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in"
     FRAGMENT_TAG_FORMAT
-      "${cagra_ns}::fragment_tag_setup_workspace<${neighbors_ns}::tag_@data_abbrev@, ${neighbors_ns}::tag_index_@index_abbrev@, ${cagra_ns}::tag_dist_@distance_abbrev@, ${neighbors_ns}::tag_@query_abbrev@, ${cagra_ns}::tag_codebook_@codebook_abbrev@, @team_size@, @dataset_block_dim@, @pq_bits@, @pq_len@>"
+      "${cagra_ns}::fragment_tag_setup_workspace<${neighbors_ns}::tag_@data_abbrev@, ${neighbors_ns}::tag_index_@index_abbrev@, ${cagra_ns}::tag_dist_@distance_abbrev@, ${neighbors_ns}::tag_@query_abbrev@, ${cagra_ns}::tag_codebook_@codebook_abbrev@, @team_size@, @dataset_block_dim@, @pq_bits@, @pq_len@, ${cagra_ns}::tag_smem_@smem_abbrev@>"
     FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/cagra/cagra_fragments.hpp>"
                               "<cuvs/detail/jit_lto/common_fragments.hpp>"
     OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/cagra/setup_workspace"
@@ -704,13 +704,13 @@ if(NOT BUILD_CPU_ONLY)
   generate_jit_lto_kernels(
     jit_lto_files
     NAME_FORMAT
-      "cagra_compute_distance@pq_prefix@_team_size_@team_size@_dataset_block_dim_@dataset_block_dim@_@pq_bits@pq_@pq_len@subd_data_@data_abbrev@_query_@query_abbrev@"
+      "cagra_compute_distance@pq_prefix@_team_size_@team_size@_dataset_block_dim_@dataset_block_dim@_@pq_bits@pq_@pq_len@subd_data_@data_abbrev@_query_@query_abbrev@_smem_@smem_abbrev@"
     MATRIX_JSON_FILE
       "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json"
     KERNEL_INPUT_FILE
       "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in"
     FRAGMENT_TAG_FORMAT
-      "${cagra_ns}::fragment_tag_compute_distance<${neighbors_ns}::tag_@data_abbrev@, ${neighbors_ns}::tag_index_@index_abbrev@, ${cagra_ns}::tag_dist_@distance_abbrev@, ${neighbors_ns}::tag_@query_abbrev@, ${cagra_ns}::tag_codebook_@codebook_abbrev@, @team_size@, @dataset_block_dim@, @pq_bits@, @pq_len@>"
+      "${cagra_ns}::fragment_tag_compute_distance<${neighbors_ns}::tag_@data_abbrev@, ${neighbors_ns}::tag_index_@index_abbrev@, ${cagra_ns}::tag_dist_@distance_abbrev@, ${neighbors_ns}::tag_@query_abbrev@, ${cagra_ns}::tag_codebook_@codebook_abbrev@, @team_size@, @dataset_block_dim@, @pq_bits@, @pq_len@, ${cagra_ns}::tag_smem_@smem_abbrev@>"
     FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/cagra/cagra_fragments.hpp>"
                               "<cuvs/detail/jit_lto/common_fragments.hpp>"
     OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/cagra/compute_distance"
diff --git a/cpp/include/cuvs/detail/jit_lto/cagra/cagra_fragments.hpp b/cpp/include/cuvs/detail/jit_lto/cagra/cagra_fragments.hpp
index 0b42d79379..67cdd38783 100644
--- a/cpp/include/cuvs/detail/jit_lto/cagra/cagra_fragments.hpp
+++ b/cpp/include/cuvs/detail/jit_lto/cagra/cagra_fragments.hpp
@@ -16,6 +16,8 @@ struct tag_metric_cosine {};
 struct tag_metric_hamming {};
 struct tag_codebook_none {};
 struct tag_codebook_half {};
+struct tag_smem_f16 {};
+struct tag_smem_e5m2 {};
 struct tag_metric_l1 {};
 struct tag_norm_noop {};
 struct tag_norm_cosine {};
@@ -33,7 +35,8 @@ template <typename DataTag,
           uint32_t TeamSize,
           uint32_t DatasetBlockDim,
           uint32_t PqBits,
-          uint32_t PqLen>
+          uint32_t PqLen,
+          typename SmemTag>
 struct fragment_tag_setup_workspace {};
 
 template <typename DataTag,
@@ -44,7 +47,8 @@ template <typename DataTag,
           uint32_t TeamSize,
           uint32_t DatasetBlockDim,
           uint32_t PqBits,
-          uint32_t PqLen>
+          uint32_t PqLen,
+          typename SmemTag>
 struct fragment_tag_compute_distance {};
 
 template <typename QueryTag, typename DistanceTag, typename MetricTag>
diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index 8edbcab8fa..d2a55ce406 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -276,6 +276,8 @@ enum class search_algo {
 
 enum class hash_mode { HASH = 0, SMALL = 1, AUTO = 100 };
 
+enum class internal_dtype { F16 = 0, E5M2 = 1, AUTO = 100 };
+
 struct search_params : cuvs::neighbors::search_params {
   /** Maximum number of queries to search at the same time (batch size). Auto select when 0.*/
   size_t max_queries = 0;
@@ -349,6 +351,10 @@ struct search_params : cuvs::neighbors::search_params {
    * negative, in which case the filtering rate is automatically calculated.
    */
   float filtering_rate = -1.0;
+
+  /** Data type used for the query vector and codebook table in shared memory. Currently, only
+   * VPQ datasets support FP8 (E5M2). */
+  internal_dtype smem_dtype = internal_dtype::AUTO;
 };
 
 /**
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
index 7f921ce948..45997a62a3 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
@@ -202,12 +202,12 @@ struct dataset_descriptor_host {
   uint32_t team_size             = 0;
 
   // JIT LTO metadata - stored when descriptor is created
-  cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded;
-  uint32_t dataset_block_dim          = 0;
-  bool is_vpq                         = false;
-  uint32_t pq_bits                    = 0;
-  uint32_t pq_len                     = 0;
-  bool enable_fp8                     = false;
+  cuvs::distance::DistanceType metric               = cuvs::distance::DistanceType::L2Expanded;
+  uint32_t dataset_block_dim                        = 0;
+  bool is_vpq                                       = false;
+  uint32_t pq_bits                                  = 0;
+  uint32_t pq_len                                   = 0;
+  cuvs::neighbors::cagra::internal_dtype smem_dtype = cuvs::neighbors::cagra::internal_dtype::F16;
   // Codebook type is determined by DataT for VPQ (always half for now)
 
   struct state {
@@ -260,7 +260,8 @@ struct dataset_descriptor_host {
                           bool is_vpq_val      = false,
                           uint32_t pq_bits_val = 0,
                           uint32_t pq_len_val  = 0,
-                          bool enable_fp8_val  = false)
+                          cuvs::neighbors::cagra::internal_dtype smem_dtype_val =
+                            cuvs::neighbors::cagra::internal_dtype::F16)
     : value_{std::make_shared<state>(init, sizeof(DescriptorImpl))},
       smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()},
       team_size{dd_host.team_size()},
@@ -269,7 +270,7 @@ struct dataset_descriptor_host {
       is_vpq{is_vpq_val},
       pq_bits{pq_bits_val},
       pq_len{pq_len_val},
-      enable_fp8{enable_fp8_val}
+      smem_dtype{smem_dtype_val}
   {
   }
 
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index ea994c450a..f73f901f95 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -15,21 +15,24 @@
 
 namespace cuvs::neighbors::cagra::detail {
 
-template <uint32_t PQ_LEN, bool EnableFP8, class Enable = void>
+template <uint32_t PQ_LEN, cuvs::neighbors::cagra::internal_dtype SmemDType, class Enable = void>
 struct vpq_smem_value_config;
 
-template <uint32_t PQ_LEN, bool EnableFP8>
-struct vpq_smem_value_config<PQ_LEN, EnableFP8, std::enable_if_t<PQ_LEN == 2 || !EnableFP8>> {
+template <uint32_t PQ_LEN, cuvs::neighbors::cagra::internal_dtype SmemDType>
+struct vpq_smem_value_config<
+  PQ_LEN,
+  SmemDType,
+  std::enable_if_t<PQ_LEN == 2 || SmemDType == cuvs::neighbors::cagra::internal_dtype::F16>> {
   using smem_val_pack_t                         = half2;
   using smem_val_t                              = half;
   using smem_val_pack_uint_t                    = uint32_t;
   static constexpr uint32_t num_packed_elements = 2;
 };
 
-template <uint32_t PQ_LEN, bool EnableFP8>
+template <uint32_t PQ_LEN>
 struct vpq_smem_value_config<PQ_LEN,
-                             EnableFP8,
-                             std::enable_if_t<(PQ_LEN == 4 || PQ_LEN == 8) && EnableFP8>> {
+                             cuvs::neighbors::cagra::internal_dtype::E5M2,
+                             std::enable_if_t<PQ_LEN == 4 || PQ_LEN == 8>> {
   using smem_val_pack_t                         = device::fp8xN<PQ_LEN, 5>;
   using smem_val_t                              = typename smem_val_pack_t::unit_t;
   using smem_val_pack_uint_t                    = typename smem_val_pack_t::uint_t;
@@ -45,7 +48,7 @@ template <uint32_t TeamSize,
           typename IndexT,
           typename DistanceT,
           typename QueryT,
-          bool EnableFP8>
+          cuvs::neighbors::cagra::internal_dtype SmemDType>
 struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, IndexT, DistanceT> {
   using base_type   = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
   using CODE_BOOK_T = CodebookT;
@@ -61,7 +64,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
   constexpr static inline auto kDatasetBlockDim = DatasetBlockDim;
   constexpr static inline auto kPqBits          = PQ_BITS;
   constexpr static inline auto kPqLen           = PQ_LEN;
-  constexpr static inline auto kEnableFP8       = EnableFP8;
+  constexpr static inline auto kSmemDType       = SmemDType;
 
   static_assert(std::is_same_v<CODE_BOOK_T, half>, "Only CODE_BOOK_T = `half` is supported now");
 
@@ -104,7 +107,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
     return args.extra_word1;
   }
 
-  using smem_val_config = vpq_smem_value_config<PQ_LEN, EnableFP8>;
+  using smem_val_config = vpq_smem_value_config<PQ_LEN, SmemDType>;
 
   static constexpr std::uint32_t kSMemCodeBookSizeInBytes =
     (1 << PQ_BITS) * PQ_LEN * utils::size_of<typename smem_val_config::smem_val_pack_uint_t>() /
@@ -152,7 +155,7 @@ template <cuvs::distance::DistanceType Metric,
           typename DataT,
           typename IndexT,
           typename DistanceT,
-          bool EnableFP8>
+          cuvs::neighbors::cagra::internal_dtype SmemDType>
 RAFT_KERNEL __launch_bounds__(1, 1)
   vpq_dataset_descriptor_init_kernel(dataset_descriptor_base_t<DataT, IndexT, DistanceT>* out,
                                      const std::uint8_t* encoded_dataset_ptr,
@@ -171,7 +174,7 @@ RAFT_KERNEL __launch_bounds__(1, 1)
                                                  IndexT,
                                                  DistanceT,
                                                  half,
-                                                 EnableFP8>;
+                                                 SmemDType>;
   new (out) desc_type(
     encoded_dataset_ptr, encoded_dataset_dim, vq_code_book_ptr, pq_code_book_ptr, size, dim);
 }
@@ -185,7 +188,7 @@ template <cuvs::distance::DistanceType Metric,
           typename DataT,
           typename IndexT,
           typename DistanceT,
-          bool EnableFP8>
+          cuvs::neighbors::cagra::internal_dtype SmemDType>
 dataset_descriptor_host<DataT, IndexT, DistanceT>
 vpq_descriptor_spec<Metric,
                     TeamSize,
@@ -196,7 +199,7 @@ vpq_descriptor_spec<Metric,
                     DataT,
                     IndexT,
                     DistanceT,
-                    EnableFP8>::init_(const cagra::search_params& params,
+                    SmemDType>::init_(const cagra::search_params& params,
                                       const std::uint8_t* encoded_dataset_ptr,
                                       uint32_t encoded_dataset_dim,
                                       const CodebookT* vq_code_book_ptr,
@@ -213,7 +216,7 @@ vpq_descriptor_spec<Metric,
                                                  IndexT,
                                                  DistanceT,
                                                  half,
-                                                 EnableFP8>;
+                                                 SmemDType>;
 
   return host_type{
     desc_type{
@@ -229,7 +232,7 @@ vpq_descriptor_spec<Metric,
                                          DataT,
                                          IndexT,
                                          DistanceT,
-                                         EnableFP8><<<1, 1, 0, stream>>>(dev_ptr,
+                                         SmemDType><<<1, 1, 0, stream>>>(dev_ptr,
                                                                          encoded_dataset_ptr,
                                                                          encoded_dataset_dim,
                                                                          vq_code_book_ptr,
@@ -243,7 +246,7 @@ vpq_descriptor_spec<Metric,
     true,        // is_vpq
     PqBits,      // pq_bits
     PqLen,       // pq_len
-    EnableFP8};  // enable_fp8
+    SmemDType};  // smem_dtype
 }
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
index c6e4611ae8..b32a0f17c6 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
@@ -23,7 +23,7 @@ template <cuvs::distance::DistanceType Metric,
           typename DataT,
           typename IndexT,
           typename DistanceT,
-          bool EnableFP8>
+          cuvs::neighbors::cagra::internal_dtype SmemDType>
 struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
   using base_type = instance_spec<DataT, IndexT, DistanceT>;
   using typename base_type::data_type;
@@ -66,9 +66,11 @@ struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
                        cuvs::distance::DistanceType metric) -> double
   {
     const auto fp8_natively_supported = raft::getComputeCapability().first >= 9;
-    const auto use_fp8 =
-      params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::E5M2 ||
-      (params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::AUTO && fp8_natively_supported);
+    const auto selected_smem_dtype =
+      params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::AUTO
+        ? (fp8_natively_supported ? cuvs::neighbors::cagra::internal_dtype::E5M2
+                                  : cuvs::neighbors::cagra::internal_dtype::F16)
+        : params.smem_dtype;
 
     // If explicit team_size is specified and doesn't match the instance, discard it
     if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; }
@@ -76,7 +78,7 @@ struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
     // Match codebook params
     if (dataset.pq_bits() != PqBits) { return -1.0; }
     if (dataset.pq_len() != PqLen) { return -1.0; }
-    if (use_fp8 != EnableFP8) { return -1.0; }
+    if (selected_smem_dtype != SmemDType) { return -1.0; }
     // Keep auto-selection on the tuned VPQ diagonal while allowing explicit team_size requests to
     // use the expanded team_size / dataset_block_dim grid.
     constexpr std::uint32_t auto_dataset_block_dim_per_team = PqLen == 8 ? 32 : 16;
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in
index 676f25c9fd..25d4732a34 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in
@@ -13,7 +13,7 @@ constexpr uint32_t team_size = @team_size@;
 constexpr uint32_t dim       = @dim@;
 constexpr uint32_t pq_bits   = @pq_bits@;
 constexpr uint32_t pq_len    = @pq_len@;
-constexpr bool enable_fp8    = @enable_fp8@;
+constexpr auto smem_dtype    = @smem_dtype@;
 using codebook_t             = @codebook_type@;
 using data_t                 = @data_type@;
 using index_t                = @index_type@;
@@ -32,6 +32,6 @@ template struct vpq_descriptor_spec<metric,
                                     data_t,
                                     index_t,
                                     distance_t,
-                                    enable_fp8>;
+                                    smem_dtype>;
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json
index 7dac07c2a4..1241b2346c 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json
@@ -97,8 +97,14 @@
   "metric": [
     "L2Expanded"
   ],
-  "enable_fp8": [
-    "true",
-    "false"
+  "_smem": [
+    {
+      "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::E5M2",
+      "smem_abbrev": "e5m2"
+    },
+    {
+      "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::F16",
+      "smem_abbrev": "f16"
+    }
   ]
 }
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_jit_launcher_factory.hpp b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_jit_launcher_factory.hpp
index 60e965796c..973a5a1176 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_jit_launcher_factory.hpp
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_jit_launcher_factory.hpp
@@ -60,11 +60,11 @@ std::shared_ptr<AlgorithmLauncher> build_single_cta_launcher(
     planner.add_setup_workspace_device_function(dataset_desc.team_size,
                                                 dataset_desc.dataset_block_dim,
                                                 dataset_desc.pq_len,
-                                                dataset_desc.enable_fp8);
+                                                dataset_desc.smem_dtype);
     planner.add_compute_distance_device_function(dataset_desc.team_size,
                                                  dataset_desc.dataset_block_dim,
                                                  dataset_desc.pq_len,
-                                                 dataset_desc.enable_fp8);
+                                                 dataset_desc.smem_dtype);
   } else {
     planner.add_setup_workspace_device_function(dataset_desc.team_size,
                                                 dataset_desc.dataset_block_dim);
@@ -109,11 +109,11 @@ std::shared_ptr<AlgorithmLauncher> build_multi_cta_launcher(
     planner.add_setup_workspace_device_function(dataset_desc.team_size,
                                                 dataset_desc.dataset_block_dim,
                                                 dataset_desc.pq_len,
-                                                dataset_desc.enable_fp8);
+                                                dataset_desc.smem_dtype);
     planner.add_compute_distance_device_function(dataset_desc.team_size,
                                                  dataset_desc.dataset_block_dim,
                                                  dataset_desc.pq_len,
-                                                 dataset_desc.enable_fp8);
+                                                 dataset_desc.smem_dtype);
   } else {
     planner.add_setup_workspace_device_function(dataset_desc.team_size,
                                                 dataset_desc.dataset_block_dim);
@@ -158,11 +158,11 @@ std::shared_ptr<AlgorithmLauncher> build_multi_kernel_launcher(
     planner.add_setup_workspace_device_function(dataset_desc.team_size,
                                                 dataset_desc.dataset_block_dim,
                                                 dataset_desc.pq_len,
-                                                dataset_desc.enable_fp8);
+                                                dataset_desc.smem_dtype);
     planner.add_compute_distance_device_function(dataset_desc.team_size,
                                                  dataset_desc.dataset_block_dim,
                                                  dataset_desc.pq_len,
-                                                 dataset_desc.enable_fp8);
+                                                 dataset_desc.smem_dtype);
   } else {
     planner.add_setup_workspace_device_function(dataset_desc.team_size,
                                                 dataset_desc.dataset_block_dim);
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp
index 14ef271c2a..cce18a0216 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp
@@ -65,7 +65,7 @@ struct CagraPlannerBase : AlgorithmPlanner {
   void add_setup_workspace_device_function(uint32_t team_size,
                                            uint32_t dataset_block_dim,
                                            uint32_t pq_len,
-                                           bool enable_fp8)
+                                           cuvs::neighbors::cagra::internal_dtype smem_dtype)
   {
     if (pq_len != 2 && pq_len != 4 && pq_len != 8) {
       RAFT_FAIL("CAGRA JIT VPQ setup_workspace expects pq_len in {2,4,8} (matrix uses pq_bits=8)");
@@ -92,11 +92,7 @@ struct CagraPlannerBase : AlgorithmPlanner {
           add.template operator()<TeamSz, Dim, PqBitsV, PqLenV, SmemTag>();
         });
     };
-    if (enable_fp8) {
-      dispatch_smem.template operator()<tag_smem_e5m2>();
-    } else {
-      dispatch_smem.template operator()<tag_smem_f16>();
-    }
+    dispatch_cagra_smem_dtype(smem_dtype, dispatch_smem);
   }
 
   /// Registers dist_op + normalization + `compute_distance` for standard layout.
@@ -132,7 +128,7 @@ struct CagraPlannerBase : AlgorithmPlanner {
   void add_compute_distance_device_function(uint32_t team_size,
                                             uint32_t dataset_block_dim,
                                             uint32_t pq_len,
-                                            bool enable_fp8)
+                                            cuvs::neighbors::cagra::internal_dtype smem_dtype)
   {
     if (pq_len != 2 && pq_len != 4 && pq_len != 8) {
       RAFT_FAIL("CAGRA JIT VPQ compute_distance expects pq_len in {2,4,8} (matrix uses pq_bits=8)");
@@ -159,14 +155,26 @@ struct CagraPlannerBase : AlgorithmPlanner {
           add.template operator()<TeamSz, Dim, PqBitsV, PqLenV, SmemTag>();
         });
     };
-    if (enable_fp8) {
-      dispatch_smem.template operator()<tag_smem_e5m2>();
-    } else {
-      dispatch_smem.template operator()<tag_smem_f16>();
-    }
+    dispatch_cagra_smem_dtype(smem_dtype, dispatch_smem);
   }
 
  private:
+  template <typename Lambda>
+  static void dispatch_cagra_smem_dtype(cuvs::neighbors::cagra::internal_dtype smem_dtype,
+                                        Lambda&& l)
+  {
+    switch (smem_dtype) {
+      case cuvs::neighbors::cagra::internal_dtype::F16:
+        std::forward<Lambda>(l).template operator()<tag_smem_f16>();
+        return;
+      case cuvs::neighbors::cagra::internal_dtype::E5M2:
+        std::forward<Lambda>(l).template operator()<tag_smem_e5m2>();
+        return;
+      default: break;
+    }
+    RAFT_FAIL("Unsupported CAGRA JIT smem_dtype: %u", static_cast<unsigned>(smem_dtype));
+  }
+
   template <typename Lambda>
   static void dispatch_cagra_standard_team_dim(uint32_t team_size,
                                                uint32_t dataset_block_dim,
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh
index 59b43bea64..ea817110e7 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh
@@ -100,10 +100,10 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker_impl(
   constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim;
   constexpr auto PQ_BITS         = DescriptorT::kPqBits;
   constexpr auto PQ_LEN          = DescriptorT::kPqLen;
-  constexpr auto EnableFP8       = DescriptorT::kEnableFP8;
+  constexpr auto SmemDType       = DescriptorT::kSmemDType;
   using PQ_CODEBOOK_LOAD_T       = uint32_t;
 
-  using smem_val_config                  = vpq_smem_value_config<PQ_LEN, EnableFP8>;
+  using smem_val_config                  = vpq_smem_value_config<PQ_LEN, SmemDType>;
   using smem_val_pack_t                  = typename smem_val_config::smem_val_pack_t;
   using smem_val_pack_uint_t             = typename smem_val_config::smem_val_pack_uint_t;
   constexpr uint32_t num_packed_elements = smem_val_config::num_packed_elements;
@@ -271,7 +271,7 @@ template <uint32_t TeamSize,
           typename IndexT,
           typename DistanceT,
           typename QueryT,
-          bool EnableFP8>
+          cuvs::neighbors::cagra::internal_dtype SmemDType>
 __device__ DistanceT compute_distance_impl(
   const typename dataset_descriptor_base_t<DataT, IndexT, DistanceT>::args_t args,
   IndexT dataset_index)
@@ -291,7 +291,7 @@ __device__ DistanceT compute_distance_impl(
                                                 IndexT,
                                                 DistanceT,
                                                 QueryT,
-                                                EnableFP8>;
+                                                SmemDType>;
     return compute_distance_vpq_impl<desc_t>(args, dataset_index);
   } else {
     static_assert(sizeof(TeamSize) == 0,
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in
index 130cbf502f..1856781391 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in
@@ -11,7 +11,7 @@ constexpr uint32_t k_team_size         = @team_size@u;
 constexpr uint32_t k_dataset_block_dim = @dataset_block_dim@u;
 constexpr uint32_t k_pq_bits           = @pq_bits@u;
 constexpr uint32_t k_pq_len            = @pq_len@u;
-constexpr bool k_enable_fp8            = @enable_fp8@;
+constexpr auto k_smem_dtype            = @smem_dtype@;
 
 using data_t     = @data_type@;
 using index_t    = @index_type@;
@@ -40,7 +40,7 @@ __device__ distance_t compute_distance<data_t, index_t, distance_t>(const args_t
                                                   index_t,
                                                   distance_t,
                                                   query_t,
-                                                  k_enable_fp8>(args, dataset_index)
+                                                  k_smem_dtype>(args, dataset_index)
                           : distance_t{};
   return device::team_sum(per_thread, team_size_bits);
 }
@@ -58,7 +58,7 @@ compute_distance_per_thread<data_t, index_t, distance_t>(const args_t args, inde
                                index_t,
                                distance_t,
                                query_t,
-                               k_enable_fp8>(args, dataset_index);
+                               k_smem_dtype>(args, dataset_index);
 }
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json
index f1ce0daaab..4d260c5507 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json
@@ -84,7 +84,7 @@
     ],
     "_smem": [
       {
-        "enable_fp8": "false",
+        "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::F16",
         "smem_abbrev": "f16"
       }
     ]
@@ -158,11 +158,11 @@
     ],
     "_smem": [
       {
-        "enable_fp8": "false",
+        "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::F16",
         "smem_abbrev": "f16"
       },
       {
-        "enable_fp8": "true",
+        "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::E5M2",
         "smem_abbrev": "e5m2"
       }
     ]
@@ -238,11 +238,11 @@
     ],
     "_smem": [
       {
-        "enable_fp8": "false",
+        "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::F16",
         "smem_abbrev": "f16"
       },
       {
-        "enable_fp8": "true",
+        "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::E5M2",
         "smem_abbrev": "e5m2"
       }
     ]
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh
index 220c76ac96..494e0973fe 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh
@@ -84,8 +84,8 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq_impl(
   constexpr auto kDatasetBlockDim    = DescriptorT::kDatasetBlockDim;
   constexpr auto PQ_BITS             = DescriptorT::kPqBits;
   constexpr auto PQ_LEN              = DescriptorT::kPqLen;
-  constexpr auto EnableFP8           = DescriptorT::kEnableFP8;
-  using smem_val_config              = vpq_smem_value_config<PQ_LEN, EnableFP8>;
+  constexpr auto SmemDType           = DescriptorT::kSmemDType;
+  using smem_val_config              = vpq_smem_value_config<PQ_LEN, SmemDType>;
   using smem_val_t                   = typename smem_val_config::smem_val_t;
   using smem_val_pack_t              = typename smem_val_config::smem_val_pack_t;
   using smem_val_pack_uint_t         = typename smem_val_config::smem_val_pack_uint_t;
@@ -186,7 +186,7 @@ template <uint32_t TeamSize,
           typename IndexT,
           typename DistanceT,
           typename QueryT,
-          bool EnableFP8>
+          cuvs::neighbors::cagra::internal_dtype SmemDType>
 __device__ const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* setup_workspace_impl(
   const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* desc_ptr,
   void* smem,
@@ -211,7 +211,7 @@ __device__ const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* setup_work
                                                       IndexT,
                                                       DistanceT,
                                                       QueryT,
-                                                      EnableFP8>;
+                                                      SmemDType>;
     const desc_t* desc = static_cast<const desc_t*>(desc_ptr);
 
     const desc_t* result = setup_workspace_vpq_impl<desc_t>(desc, smem, queries, query_id);
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in
index 2177212e36..6a54c9f956 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in
@@ -12,7 +12,7 @@ constexpr uint32_t k_team_size         = @team_size@u;
 constexpr uint32_t k_dataset_block_dim = @dataset_block_dim@u;
 constexpr uint32_t k_pq_bits           = @pq_bits@u;
 constexpr uint32_t k_pq_len            = @pq_len@u;
-constexpr bool k_enable_fp8            = @enable_fp8@;
+constexpr auto k_smem_dtype            = @smem_dtype@;
 
 using data_t     = @data_type@;
 using index_t    = @index_type@;
@@ -41,7 +41,7 @@ setup_workspace<data_t, index_t, distance_t>(
                               index_t,
                               distance_t,
                               query_t,
-                              k_enable_fp8>(desc, smem, queries, query_id);
+                              k_smem_dtype>(desc, smem, queries, query_id);
 }
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json
index 7ee92494e6..567fe3e5a1 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json
@@ -84,7 +84,7 @@
     ],
     "_smem": [
       {
-        "enable_fp8": "false",
+        "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::F16",
         "smem_abbrev": "f16"
       }
     ]
@@ -158,11 +158,11 @@
     ],
     "_smem": [
       {
-        "enable_fp8": "false",
+        "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::F16",
         "smem_abbrev": "f16"
       },
       {
-        "enable_fp8": "true",
+        "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::E5M2",
         "smem_abbrev": "e5m2"
       }
     ]
@@ -238,11 +238,11 @@
     ],
     "_smem": [
       {
-        "enable_fp8": "false",
+        "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::F16",
         "smem_abbrev": "f16"
       },
       {
-        "enable_fp8": "true",
+        "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::E5M2",
         "smem_abbrev": "e5m2"
       }
     ]
diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh
index 826b8d1a3a..c8322d29fd 100644
--- a/cpp/tests/neighbors/ann_cagra.cuh
+++ b/cpp/tests/neighbors/ann_cagra.cuh
@@ -27,6 +27,7 @@
 #include <raft/linalg/normalize.cuh>
 #include <raft/linalg/reduce.cuh>
 #include <raft/random/rng.cuh>
+#include <raft/util/cudart_utils.hpp>
 #include <raft/util/itertools.hpp>
 
 #include <rmm/device_buffer.hpp>
@@ -276,6 +277,7 @@ struct AnnCagraInputs {
   std::optional<bool> non_owning_memory_buffer_flag = std::nullopt;
   cuvs::neighbors::MergeStrategy merge_strategy =
     cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL;
+  cuvs::neighbors::cagra::internal_dtype smem_dtype = cuvs::neighbors::cagra::internal_dtype::AUTO;
 };
 
 inline ::std::ostream& operator<<(::std::ostream& os, const AnnCagraInputs& p)
@@ -299,6 +301,14 @@ inline ::std::ostream& operator<<(::std::ostream& os, const AnnCagraInputs& p)
     {search_algo::AUTO, "auto"}                   //
   };
   std::vector<std::string> build_algo = {"IVF_PQ", "NN_DESCENT", "ITERATIVE_CAGRA_SEARCH", "AUTO"};
+  const auto smem_dtype_str           = [](cuvs::neighbors::cagra::internal_dtype dtype) {
+    switch (dtype) {
+      case cuvs::neighbors::cagra::internal_dtype::F16: return "F16";
+      case cuvs::neighbors::cagra::internal_dtype::E5M2: return "E5M2";
+      case cuvs::neighbors::cagra::internal_dtype::AUTO: return "AUTO";
+    }
+    return "Unknown";
+  };
   std::vector<std::string> merge_strategy = {"PHYSICAL", "LOGICAL"};
   os << "{n_queries=" << p.n_queries << ", dataset shape=" << p.n_rows << "x" << p.dim
      << ", k=" << p.k << ", " << algo_name[p.algo] << ", max_queries=" << p.max_queries
@@ -312,7 +322,7 @@ inline ::std::ostream& operator<<(::std::ostream& os, const AnnCagraInputs& p)
   if (p.compression.has_value()) {
     auto vpq = p.compression.value();
     os << ", pq_bits=" << vpq.pq_bits << ", pq_dim=" << vpq.pq_dim
-       << ", vq_n_centers=" << vpq.vq_n_centers;
+       << ", vq_n_centers=" << vpq.vq_n_centers << ", smem_dtype=" << smem_dtype_str(p.smem_dtype);
   }
   os << '}' << std::endl;
   return os;
@@ -346,6 +356,10 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
     if (ps.metric == cuvs::distance::DistanceType::L1 &&
         ps.build_algo != graph_build_algo::ITERATIVE_CAGRA_SEARCH)
       GTEST_SKIP();
+    if (ps.smem_dtype == cuvs::neighbors::cagra::internal_dtype::E5M2 &&
+        raft::getComputeCapability().first < 9) {
+      GTEST_SKIP() << "CAGRA VPQ E5M2 smem dtype requires native FP8 support on SM90+";
+    }
     if (ps.metric == cuvs::distance::DistanceType::CosineExpanded) {
       if (ps.compression.has_value()) { GTEST_SKIP(); }
       if (ps.build_algo == graph_build_algo::ITERATIVE_CAGRA_SEARCH || ps.dim == 1) {
@@ -415,6 +429,7 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
         search_params.algo        = ps.algo;
         search_params.max_queries = ps.max_queries;
         search_params.team_size   = ps.team_size;
+        search_params.smem_dtype  = ps.smem_dtype;
 
         auto database_view = raft::make_device_matrix_view<const DataT, int64_t>(
           (const DataT*)database.data(), ps.n_rows, ps.dim);
@@ -1701,7 +1716,12 @@ inline std::vector<AnnCagraInputs> generate_inputs()
         ps.pq_dim       = input.dim / pq_len;
         ps.vq_n_centers = vq_n_centers;
         input.compression.emplace(ps);
+        input.smem_dtype = cuvs::neighbors::cagra::internal_dtype::AUTO;
         inputs.push_back(input);
+        if (pq_len >= 4 && vq_n_centers == 100) {
+          input.smem_dtype = cuvs::neighbors::cagra::internal_dtype::E5M2;
+          inputs.push_back(input);
+        }
       }
     }
   }

From a577563fccababf13e5eba40d44240038b9e6d46 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Fri, 5 Jun 2026 00:55:48 +0900
Subject: [PATCH 107/119] Update vpq test

---
 cpp/tests/neighbors/ann_cagra.cuh | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh
index c8322d29fd..5c0da34dd0 100644
--- a/cpp/tests/neighbors/ann_cagra.cuh
+++ b/cpp/tests/neighbors/ann_cagra.cuh
@@ -1711,15 +1711,15 @@ inline std::vector<AnnCagraInputs> generate_inputs()
                                                                 // without refinement
   for (uint32_t pq_len : {2, 4, 8}) {
     for (uint32_t vq_n_centers : {100, 1000}) {
-      for (auto input : inputs2) {
-        vpq_params ps{};
-        ps.pq_dim       = input.dim / pq_len;
-        ps.vq_n_centers = vq_n_centers;
-        input.compression.emplace(ps);
-        input.smem_dtype = cuvs::neighbors::cagra::internal_dtype::AUTO;
-        inputs.push_back(input);
-        if (pq_len >= 4 && vq_n_centers == 100) {
-          input.smem_dtype = cuvs::neighbors::cagra::internal_dtype::E5M2;
+      for (auto internal_smem_dtype : {cuvs::neighbors::cagra::internal_dtype::E5M2,
+                                       cuvs::neighbors::cagra::internal_dtype::F16,
+                                       cuvs::neighbors::cagra::internal_dtype::AUTO}) {
+        for (auto input : inputs2) {
+          vpq_params ps{};
+          ps.pq_dim       = input.dim / pq_len;
+          ps.vq_n_centers = vq_n_centers;
+          input.compression.emplace(ps);
+          input.smem_dtype = internal_smem_dtype;
           inputs.push_back(input);
         }
       }

From d05f5524eee4868dd62713e53e18e4afeaa0fce3 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Fri, 5 Jun 2026 14:55:22 +0900
Subject: [PATCH 108/119] Remove internal_dtype::AUTO

---
 cpp/include/cuvs/neighbors/cagra.hpp                   |  4 ++--
 cpp/src/neighbors/detail/cagra/cagra_search.cuh        |  7 +++----
 .../neighbors/detail/cagra/compute_distance_vpq.hpp    | 10 +---------
 cpp/tests/neighbors/ann_cagra.cuh                      |  6 ++----
 4 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index d2a55ce406..9a906687e3 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -276,7 +276,7 @@ enum class search_algo {
 
 enum class hash_mode { HASH = 0, SMALL = 1, AUTO = 100 };
 
-enum class internal_dtype { F16 = 0, E5M2 = 1, AUTO = 100 };
+enum class internal_dtype { F16 = 0, E5M2 = 1 };
 
 struct search_params : cuvs::neighbors::search_params {
   /** Maximum number of queries to search at the same time (batch size). Auto select when 0.*/
@@ -354,7 +354,7 @@ struct search_params : cuvs::neighbors::search_params {
 
   /** Data type of the query vector and codebook table on shared memory. Currently, only VPQ
    * supports FP8. **/
-  internal_dtype smem_dtype = internal_dtype::AUTO;
+  internal_dtype smem_dtype = internal_dtype::F16;
 };
 
 /**
diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
index f199cf7882..6a64ad7a85 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
@@ -153,10 +153,9 @@ void search_main(raft::resources const& res,
   // Dispatch search parameters based on the dataset kind.
   if (auto* strided_dset = dynamic_cast<const strided_dataset<T, ds_idx_type>*>(&index.data());
       strided_dset != nullptr) {
-    if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::AUTO &&
-        params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::F16) {
-      RAFT_LOG_WARN("In this search mode, smem_dtype supports only AUTO or F16. Set it to AUTO.");
-      params.smem_dtype = cuvs::neighbors::cagra::internal_dtype::AUTO;
+    if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::F16) {
+      RAFT_LOG_WARN("In this search mode, smem_dtype supports only F16. Set it to F16.");
+      params.smem_dtype = cuvs::neighbors::cagra::internal_dtype::F16;
     }
     // Search using a plain (strided) row-major dataset
     RAFT_EXPECTS(index.metric() != cuvs::distance::DistanceType::CosineExpanded ||
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
index b32a0f17c6..83954491bf 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
@@ -8,7 +8,6 @@
 #include "compute_distance.hpp"
 
 #include <cuvs/distance/distance.hpp>
-#include <raft/util/cudart_utils.hpp>
 
 #include <type_traits>
 
@@ -65,20 +64,13 @@ struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
                        const DatasetT& dataset,
                        cuvs::distance::DistanceType metric) -> double
   {
-    const auto fp8_natively_supported = raft::getComputeCapability().first >= 9;
-    const auto selected_smem_dtype =
-      params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::AUTO
-        ? (fp8_natively_supported ? cuvs::neighbors::cagra::internal_dtype::E5M2
-                                  : cuvs::neighbors::cagra::internal_dtype::F16)
-        : params.smem_dtype;
-
     // If explicit team_size is specified and doesn't match the instance, discard it
     if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; }
     if (cuvs::distance::DistanceType::L2Expanded != metric) { return -1.0; }
     // Match codebook params
     if (dataset.pq_bits() != PqBits) { return -1.0; }
     if (dataset.pq_len() != PqLen) { return -1.0; }
-    if (selected_smem_dtype != SmemDType) { return -1.0; }
+    if (params.smem_dtype != SmemDType) { return -1.0; }
     // Keep auto-selection on the tuned VPQ diagonal while allowing explicit team_size requests to
     // use the expanded team_size / dataset_block_dim grid.
     constexpr std::uint32_t auto_dataset_block_dim_per_team = PqLen == 8 ? 32 : 16;
diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh
index 5c0da34dd0..5f83d2eb8d 100644
--- a/cpp/tests/neighbors/ann_cagra.cuh
+++ b/cpp/tests/neighbors/ann_cagra.cuh
@@ -277,7 +277,7 @@ struct AnnCagraInputs {
   std::optional<bool> non_owning_memory_buffer_flag = std::nullopt;
   cuvs::neighbors::MergeStrategy merge_strategy =
     cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL;
-  cuvs::neighbors::cagra::internal_dtype smem_dtype = cuvs::neighbors::cagra::internal_dtype::AUTO;
+  cuvs::neighbors::cagra::internal_dtype smem_dtype = cuvs::neighbors::cagra::internal_dtype::F16;
 };
 
 inline ::std::ostream& operator<<(::std::ostream& os, const AnnCagraInputs& p)
@@ -305,7 +305,6 @@ inline ::std::ostream& operator<<(::std::ostream& os, const AnnCagraInputs& p)
     switch (dtype) {
       case cuvs::neighbors::cagra::internal_dtype::F16: return "F16";
       case cuvs::neighbors::cagra::internal_dtype::E5M2: return "E5M2";
-      case cuvs::neighbors::cagra::internal_dtype::AUTO: return "AUTO";
     }
     return "Unknown";
   };
@@ -1712,8 +1711,7 @@ inline std::vector<AnnCagraInputs> generate_inputs()
   for (uint32_t pq_len : {2, 4, 8}) {
     for (uint32_t vq_n_centers : {100, 1000}) {
       for (auto internal_smem_dtype : {cuvs::neighbors::cagra::internal_dtype::E5M2,
-                                       cuvs::neighbors::cagra::internal_dtype::F16,
-                                       cuvs::neighbors::cagra::internal_dtype::AUTO}) {
+                                       cuvs::neighbors::cagra::internal_dtype::F16}) {
         for (auto input : inputs2) {
           vpq_params ps{};
           ps.pq_dim       = input.dim / pq_len;

From 902073970e238dbede59697b2992bf066a8b75a7 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Fri, 5 Jun 2026 15:15:36 +0900
Subject: [PATCH 109/119] Update fp8xN to use SW emulated FP8 when FP8 is not
 natively supported

---
 .../neighbors/detail/cagra/packed_type.hpp    | 27 +++++++++++++++----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/packed_type.hpp b/cpp/src/neighbors/detail/cagra/packed_type.hpp
index f52edc126b..4e67fd50e6 100644
--- a/cpp/src/neighbors/detail/cagra/packed_type.hpp
+++ b/cpp/src/neighbors/detail/cagra/packed_type.hpp
@@ -3,6 +3,8 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
+#include "../../ivf_pq/ivf_pq_fp_8bit.cuh"
+
 #include <cstdint>
 #include <raft/core/detail/macros.hpp>
 
@@ -26,24 +28,39 @@ struct fp8xN {};
 
 template <uint32_t NumPacked>
 struct fp8xN<NumPacked, 5> {
-  using uint_t                           = typename uintN_t<8 * NumPacked>::type;
-  using unit_t                           = __nv_fp8_e5m2;
-  using x2_t                             = __nv_fp8x2_storage_t;
+  using uint_t = typename uintN_t<8 * NumPacked>::type;
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+  using unit_t = __nv_fp8_e5m2;
+  using x2_t   = __nv_fp8x2_storage_t;
+#else
+  using unit_t = cuvs::neighbors::ivf_pq::detail::fp_8bit<5u, true>;
+#endif
   static constexpr uint32_t num_elements = NumPacked;
 
-  union {
+  union storage_t {
     unit_t x1[num_elements];
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
     x2_t x2[num_elements / 2];
+#endif
     uint_t u;
+
+    HDI storage_t() : u{0} {}
   } data;
 
-  HDI fp8xN() { data.u = 0; }
+  HDI fp8xN() = default;
 
   HDI uint_t& as_uint() { return data.u; }
   HDI uint_t as_uint() const { return data.u; }
   HDI half2 as_half2(const uint32_t i) const
   {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
     return __nv_cvt_fp8x2_to_halfraw2(data.x2[i], __NV_E5M2);
+#else
+    half2 r;
+    r.x = static_cast<half>(data.x1[2 * i]);
+    r.y = static_cast<half>(data.x1[2 * i + 1]);
+    return r;
+#endif
   }
 };
 }  // namespace cuvs::neighbors::cagra::detail::device

From 627ee0dc1026b5d267c665e144e12ebb58eec6e7 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Fri, 5 Jun 2026 15:26:36 +0900
Subject: [PATCH 110/119] Fix VPQ test

---
 cpp/tests/neighbors/ann_cagra.cuh | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh
index 5f83d2eb8d..46b427e241 100644
--- a/cpp/tests/neighbors/ann_cagra.cuh
+++ b/cpp/tests/neighbors/ann_cagra.cuh
@@ -355,10 +355,6 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
     if (ps.metric == cuvs::distance::DistanceType::L1 &&
         ps.build_algo != graph_build_algo::ITERATIVE_CAGRA_SEARCH)
       GTEST_SKIP();
-    if (ps.smem_dtype == cuvs::neighbors::cagra::internal_dtype::E5M2 &&
-        raft::getComputeCapability().first < 9) {
-      GTEST_SKIP() << "CAGRA VPQ E5M2 smem dtype requires native FP8 support on SM90+";
-    }
     if (ps.metric == cuvs::distance::DistanceType::CosineExpanded) {
       if (ps.compression.has_value()) { GTEST_SKIP(); }
       if (ps.build_algo == graph_build_algo::ITERATIVE_CAGRA_SEARCH || ps.dim == 1) {

From e7e4205c21ca87d9ebe83e810a8c001e31f6d26f Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Fri, 5 Jun 2026 15:46:35 +0900
Subject: [PATCH 111/119] Fix compilation error

---
 cpp/src/neighbors/detail/cagra/packed_type.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/detail/cagra/packed_type.hpp b/cpp/src/neighbors/detail/cagra/packed_type.hpp
index 4e67fd50e6..cfcc68be09 100644
--- a/cpp/src/neighbors/detail/cagra/packed_type.hpp
+++ b/cpp/src/neighbors/detail/cagra/packed_type.hpp
@@ -47,7 +47,7 @@ struct fp8xN<NumPacked, 5> {
     HDI storage_t() : u{0} {}
   } data;
 
-  HDI fp8xN() = default;
+  HDI fp8xN() : data{} {}
 
   HDI uint_t& as_uint() { return data.u; }
   HDI uint_t as_uint() const { return data.u; }

From 1032ffb2f4a8ee0cc938c6c11bec5260d758dbaf Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Mon, 8 Jun 2026 13:34:46 +0900
Subject: [PATCH 112/119] Update VPQ test to use VpqMathT

---
 cpp/tests/neighbors/ann_cagra.cuh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh
index 46b427e241..38f77a4346 100644
--- a/cpp/tests/neighbors/ann_cagra.cuh
+++ b/cpp/tests/neighbors/ann_cagra.cuh
@@ -477,9 +477,11 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
         if (ps.compression.has_value()) {
           auto decoded_dataset =
             raft::make_device_matrix<DataT, int64_t>(handle_, ps.n_rows, ps.dim);
-          cuvs::neighbors::decode_vpq_dataset<DataT, half>(
+
+          using VpqMathT = half;
+          cuvs::neighbors::decode_vpq_dataset<DataT, VpqMathT>(
             decoded_dataset.view(),
-            dynamic_cast<const cuvs::neighbors::vpq_dataset<half, int64_t>&>(index.data()),
+            dynamic_cast<const cuvs::neighbors::vpq_dataset<VpqMathT, int64_t>&>(index.data()),
             raft::resource::get_cuda_stream(handle_));
           auto indices_out_view = raft::make_device_matrix_view<SearchIdxT, int64_t>(
             indices_dev.data(), ps.n_queries, ps.k);

From 02e372639a21db8f75f2798067db139631333157 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Mon, 8 Jun 2026 15:14:05 +0900
Subject: [PATCH 113/119] Add pq_bits assert

---
 cpp/tests/neighbors/vpq_utils.cuh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cpp/tests/neighbors/vpq_utils.cuh b/cpp/tests/neighbors/vpq_utils.cuh
index 613b5fe1d5..44b4d188ee 100644
--- a/cpp/tests/neighbors/vpq_utils.cuh
+++ b/cpp/tests/neighbors/vpq_utils.cuh
@@ -51,6 +51,9 @@ void decode_vpq_dataset(raft::device_matrix_view<data_t, int64_t> decoded_datase
 {
   const auto dataset_size = decoded_dataset.extent(0);
   RAFT_EXPECTS(vpq_dataset.data.extent(0) == dataset_size, "Dataset sizes mismatch");
+  RAFT_EXPECTS(vpq_dataset.pq_bits() == 8,
+               "decode_vpq_dataset currently only supports pq_bits == 8 (got %u)",
+               vpq_dataset.pq_bits());
 
   constexpr uint32_t block_size  = 256;
   constexpr uint32_t warp_size   = 32;

From c608bd16a0beb25683b428d05cecd6b9dda027d7 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 10 Jun 2026 00:18:42 +0900
Subject: [PATCH 114/119] Remove SW emulated FP8

---
 .../neighbors/detail/cagra/cagra_search.cuh   |  6 +++++
 .../detail/cagra/compute_distance_vpq.hpp     | 13 ++++++++-
 .../neighbors/detail/cagra/packed_type.hpp    | 27 ++++---------------
 cpp/tests/neighbors/ann_cagra.cuh             |  1 -
 4 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
index 6a64ad7a85..a5925b16d2 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
@@ -184,6 +184,12 @@ void search_main(raft::resources const& res,
     RAFT_FAIL("FP32 VPQ dataset support is coming soon");
   } else if (auto* vpq_dset = dynamic_cast<const vpq_dataset<half, ds_idx_type>*>(&index.data());
              vpq_dset != nullptr) {
+    if (params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::E5M2 &&
+        raft::getComputeCapability().first < 9) {
+      RAFT_LOG_WARN(
+        "CAGRA VPQ E5M2 smem_dtype requires native FP8 support on SM90+. Falling back to F16.");
+      params.smem_dtype = cuvs::neighbors::cagra::internal_dtype::F16;
+    }
     auto desc = dataset_descriptor_init_with_cache<T, graph_idx_type, DistanceT>(
       res, params, *vpq_dset, index.metric(), nullptr);
     search_main_core<T, graph_idx_type, DistanceT, CagraSampleFilterT, IdxT, OutputIdxT>(
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
index 83954491bf..6781eb6abc 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
@@ -8,11 +8,22 @@
 #include "compute_distance.hpp"
 
 #include <cuvs/distance/distance.hpp>
+#include <raft/util/cudart_utils.hpp>
 
 #include <type_traits>
 
 namespace cuvs::neighbors::cagra::detail {
 
+inline auto select_supported_vpq_smem_dtype(const cagra::search_params& params)
+  -> cuvs::neighbors::cagra::internal_dtype
+{
+  if (params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::E5M2 &&
+      raft::getComputeCapability().first < 9) {
+    return cuvs::neighbors::cagra::internal_dtype::F16;
+  }
+  return params.smem_dtype;
+}
+
 template <cuvs::distance::DistanceType Metric,
           uint32_t TeamSize,
           uint32_t DatasetBlockDim,
@@ -70,7 +81,7 @@ struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
     // Match codebook params
     if (dataset.pq_bits() != PqBits) { return -1.0; }
     if (dataset.pq_len() != PqLen) { return -1.0; }
-    if (params.smem_dtype != SmemDType) { return -1.0; }
+    if (select_supported_vpq_smem_dtype(params) != SmemDType) { return -1.0; }
     // Keep auto-selection on the tuned VPQ diagonal while allowing explicit team_size requests to
     // use the expanded team_size / dataset_block_dim grid.
     constexpr std::uint32_t auto_dataset_block_dim_per_team = PqLen == 8 ? 32 : 16;
diff --git a/cpp/src/neighbors/detail/cagra/packed_type.hpp b/cpp/src/neighbors/detail/cagra/packed_type.hpp
index cfcc68be09..f52edc126b 100644
--- a/cpp/src/neighbors/detail/cagra/packed_type.hpp
+++ b/cpp/src/neighbors/detail/cagra/packed_type.hpp
@@ -3,8 +3,6 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include "../../ivf_pq/ivf_pq_fp_8bit.cuh"
-
 #include <cstdint>
 #include <raft/core/detail/macros.hpp>
 
@@ -28,39 +26,24 @@ struct fp8xN {};
 
 template <uint32_t NumPacked>
 struct fp8xN<NumPacked, 5> {
-  using uint_t = typename uintN_t<8 * NumPacked>::type;
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
-  using unit_t = __nv_fp8_e5m2;
-  using x2_t   = __nv_fp8x2_storage_t;
-#else
-  using unit_t = cuvs::neighbors::ivf_pq::detail::fp_8bit<5u, true>;
-#endif
+  using uint_t                           = typename uintN_t<8 * NumPacked>::type;
+  using unit_t                           = __nv_fp8_e5m2;
+  using x2_t                             = __nv_fp8x2_storage_t;
   static constexpr uint32_t num_elements = NumPacked;
 
-  union storage_t {
+  union {
     unit_t x1[num_elements];
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
     x2_t x2[num_elements / 2];
-#endif
     uint_t u;
-
-    HDI storage_t() : u{0} {}
   } data;
 
-  HDI fp8xN() : data{} {}
+  HDI fp8xN() { data.u = 0; }
 
   HDI uint_t& as_uint() { return data.u; }
   HDI uint_t as_uint() const { return data.u; }
   HDI half2 as_half2(const uint32_t i) const
   {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
     return __nv_cvt_fp8x2_to_halfraw2(data.x2[i], __NV_E5M2);
-#else
-    half2 r;
-    r.x = static_cast<half>(data.x1[2 * i]);
-    r.y = static_cast<half>(data.x1[2 * i + 1]);
-    return r;
-#endif
   }
 };
 }  // namespace cuvs::neighbors::cagra::detail::device
diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh
index 38f77a4346..93e933cc94 100644
--- a/cpp/tests/neighbors/ann_cagra.cuh
+++ b/cpp/tests/neighbors/ann_cagra.cuh
@@ -27,7 +27,6 @@
 #include <raft/linalg/normalize.cuh>
 #include <raft/linalg/reduce.cuh>
 #include <raft/random/rng.cuh>
-#include <raft/util/cudart_utils.hpp>
 #include <raft/util/itertools.hpp>
 
 #include <rmm/device_buffer.hpp>

From f706baae2cb9e559460e531e84ddff17b0bbbff4 Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 10 Jun 2026 16:57:58 +0900
Subject: [PATCH 115/119] Update dispatch funcs

---
 .../jit_lto_kernels/cagra_planner_base.hpp    | 204 +++++++-----------
 1 file changed, 81 insertions(+), 123 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp
index cce18a0216..0666ef815a 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp
@@ -175,120 +175,6 @@ struct CagraPlannerBase : AlgorithmPlanner {
     RAFT_FAIL("Unsupported CAGRA JIT smem_dtype: %u", static_cast<unsigned>(smem_dtype));
   }
 
-  template <typename Lambda>
-  static void dispatch_cagra_standard_team_dim(uint32_t team_size,
-                                               uint32_t dataset_block_dim,
-                                               Lambda&& l)
-  {
-    switch (team_size) {
-      case 8:
-        switch (dataset_block_dim) {
-          case 128: std::forward<Lambda>(l).template operator()<8u, 128u>(); return;
-          case 256: std::forward<Lambda>(l).template operator()<8u, 256u>(); return;
-          case 512: std::forward<Lambda>(l).template operator()<8u, 512u>(); return;
-          default: break;
-        }
-        break;
-      case 16:
-        switch (dataset_block_dim) {
-          case 128: std::forward<Lambda>(l).template operator()<16u, 128u>(); return;
-          case 256: std::forward<Lambda>(l).template operator()<16u, 256u>(); return;
-          case 512: std::forward<Lambda>(l).template operator()<16u, 512u>(); return;
-          default: break;
-        }
-        break;
-      case 32:
-        switch (dataset_block_dim) {
-          case 128: std::forward<Lambda>(l).template operator()<32u, 128u>(); return;
-          case 256: std::forward<Lambda>(l).template operator()<32u, 256u>(); return;
-          case 512: std::forward<Lambda>(l).template operator()<32u, 512u>(); return;
-          default: break;
-        }
-        break;
-      default: break;
-    }
-    RAFT_FAIL("Unsupported standard team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u",
-              static_cast<unsigned>(team_size),
-              static_cast<unsigned>(dataset_block_dim));
-  }
-
-  template <uint32_t PqLenV, typename Lambda>
-  static void dispatch_cagra_vpq_pq2_4_team_dim(uint32_t team_size,
-                                                uint32_t dataset_block_dim,
-                                                Lambda&& l)
-  {
-    switch (team_size) {
-      case 8:
-        switch (dataset_block_dim) {
-          case 128: std::forward<Lambda>(l).template operator()<8u, 128u, 8u, PqLenV>(); return;
-          case 256: std::forward<Lambda>(l).template operator()<8u, 256u, 8u, PqLenV>(); return;
-          case 512: std::forward<Lambda>(l).template operator()<8u, 512u, 8u, PqLenV>(); return;
-          default: break;
-        }
-        break;
-      case 16:
-        switch (dataset_block_dim) {
-          case 128: std::forward<Lambda>(l).template operator()<16u, 128u, 8u, PqLenV>(); return;
-          case 256: std::forward<Lambda>(l).template operator()<16u, 256u, 8u, PqLenV>(); return;
-          case 512: std::forward<Lambda>(l).template operator()<16u, 512u, 8u, PqLenV>(); return;
-          default: break;
-        }
-        break;
-      case 32:
-        switch (dataset_block_dim) {
-          case 128: std::forward<Lambda>(l).template operator()<32u, 128u, 8u, PqLenV>(); return;
-          case 256: std::forward<Lambda>(l).template operator()<32u, 256u, 8u, PqLenV>(); return;
-          case 512: std::forward<Lambda>(l).template operator()<32u, 512u, 8u, PqLenV>(); return;
-          default: break;
-        }
-        break;
-      default: break;
-    }
-    RAFT_FAIL(
-      "Unsupported VPQ pq_len=%u team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u",
-      static_cast<unsigned>(PqLenV),
-      static_cast<unsigned>(team_size),
-      static_cast<unsigned>(dataset_block_dim));
-  }
-
-  template <typename Lambda>
-  static void dispatch_cagra_vpq_pq8_team_dim(uint32_t team_size,
-                                              uint32_t dataset_block_dim,
-                                              Lambda&& l)
-  {
-    switch (team_size) {
-      case 4:
-        switch (dataset_block_dim) {
-          case 128: std::forward<Lambda>(l).template operator()<4u, 128u, 8u, 8u>(); return;
-          default: break;
-        }
-        break;
-      case 8:
-        switch (dataset_block_dim) {
-          case 256: std::forward<Lambda>(l).template operator()<8u, 256u, 8u, 8u>(); return;
-          default: break;
-        }
-        break;
-      case 16:
-        switch (dataset_block_dim) {
-          case 512: std::forward<Lambda>(l).template operator()<16u, 512u, 8u, 8u>(); return;
-          default: break;
-        }
-        break;
-      case 32:
-        switch (dataset_block_dim) {
-          case 1024: std::forward<Lambda>(l).template operator()<32u, 1024u, 8u, 8u>(); return;
-          default: break;
-        }
-        break;
-      default: break;
-    }
-    RAFT_FAIL(
-      "Unsupported VPQ pq_len=8 team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u",
-      static_cast<unsigned>(team_size),
-      static_cast<unsigned>(dataset_block_dim));
-  }
-
   template <typename Lambda>
   static void dispatch_cagra_vpq_team_dim(uint32_t team_size,
                                           uint32_t dataset_block_dim,
@@ -384,15 +270,11 @@ struct CagraPlannerBase : AlgorithmPlanner {
   // template parameters; CAGRA reads team_size / dataset_block_dim from the host descriptor at
   // planning time.
   template <typename Lambda>
-  static void dispatch_cagra_team_dim(uint32_t team_size, uint32_t dataset_block_dim, Lambda&& l)
+  static void dispatch_cagra_standard_team_dim(uint32_t team_size,
+                                               uint32_t dataset_block_dim,
+                                               Lambda&& l)
   {
     switch (team_size) {
-      case 4:
-        switch (dataset_block_dim) {
-          case 128: std::forward<Lambda>(l).template operator()<4u, 128u>(); return;
-          default: break;
-        }
-        break;
       case 8:
         switch (dataset_block_dim) {
           case 128: std::forward<Lambda>(l).template operator()<8u, 128u>(); return;
@@ -414,17 +296,93 @@ struct CagraPlannerBase : AlgorithmPlanner {
           case 128: std::forward<Lambda>(l).template operator()<32u, 128u>(); return;
           case 256: std::forward<Lambda>(l).template operator()<32u, 256u>(); return;
           case 512: std::forward<Lambda>(l).template operator()<32u, 512u>(); return;
-          case 1024: std::forward<Lambda>(l).template operator()<32u, 1024u>(); return;
           default: break;
         }
         break;
       default: break;
     }
-    RAFT_FAIL("Unsupported team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u",
+    RAFT_FAIL("Unsupported standard team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u",
               static_cast<unsigned>(team_size),
               static_cast<unsigned>(dataset_block_dim));
   }
 
+  template <uint32_t PqLenV, typename Lambda>
+  static void dispatch_cagra_vpq_pq2_4_team_dim(uint32_t team_size,
+                                                uint32_t dataset_block_dim,
+                                                Lambda&& l)
+  {
+    switch (team_size) {
+      case 8:
+        switch (dataset_block_dim) {
+          case 128: std::forward<Lambda>(l).template operator()<8u, 128u, 8u, PqLenV>(); return;
+          case 256: std::forward<Lambda>(l).template operator()<8u, 256u, 8u, PqLenV>(); return;
+          case 512: std::forward<Lambda>(l).template operator()<8u, 512u, 8u, PqLenV>(); return;
+          default: break;
+        }
+        break;
+      case 16:
+        switch (dataset_block_dim) {
+          case 128: std::forward<Lambda>(l).template operator()<16u, 128u, 8u, PqLenV>(); return;
+          case 256: std::forward<Lambda>(l).template operator()<16u, 256u, 8u, PqLenV>(); return;
+          case 512: std::forward<Lambda>(l).template operator()<16u, 512u, 8u, PqLenV>(); return;
+          default: break;
+        }
+        break;
+      case 32:
+        switch (dataset_block_dim) {
+          case 128: std::forward<Lambda>(l).template operator()<32u, 128u, 8u, PqLenV>(); return;
+          case 256: std::forward<Lambda>(l).template operator()<32u, 256u, 8u, PqLenV>(); return;
+          case 512: std::forward<Lambda>(l).template operator()<32u, 512u, 8u, PqLenV>(); return;
+          default: break;
+        }
+        break;
+      default: break;
+    }
+    RAFT_FAIL(
+      "Unsupported VPQ pq_len=%u team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u",
+      static_cast<unsigned>(PqLenV),
+      static_cast<unsigned>(team_size),
+      static_cast<unsigned>(dataset_block_dim));
+  }
+
+  template <typename Lambda>
+  static void dispatch_cagra_vpq_pq8_team_dim(uint32_t team_size,
+                                              uint32_t dataset_block_dim,
+                                              Lambda&& l)
+  {
+    switch (team_size) {
+      case 4:
+        switch (dataset_block_dim) {
+          case 128: std::forward<Lambda>(l).template operator()<4u, 128u, 8u, 8u>(); return;
+          default: break;
+        }
+        break;
+      case 8:
+        switch (dataset_block_dim) {
+          case 256: std::forward<Lambda>(l).template operator()<8u, 256u, 8u, 8u>(); return;
+          default: break;
+        }
+        break;
+      case 16:
+        switch (dataset_block_dim) {
+          case 512: std::forward<Lambda>(l).template operator()<16u, 512u, 8u, 8u>(); return;
+          default: break;
+        }
+        break;
+      case 32:
+        switch (dataset_block_dim) {
+          case 1024: std::forward<Lambda>(l).template operator()<32u, 1024u, 8u, 8u>(); return;
+          default: break;
+        }
+        break;
+      default: break;
+    }
+    RAFT_FAIL(
+      "Unsupported VPQ pq_len=8 team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u",
+      static_cast<unsigned>(team_size),
+      static_cast<unsigned>(dataset_block_dim));
+  }
+
   void add_sample_filter_device_function()
   {
     if constexpr (!std::is_same_v<SampleFilterJitTag_, tag_cagra_jit_sample_filter_link_absent>) {

From 0eef38fe66032618b645f452373c1c426eabcdbb Mon Sep 17 00:00:00 2001
From: enp1s0 <ootomo.h.x86@gmail.com>
Date: Wed, 10 Jun 2026 16:58:20 +0900
Subject: [PATCH 116/119] Fix ldg_cg use

---
 .../detail/cagra/jit_lto_kernels/compute_distance_impl.cuh | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh
index ea817110e7..a6dd0495fd 100644
--- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh
@@ -127,12 +127,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker_impl(
     for (std::uint32_t e = 0; e < nelem; e++) {
       const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen;
       if (k >= n_subspace) break;
-      if constexpr (std::is_same_v<PQ_CODEBOOK_LOAD_T, uint32_t>) {
-        device::ldg_cg(pq_codes[e],
-                       reinterpret_cast<const PQ_CODEBOOK_LOAD_T*>(dataset_ptr + 4 + k));
-      } else {
-        pq_codes[e] = *reinterpret_cast<const PQ_CODEBOOK_LOAD_T*>(dataset_ptr + 4 + k);
-      }
+      device::ldg_cg(pq_codes[e], reinterpret_cast<const PQ_CODEBOOK_LOAD_T*>(dataset_ptr + 4 + k));
     }
     //
     if constexpr (PQ_LEN % 2 == 0) {

From b5355d94ed02deb4eb72fbe39a9689415e71bf0e Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Mon, 15 Jun 2026 06:53:31 -0700
Subject: [PATCH 117/119] Add shuffle_dataset option for iterative CAGRA-Q
 graph build

Optionally permute the VPQ-compressed dataset before the iterative build
loop to break spatial locality, then unshuffle the resulting graph back to
the original node ordering. Adds the shuffle_dataset (and smem_dtype)
build-search params and bench parsing.
---
 .../src/cuvs/cuvs_ann_bench_param_parser.h    |  21 +++
 cpp/include/cuvs/neighbors/cagra.hpp          |  17 ++
 .../neighbors/detail/cagra/cagra_build.cuh    | 156 +++++++++++++++++-
 3 files changed, 190 insertions(+), 4 deletions(-)

diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
index b636465949..9977484742 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
@@ -423,6 +423,27 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index
             arg.hashmap_mode = cuvs::neighbors::cagra::hash_mode::AUTO;
           }
         }
+        // Whether to shuffle the (compressed) dataset before the iterative build loop.
+        if (build_search_conf.contains("shuffle_dataset")) {
+          arg.shuffle_dataset = build_search_conf.at("shuffle_dataset").get<bool>();
+        }
+        // Precision of the codebook/query in shared memory for the VPQ search used during
+        // the iterative build. Accepts an integer code (0=F16, 1=E5M2) or a string.
+        if (build_search_conf.contains("smem_dtype")) {
+          const auto& sd = build_search_conf.at("smem_dtype");
+          if (sd.is_number_integer()) {
+            arg.smem_dtype = static_cast<cuvs::neighbors::cagra::internal_dtype>(sd.get<int>());
+          } else {
+            std::string s = sd.get<std::string>();
+            if (s == "f16" || s == "F16" || s == "fp16" || s == "half") {
+              arg.smem_dtype = cuvs::neighbors::cagra::internal_dtype::F16;
+            } else if (s == "e5m2" || s == "E5M2" || s == "fp8") {
+              arg.smem_dtype = cuvs::neighbors::cagra::internal_dtype::E5M2;
+            } else {
+              throw std::runtime_error("invalid value for build_search smem_dtype: " + s);
+            }
+          }
+        }
       }
     },
     params.graph_build_params);
diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index daac54946e..fd96b2d052 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -166,6 +166,23 @@ struct iterative_search_params : cuvs::neighbors::cagra::search_params {
    */
   std::optional<cuvs::neighbors::vpq_params> build_compression = std::nullopt;
 
+  /**
+   * Whether to shuffle the dataset before building the graph.
+   *
+   * When enabled, the compressed dataset is randomly permuted before graph
+   * construction begins. This can improve graph quality by breaking any
+   * spatial locality in the original dataset ordering that might cause
+   * the iterative builder to get stuck in local optima during early
+   * iterations.
+   *
+   * After graph construction, the node indices in the graph are remapped
+   * back to the original dataset ordering.
+   *
+   * Only applies when compression is enabled (build_compression or
+   * index_params::compression is set).
+   */
+  bool shuffle_dataset = true;
+
   iterative_search_params()
   {
     this->search_width   = 1;
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 00822330ce..f0396063f3 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -17,8 +17,17 @@
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/matrix/gather.cuh>
+#include <raft/random/permute.cuh>
+#include <raft/util/cuda_utils.cuh>
 #include <raft/util/integer_utils.hpp>
 
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/scatter.h>
+#include <thrust/transform.h>
+
 #include <cuvs/cluster/kmeans.hpp>
 #include <cuvs/distance/distance.hpp>
 #include <cuvs/neighbors/cagra.hpp>
@@ -51,6 +60,32 @@ namespace cuvs::neighbors::cagra::detail {
 constexpr double to_mib(size_t bytes) { return static_cast<double>(bytes) / (1 << 20); }
 constexpr double to_gib(size_t bytes) { return static_cast<double>(bytes) / (1 << 30); }
 
+// Functor to remap indices using a permutation lookup table
+template <typename IdxT>
+struct remap_indices_op {
+  const IdxT* perm;
+  __host__ __device__ IdxT operator()(IdxT idx) const { return perm[idx]; }
+};
+
+// Functor to compute scattered output index for graph row reordering
+template <typename IdxT>
+struct graph_scatter_index_op {
+  const IdxT* perm;
+  int64_t degree;
+  __host__ __device__ int64_t operator()(int64_t idx) const
+  {
+    int64_t row = idx / degree;
+    int64_t col = idx % degree;
+    return static_cast<int64_t>(perm[row]) * degree + col;
+  }
+};
+
+// Functor to convert int64_t to IdxT
+template <typename IdxT>
+struct cast_to_idx_op {
+  __host__ __device__ IdxT operator()(int64_t v) const { return static_cast<IdxT>(v); }
+};
+
 template <typename T, typename IdxT>
 void check_graph_degree(size_t& intermediate_degree, size_t& graph_degree, size_t dataset_size)
 {
@@ -2203,15 +2238,80 @@ auto iterative_build_graph(
   // Generate the compressed index once if compression is enabled
   const uint64_t dataset_dim = dev_dataset.extent(1);
   std::optional<index<T, IdxT>> idx_opt;
+
+  // Optional shuffle permutation for randomizing dataset order during build.
+  // dev_perm[shuffled_idx] = original_idx; it stays empty unless the dataset is shuffled
+  // below, and is used to unshuffle the graph after build.
+  auto dev_perm         = raft::make_device_vector<IdxT, int64_t>(res, 0);
+  bool dataset_shuffled = false;
+
+  // Warn if shuffle is requested but compression is not enabled
+  if (iter_params.shuffle_dataset && !build_compression.has_value()) {
+    RAFT_LOG_WARN("shuffle_dataset is only supported with compression enabled; ignoring");
+  }
+
   if (build_compression.has_value()) {
     auto start = std::chrono::high_resolution_clock::now();
     RAFT_EXPECTS(params.metric == cuvs::distance::DistanceType::L2Expanded,
                  "VPQ compression is only supported with L2Expanded distance mertric");
+
+    // Build the VPQ compressed dataset
+    auto vpq_dset =
+      cuvs::preprocessing::quantize::pq::vpq_build(res, *build_compression, dev_dataset);
+
+    // Optionally shuffle the compressed dataset to break spatial locality
+    if (iter_params.shuffle_dataset) {
+      auto shuffle_start = std::chrono::high_resolution_clock::now();
+      RAFT_LOG_INFO("Shuffling compressed dataset to randomize build order...");
+
+      auto stream        = raft::resource::get_cuda_stream(res);
+      const auto n_rows  = vpq_dset.data.extent(0);
+      const auto row_len = vpq_dset.data.extent(1);
+
+      // Generate random permutation: perm[i] = source index for output row i
+      // i.e., shuffled_data[i] = original_data[perm[i]]
+      // So perm maps: shuffled_idx -> original_idx
+      // Use int64_t for permutation to match vpq_dataset's index type
+      auto dev_perm_i64 = raft::make_device_vector<int64_t, int64_t>(res, n_rows);
+
+      // Use legacy permute API to generate permutation indices only (out=nullptr, in=nullptr)
+      // This just fills dev_perm_i64 with a random permutation of [0, n_rows)
+      raft::random::permute<uint8_t, int64_t, int64_t>(dev_perm_i64.data_handle(),
+                                                       static_cast<uint8_t*>(nullptr),
+                                                       static_cast<const uint8_t*>(nullptr),
+                                                       static_cast<int64_t>(row_len),
+                                                       static_cast<int64_t>(n_rows),
+                                                       true,
+                                                       stream);
+
+      // Apply permutation to VPQ data: shuffled_data[i] = original_data[perm[i]]
+      // Use in-place gather which reorders rows according to the map
+      raft::matrix::gather(res, vpq_dset.data.view(), raft::make_const_mdspan(dev_perm_i64.view()));
+
+      // Store perm as IdxT for graph unshuffling later
+      // perm[shuffled_idx] = original_idx
+      // This is used for:
+      // 1. Remapping neighbor values: neighbor j (shuffled) -> perm[j] (original)
+      // 2. Reordering rows: row i (for shuffled node i) -> position perm[i] (original node)
+      dev_perm = raft::make_device_vector<IdxT, int64_t>(res, n_rows);
+      cast_to_idx_op<IdxT> cast_op;
+      thrust::transform(raft::resource::get_thrust_policy(res),
+                        dev_perm_i64.data_handle(),
+                        dev_perm_i64.data_handle() + n_rows,
+                        dev_perm.data_handle(),
+                        cast_op);
+
+      dataset_shuffled = true;
+
+      auto shuffle_end = std::chrono::high_resolution_clock::now();
+      auto shuffle_ms =
+        std::chrono::duration_cast<std::chrono::milliseconds>(shuffle_end - shuffle_start).count();
+      RAFT_LOG_INFO("# Dataset shuffle time: %.3lf sec", (double)shuffle_ms / 1000);
+    }
+
     idx_opt.emplace(res, params.metric);
-    idx_opt->update_dataset(
-      res,
-      // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later
-      cuvs::preprocessing::quantize::pq::vpq_build(res, *build_compression, dev_dataset));
+    // Use the (optionally shuffled) compressed dataset built above.
+    idx_opt->update_dataset(res, std::move(vpq_dset));
     auto end        = std::chrono::high_resolution_clock::now();
     auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
     RAFT_LOG_INFO("# VPQ compression time: %.3lf sec", (double)elapsed_ms / 1000);
@@ -2325,6 +2425,54 @@ auto iterative_build_graph(
   // (once for the build loop and once in build()'s shared tail). We could avoid this by returning
   // the index directly (with its VPQ dataset and device-side graph) instead of just the host graph.
   auto stream = raft::resource::get_cuda_stream(res);
+
+  // If the dataset was shuffled, we need to unshuffle the graph:
+  // Recall: perm[shuffled_idx] = original_idx (stored in dev_perm)
+  // 1. Remap neighbor indices from shuffled space to original space
+  // 2. Reorder rows from shuffled order to original order
+  if (dataset_shuffled) {
+    auto unshuffle_start = std::chrono::high_resolution_clock::now();
+    RAFT_LOG_INFO("Unshuffling graph to restore original dataset ordering...");
+
+    const auto n_rows = dev_graph.extent(0);
+    const auto degree = dev_graph.extent(1);
+
+    // Step 1: Remap all neighbor indices using perm
+    // Each entry of dev_graph is a neighbor id s in shuffled space; replace it with perm[s]
+    remap_indices_op<IdxT> remap_op{dev_perm.data_handle()};
+    thrust::transform(raft::resource::get_thrust_policy(res),
+                      dev_graph.data_handle(),
+                      dev_graph.data_handle() + n_rows * degree,
+                      dev_graph.data_handle(),
+                      remap_op);
+
+    // Step 2: Reorder rows back to original order
+    // Row i in dev_graph is for shuffled node i, which is original node perm[i].
+    // We want this row to be at position perm[i] in the final graph.
+    // scatter: output[map[i]] = input[i], so map[i] = perm[i]
+    auto dev_unshuffled_graph = raft::make_device_matrix<IdxT, int64_t>(res, n_rows, degree);
+
+    // Use thrust::scatter to reorder: for each row i, place it at position perm[i]
+    // We scatter row-by-row conceptually, but do it element-wise with computed output indices
+    graph_scatter_index_op<IdxT> scatter_idx_op{dev_perm.data_handle(), degree};
+    auto output_indices =
+      thrust::make_transform_iterator(thrust::make_counting_iterator<int64_t>(0), scatter_idx_op);
+
+    thrust::scatter(raft::resource::get_thrust_policy(res),
+                    dev_graph.data_handle(),
+                    dev_graph.data_handle() + n_rows * degree,
+                    output_indices,
+                    dev_unshuffled_graph.data_handle());
+
+    dev_graph = std::move(dev_unshuffled_graph);
+
+    auto unshuffle_end = std::chrono::high_resolution_clock::now();
+    auto unshuffle_ms =
+      std::chrono::duration_cast<std::chrono::milliseconds>(unshuffle_end - unshuffle_start)
+        .count();
+    RAFT_LOG_INFO("# Graph unshuffle time: %.3lf sec", (double)unshuffle_ms / 1000);
+  }
+
   cagra_graph = raft::make_host_matrix<IdxT, int64_t>(dev_graph.extent(0), dev_graph.extent(1));
   raft::copy(cagra_graph.data_handle(),
              dev_graph.data_handle(),

From 23c811b44e857f098d7d3ba9ce0c1c1944d680d4 Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Mon, 15 Jun 2026 09:40:27 -0700
Subject: [PATCH 118/119] fix oob due to in place raft shuffle

---
 cpp/src/neighbors/detail/cagra/cagra_build.cuh    | 15 ++++++++++++---
 .../detail/cagra/search_multi_kernel.cuh          |  6 +++++-
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index f0396063f3..ef91de6df6 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -2284,9 +2284,18 @@ auto iterative_build_graph(
                                                        true,
                                                        stream);
 
-      // Apply permutation to VPQ data: shuffled_data[i] = original_data[perm[i]]
-      // Use in-place gather which reorders rows according to the map
-      raft::matrix::gather(res, vpq_dset.data.view(), raft::make_const_mdspan(dev_perm_i64.view()));
+      // Apply permutation to VPQ data: shuffled_data[i] = original_data[perm[i]].
+      // NOTE: use an out-of-place device gather into a temporary buffer rather than the
+      // in-place gather overload. The in-place overload uses a host-orchestrated,
+      // double-buffered, multi-stream path that races here and triggers an asynchronous
+      // illegal memory access (the crash disappears under CUDA_LAUNCH_BLOCKING=1).
+      auto shuffled_data = raft::make_device_matrix<uint8_t, int64_t>(
+        res, vpq_dset.data.extent(0), vpq_dset.data.extent(1));
+      raft::matrix::gather(res,
+                           raft::make_const_mdspan(vpq_dset.data.view()),
+                           raft::make_const_mdspan(dev_perm_i64.view()),
+                           shuffled_data.view());
+      vpq_dset.data = std::move(shuffled_data);
 
       // Store perm as IdxT for graph unshuffling later
       // perm[shuffled_idx] = original_idx
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
index e09ef82a39..f1c7305833 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -511,7 +511,11 @@ struct search
                       hashmap.data(),
                       hash_bitlen,
                       stream,
-                      static_cast<IndexT>(this->dataset_size));
+                      // Bound random seed selection to the graph size, not the dataset size.
+                      // During iterative / CAGRA-Q build the graph is smaller than the dataset,
+                      // so using dataset_size here selects seeds that index past the graph end
+                      // (out-of-bounds access). See https://github.com/rapidsai/cuvs/pull/1780.
+                      static_cast<IndexT>(graph.extent(0)));
 
     std::shared_ptr<AlgorithmLauncher> compute_distance_to_child_nodes_launcher =
       make_cagra_multi_kernel_jit_launcher<DATA_T,

From 8fc10ac0d8c649371aacadd6fd2508e9875d4a70 Mon Sep 17 00:00:00 2001
From: Irina Reshodko <ireshodko@nvidia.com>
Date: Wed, 1 Jul 2026 02:55:11 -0700
Subject: [PATCH 119/119] made the growth-phase build search itopk parameter a
 tunable parameter

---
 cpp/include/cuvs/neighbors/cagra.hpp          |  5 ++++
 .../neighbors/detail/cagra/cagra_build.cuh    | 23 +++++++++++++++++--
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index 532423253c..1e6e87b2e5 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -190,6 +190,11 @@ struct iterative_search_params : cuvs::neighbors::cagra::search_params {
   {
     this->search_width   = 1;
     this->max_iterations = 8;
+    // itopk_size controls the search during the *growing* iterations of the build loop.
+    // 0 (default) means auto-select per iteration (max(graph_degree + 32, 128)); a nonzero
+    // value overrides it for the growing iterations. The final iteration always uses a fixed
+    // itopk tied to the output topk, regardless of this value.
+    this->itopk_size = 0;
   }
 };
 
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index ef91de6df6..10e44377e3 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -2209,6 +2209,17 @@ auto iterative_build_graph(
   RAFT_LOG_DEBUG("# graph_degree = %lu", (uint64_t)graph_degree);
   RAFT_LOG_DEBUG("# topk = %lu", (uint64_t)topk);
 
+  // A fixed itopk_size (0 = auto) governs the growing iterations, which build graphs of degree
+  // ~graph_degree/2 and thus request topk ~= graph_degree/2 + 1; the search planner requires
+  // topk <= itopk_size. (The full-size iterations override itopk internally, so they are not
+  // constrained by this value.)
+  RAFT_EXPECTS(iter_params.itopk_size == 0 ||
+                 iter_params.itopk_size >= graph_degree / 2 + 1,
+               "iterative build search itopk_size (%zu) must be 0 (auto) or >= "
+               "graph_degree / 2 + 1 (%zu)",
+               (size_t)iter_params.itopk_size,
+               (size_t)(graph_degree / 2 + 1));
+
   // Create an initial graph. The initial graph created here is not suitable for
   // searching, but connectivity is guaranteed.
   auto offset = raft::make_host_vector<IdxT, int64_t>(small_graph_degree);
@@ -2344,8 +2355,16 @@ auto iterative_build_graph(
     // The search count (topk) is set to the next graph degree + 1, because
     // pruning is not used except in the last iteration.
     // (*) The appropriate setting for itopk_size requires careful consideration.
-    auto curr_topk       = next_graph_degree + 1;
-    auto curr_itopk_size = std::max(next_graph_degree + 32, (uint64_t)128);
+    auto curr_topk = next_graph_degree + 1;
+    // The configurable itopk (iter_params.itopk_size, 0 = auto) applies only to the true growing
+    // iterations, where the degree being built is small_graph_degree. When the graph reaches its
+    // full size the search builds a graph_degree-degree graph (topk = graph_degree + 1); that
+    // iteration needs a larger itopk, so it overrides the configured value with the auto formula.
+    // The final iteration (flag_last) uses a fixed itopk tied to the output topk.
+    auto curr_itopk_size =
+      (iter_params.itopk_size > 0 && next_graph_degree == small_graph_degree)
+        ? (uint64_t)iter_params.itopk_size
+        : std::max(next_graph_degree + 32, (uint64_t)128);
     if (flag_last) {
       curr_topk       = topk;
       curr_itopk_size = curr_topk + 32;