diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index 6dd19b6781..62e890fb09 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -897,6 +897,7 @@ struct index : cuvs::neighbors::index {
  * - L2
  * - InnerProduct (currently only supported with IVF-PQ as the build algorithm)
  * - CosineExpanded
+ * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm)
  *
  * Usage example:
  * @code{.cpp}
@@ -935,6 +936,7 @@ auto build(raft::resources const& res,
  * - L2
  * - InnerProduct (currently only supported with IVF-PQ as the build algorithm)
  * - CosineExpanded
+ * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm)
  *
  * Usage example:
  * @code{.cpp}
@@ -973,6 +975,7 @@ auto build(raft::resources const& res,
  * - L2
  * - InnerProduct (currently only supported with IVF-PQ as the build algorithm)
  * - CosineExpanded (dataset norms are computed as float regardless of input data type)
+ * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm)
  *
  * Usage example:
  * @code{.cpp}
@@ -1010,6 +1013,7 @@ auto build(raft::resources const& res,
  * The following distance metrics are supported:
  * - L2
  * - CosineExpanded (dataset norms are computed as float regardless of input data type)
+ * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm)
  *
  * Usage example:
  * @code{.cpp}
@@ -1047,6 +1051,9 @@ auto build(raft::resources const& res,
  * The following distance metrics are supported:
  * - L2
  * - CosineExpanded (dataset norms are computed as float regardless of input data type)
+ * - L1
+ * - BitwiseHamming (currently only supported with NN-Descent and Iterative Search as the build
+ * algorithm, and only for int8_t and uint8_t data types)
  *
  * Usage example:
  * @code{.cpp}
@@ -1085,6 +1092,9 @@ auto build(raft::resources const& res,
  * - L2
  * - InnerProduct (currently only supported with IVF-PQ as the build algorithm)
  * - CosineExpanded (dataset norms are computed as float regardless of input data type)
+ * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm)
+ * - BitwiseHamming (currently only supported with NN-Descent and Iterative Search as the build
+ * algorithm, and only for int8_t and uint8_t data types)
  *
  * Usage example:
  * @code{.cpp}
@@ -1123,6 +1133,9 @@ auto build(raft::resources const& res,
  * - L2
  * - InnerProduct (currently only supported with IVF-PQ as the build algorithm)
  * - CosineExpanded (dataset norms are computed as float regardless of input data type)
+ * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm)
+ * - BitwiseHamming (currently only supported with NN-Descent and Iterative Search as the build
+ * algorithm, and only for int8_t and uint8_t data types)
  *
  * Usage example:
  * @code{.cpp}
@@ -1161,6 +1174,9 @@ auto build(raft::resources const& res,
  * - L2
  * - InnerProduct (currently only supported with IVF-PQ as the build algorithm)
  * - CosineExpanded (dataset norms are computed as float regardless of input data type)
+ * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm)
+ * - BitwiseHamming (currently only supported with NN-Descent and Iterative Search as the build
+ * algorithm, and only for int8_t and uint8_t data types)
  *
  * Usage example:
  * @code{.cpp}
diff --git a/cpp/include/cuvs/neighbors/nn_descent.hpp b/cpp/include/cuvs/neighbors/nn_descent.hpp
index 44fbaed592..8e94edfaa5 100644
--- a/cpp/include/cuvs/neighbors/nn_descent.hpp
+++ b/cpp/include/cuvs/neighbors/nn_descent.hpp
@@ -218,6 +218,7 @@ struct index : cuvs::neighbors::index {
  * - L2SqrtExpanded
  * - CosineExpanded
  * - InnerProduct
+ * - L1
  *
  * Usage example:
  * @code{.cpp}
@@ -254,6 +255,7 @@ auto build(raft::resources const& res,
  * - L2SqrtExpanded
  * - CosineExpanded
  * - InnerProduct
+ * - L1
  *
  * Usage example:
  * @code{.cpp}
@@ -292,6 +294,7 @@ auto build(raft::resources const& res,
  * - L2SqrtExpanded
  * - CosineExpanded
  * - InnerProduct
+ * - L1
  *
  * Usage example:
  * @code{.cpp}
@@ -328,6 +331,7 @@ auto build(raft::resources const& res,
  * - L2SqrtExpanded
  * - CosineExpanded
  * - InnerProduct
+ * - L1
  *
  * Usage example:
  * @code{.cpp}
@@ -366,6 +370,7 @@ auto build(raft::resources const& res,
  * - L2SqrtExpanded
  * - CosineExpanded
  * - InnerProduct
+ * - L1
  * - BitwiseHamming
  *
  * Usage example:
@@ -403,6 +408,7 @@ auto build(raft::resources const& res,
  * - L2SqrtExpanded
  * - CosineExpanded
  * - InnerProduct
+ * - L1
  * - BitwiseHamming
  *
  * Usage example:
@@ -442,6 +448,7 @@ auto build(raft::resources const& res,
  * - L2SqrtExpanded
  * - CosineExpanded
  * - InnerProduct
+ * - L1
  * - BitwiseHamming
  *
  * Usage example:
@@ -479,6 +486,7 @@ auto build(raft::resources const& res,
  * - L2SqrtExpanded
  * - CosineExpanded
  * - InnerProduct
+ * - L1
  * - BitwiseHamming
  *
  * Usage example:
diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh
index a1eb829569..a0561c7921 100644
--- a/cpp/src/neighbors/detail/nn_descent.cuh
+++ b/cpp/src/neighbors/detail/nn_descent.cuh
@@ -122,6 +122,25 @@ constexpr __host__ __device__ __forceinline__ int skew_dim(int ndim)
   }
 }
 
+template <typename T>
+struct dtype_traits;
+
+template <>
+struct dtype_traits<float> {
+  static constexpr int APAD           = 4;
+  static constexpr int BPAD           = 4;
+  static constexpr int TILE_COL_WIDTH = 32;
+  static __device__ __forceinline__ float to_float(float v) { return v; }
+};
+
+template <>
+struct dtype_traits<__half> {
+  static constexpr int APAD           = 8;
+  static constexpr int BPAD           = 8;
+  static constexpr int TILE_COL_WIDTH = 64;
+  static __device__ __forceinline__ float to_float(__half v) { return __half2float(v); }
+};
+
 template <typename T>
 __device__ __forceinline__ ResultItem<T> xor_swap(ResultItem<T> x, int mask, int dir)
 {
@@ -272,7 +291,8 @@ RAFT_KERNEL preprocess_data_kernel(
   for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) {
     int idx = step * raft::warp_size() + threadIdx.x;
     if (idx < dim) {
-      if (metric == cuvs::distance::DistanceType::InnerProduct) {
+      if (metric == cuvs::distance::DistanceType::InnerProduct ||
+          metric == cuvs::distance::DistanceType::L1) {
         output_data[list_id * dim + idx] = input_data[(size_t)blockIdx.x * dim + idx];
       } else if (metric == cuvs::distance::DistanceType::CosineExpanded) {
         output_data[list_id * dim + idx] =
@@ -518,7 +538,8 @@ __device__ __forceinline__ void calculate_metric(float* s_distances,
         for (int d = 0; d < data_dim; d++) {
           s_distances[i] += __popc(static_cast<uint32_t>(data_n1[d] ^ data_n2[d]) & 0xff);
         }
-      } else {  // L2Expanded or L2SqrtExpanded
+      } else if (metric == cuvs::distance::DistanceType::L2Expanded ||
+                 metric == cuvs::distance::DistanceType::L2SqrtExpanded) {
         s_distances[i] =
           l2_norms[row_neighbors[row_id]] + l2_norms[col_neighbors[col_id]] - 2.0 * s_distances[i];
         // for fp32 vs fp16 precision differences resulting in negative distances when distance
@@ -535,13 +556,28 @@ __device__ __forceinline__ void calculate_metric(float* s_distances,
   }
 }
 
+struct DistAccumulator {
+  cuvs::distance::DistanceType metric;
+  __device__ __forceinline__ float operator()(float a, float b) const
+  {
+    if (metric == cuvs::distance::DistanceType::L1) { return raft::abs(a - b); }
+    return a * b;
+  }
+};
+
 // launch_bounds here denote BLOCK_SIZE = 512 and MIN_BLOCKS_PER_SM = 4
 // Per
 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications,
 // MAX_RESIDENT_THREAD_PER_SM = BLOCK_SIZE * BLOCKS_PER_SM = 2048
 // For architectures 750 and 860 (890), the values for MAX_RESIDENT_THREAD_PER_SM
 // is 1024 and 1536 respectively, which means the bounds don't work anymore
-template <typename Index_t, typename ID_t = InternalID_t<Index_t>, typename DistEpilogue_t>
+// SIMT kernel: scalar element-wise distance computation.
+// Used for fp32 data (all metrics) and fp16 data with L1 distance (which cannot use tensor cores).
+template <typename Data_t,
+          typename Index_t,
+          typename ID_t = InternalID_t<Index_t>,
+          typename DistEpilogue_t>
+  requires(std::is_same_v<Data_t, float> || std::is_same_v<Data_t, __half>)
 RAFT_KERNEL
 #ifdef __CUDA_ARCH__
 // Use minBlocksPerMultiprocessor = 4 on specific arches
@@ -552,32 +588,31 @@ __launch_bounds__(BLOCK_SIZE, 4)
 __launch_bounds__(BLOCK_SIZE)
 #endif
 #endif
-  local_join_kernel(const Index_t* graph_new,
-                    const Index_t* rev_graph_new,
-                    const int2* sizes_new,
-                    const Index_t* graph_old,
-                    const Index_t* rev_graph_old,
-                    const int2* sizes_old,
-                    const int width,
-                    const float* data,
-                    const int data_dim,
-                    ID_t* graph,
-                    DistData_t* dists,
-                    int graph_width,
-                    int* locks,
-                    DistData_t* l2_norms,
-                    cuvs::distance::DistanceType metric,
-                    DistEpilogue_t dist_epilogue)
+  local_join_kernel_simt(const Index_t* graph_new,
+                         const Index_t* rev_graph_new,
+                         const int2* sizes_new,
+                         const Index_t* graph_old,
+                         const Index_t* rev_graph_old,
+                         const int2* sizes_old,
+                         const int width,
+                         const Data_t* data,
+                         const int data_dim,
+                         ID_t* graph,
+                         DistData_t* dists,
+                         int graph_width,
+                         int* locks,
+                         DistData_t* l2_norms,
+                         cuvs::distance::DistanceType metric,
+                         DistEpilogue_t dist_epilogue)
 {
 #if (__CUDA_ARCH__ >= 700)
-  using namespace nvcuda;
   __shared__ int s_list[MAX_NUM_BI_SAMPLES * 2];
 
-  constexpr int APAD           = 4;
-  constexpr int BPAD           = 4;
-  constexpr int TILE_COL_WIDTH = 32;
-  __shared__ float s_nv[MAX_NUM_BI_SAMPLES][TILE_COL_WIDTH + APAD];
-  __shared__ float s_ov[MAX_NUM_BI_SAMPLES][TILE_COL_WIDTH + BPAD];
+  constexpr int APAD           = dtype_traits<Data_t>::APAD;
+  constexpr int BPAD           = dtype_traits<Data_t>::BPAD;
+  constexpr int TILE_COL_WIDTH = dtype_traits<Data_t>::TILE_COL_WIDTH;
+  __shared__ Data_t s_nv[MAX_NUM_BI_SAMPLES][TILE_COL_WIDTH + APAD];
+  __shared__ Data_t s_ov[MAX_NUM_BI_SAMPLES][TILE_COL_WIDTH + BPAD];
   __shared__ float s_distances[MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES];
 
   // s_distances: MAX_NUM_BI_SAMPLES x SKEWED_MAX_NUM_BI_SAMPLES, reuse the space of s_ov
@@ -635,48 +670,49 @@ __launch_bounds__(BLOCK_SIZE)
   int lane_id             = threadIdx.x % raft::warp_size();
   constexpr int num_warps = BLOCK_SIZE / raft::warp_size();
 
-  if (metric != cuvs::distance::DistanceType::BitwiseHamming) {
-    int tid = threadIdx.x;
-    for (int i = tid; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x)
-      s_distances[i] = 0.0f;
+  DistAccumulator dist_acc(metric);
 
-    __syncthreads();
+  int tid = threadIdx.x;
+  for (int i = tid; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x)
+    s_distances[i] = 0.0f;
 
-    for (int step = 0; step < raft::ceildiv(data_dim, TILE_COL_WIDTH); step++) {
-      int num_load_elems = (step == raft::ceildiv(data_dim, TILE_COL_WIDTH) - 1)
-                             ? data_dim - step * TILE_COL_WIDTH
-                             : TILE_COL_WIDTH;
+  __syncthreads();
+
+  for (int step = 0; step < raft::ceildiv(data_dim, TILE_COL_WIDTH); step++) {
+    int num_load_elems = (step == raft::ceildiv(data_dim, TILE_COL_WIDTH) - 1)
+                           ? data_dim - step * TILE_COL_WIDTH
+                           : TILE_COL_WIDTH;
 #pragma unroll
-      for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) {
-        int idx = i * num_warps + warp_id;
-        if (idx < list_new_size) {
-          size_t neighbor_id = new_neighbors[idx];
-          size_t idx_in_data = neighbor_id * data_dim;
-          load_vec(s_nv[idx],
-                   data + idx_in_data + step * TILE_COL_WIDTH,
-                   num_load_elems,
-                   TILE_COL_WIDTH,
-                   lane_id);
-        }
+    for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) {
+      int idx = i * num_warps + warp_id;
+      if (idx < list_new_size) {
+        size_t neighbor_id = new_neighbors[idx];
+        size_t idx_in_data = neighbor_id * data_dim;
+        load_vec(s_nv[idx],
+                 data + idx_in_data + step * TILE_COL_WIDTH,
+                 num_load_elems,
+                 TILE_COL_WIDTH,
+                 lane_id);
       }
-      __syncthreads();
+    }
+    __syncthreads();
 
-      // this is much faster than a warp-collaborative multiplication because MAX_NUM_BI_SAMPLES is
-      // fixed and small (64)
-      for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES;
-           i += blockDim.x) {
-        int tmp_row = i / SKEWED_MAX_NUM_BI_SAMPLES;
-        int tmp_col = i % SKEWED_MAX_NUM_BI_SAMPLES;
-        if (tmp_row < list_new_size && tmp_col < list_new_size) {
-          float acc = 0.0f;
-          for (int d = 0; d < num_load_elems; d++) {
-            acc += s_nv[tmp_row][d] * s_nv[tmp_col][d];
-          }
-          s_distances[i] += acc;
+    // this is much faster than a warp-collaborative multiplication because MAX_NUM_BI_SAMPLES is
+    // fixed and small (64)
+    for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) {
+      int tmp_row = i / SKEWED_MAX_NUM_BI_SAMPLES;
+      int tmp_col = i % SKEWED_MAX_NUM_BI_SAMPLES;
+      if (tmp_row < list_new_size && tmp_col < list_new_size) {
+        float acc = 0.0f;
+        for (int d = 0; d < num_load_elems; d++) {
+          float a = dtype_traits<Data_t>::to_float(s_nv[tmp_row][d]);
+          float b = dtype_traits<Data_t>::to_float(s_nv[tmp_col][d]);
+          acc += dist_acc(a, b);
         }
+        s_distances[i] += acc;
       }
-      __syncthreads();
     }
+    __syncthreads();
   }
   __syncthreads();
 
@@ -706,63 +742,61 @@ __launch_bounds__(BLOCK_SIZE)
 
   __syncthreads();
 
-  if (metric != cuvs::distance::DistanceType::BitwiseHamming) {
-    int tid = threadIdx.x;
-    for (int i = tid; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x)
-      s_distances[i] = 0.0f;
+  for (int i = tid; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x)
+    s_distances[i] = 0.0f;
 
-    __syncthreads();
+  __syncthreads();
 
-    for (int step = 0; step < raft::ceildiv(data_dim, TILE_COL_WIDTH); step++) {
-      int num_load_elems = (step == raft::ceildiv(data_dim, TILE_COL_WIDTH) - 1)
-                             ? data_dim - step * TILE_COL_WIDTH
-                             : TILE_COL_WIDTH;
-      if (TILE_COL_WIDTH < data_dim) {
-#pragma unroll
-        for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) {
-          int idx = i * num_warps + warp_id;
-          if (idx < list_new_size) {
-            size_t neighbor_id = new_neighbors[idx];
-            size_t idx_in_data = neighbor_id * data_dim;
-            load_vec(s_nv[idx],
-                     data + idx_in_data + step * TILE_COL_WIDTH,
-                     num_load_elems,
-                     TILE_COL_WIDTH,
-                     lane_id);
-          }
-        }
-      }
+  for (int step = 0; step < raft::ceildiv(data_dim, TILE_COL_WIDTH); step++) {
+    int num_load_elems = (step == raft::ceildiv(data_dim, TILE_COL_WIDTH) - 1)
+                           ? data_dim - step * TILE_COL_WIDTH
+                           : TILE_COL_WIDTH;
+    if (TILE_COL_WIDTH < data_dim) {
 #pragma unroll
       for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) {
         int idx = i * num_warps + warp_id;
-        if (idx < list_old_size) {
-          size_t neighbor_id = old_neighbors[idx];
+        if (idx < list_new_size) {
+          size_t neighbor_id = new_neighbors[idx];
           size_t idx_in_data = neighbor_id * data_dim;
-          load_vec(s_ov[idx],
+          load_vec(s_nv[idx],
                    data + idx_in_data + step * TILE_COL_WIDTH,
                    num_load_elems,
                    TILE_COL_WIDTH,
                    lane_id);
         }
       }
-      __syncthreads();
+    }
+#pragma unroll
+    for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) {
+      int idx = i * num_warps + warp_id;
+      if (idx < list_old_size) {
+        size_t neighbor_id = old_neighbors[idx];
+        size_t idx_in_data = neighbor_id * data_dim;
+        load_vec(s_ov[idx],
+                 data + idx_in_data + step * TILE_COL_WIDTH,
+                 num_load_elems,
+                 TILE_COL_WIDTH,
+                 lane_id);
+      }
+    }
+    __syncthreads();
 
-      // this is much faster than a warp-collaborative multiplication because MAX_NUM_BI_SAMPLES is
-      // fixed and small (64)
-      for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES;
-           i += blockDim.x) {
-        int tmp_row = i / SKEWED_MAX_NUM_BI_SAMPLES;
-        int tmp_col = i % SKEWED_MAX_NUM_BI_SAMPLES;
-        if (tmp_row < list_new_size && tmp_col < list_old_size) {
-          float acc = 0.0f;
-          for (int d = 0; d < num_load_elems; d++) {
-            acc += s_nv[tmp_row][d] * s_ov[tmp_col][d];
-          }
-          s_distances[i] += acc;
+    // this is much faster than a warp-collaborative multiplication because MAX_NUM_BI_SAMPLES is
+    // fixed and small (64)
+    for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) {
+      int tmp_row = i / SKEWED_MAX_NUM_BI_SAMPLES;
+      int tmp_col = i % SKEWED_MAX_NUM_BI_SAMPLES;
+      if (tmp_row < list_new_size && tmp_col < list_old_size) {
+        float acc = 0.0f;
+        for (int d = 0; d < num_load_elems; d++) {
+          float a = dtype_traits<Data_t>::to_float(s_nv[tmp_row][d]);
+          float b = dtype_traits<Data_t>::to_float(s_ov[tmp_col][d]);
+          acc += dist_acc(a, b);
         }
+        s_distances[i] += acc;
       }
-      __syncthreads();
     }
+    __syncthreads();
   }
   __syncthreads();
 
@@ -820,22 +854,22 @@ __launch_bounds__(BLOCK_SIZE, 4)
 __launch_bounds__(BLOCK_SIZE)
 #endif
 #endif
-  local_join_kernel(const Index_t* graph_new,
-                    const Index_t* rev_graph_new,
-                    const int2* sizes_new,
-                    const Index_t* graph_old,
-                    const Index_t* rev_graph_old,
-                    const int2* sizes_old,
-                    const int width,
-                    const __half* data,
-                    const int data_dim,
-                    ID_t* graph,
-                    DistData_t* dists,
-                    int graph_width,
-                    int* locks,
-                    DistData_t* l2_norms,
-                    cuvs::distance::DistanceType metric,
-                    DistEpilogue_t dist_epilogue)
+  local_join_kernel_wmma(const Index_t* graph_new,
+                         const Index_t* rev_graph_new,
+                         const int2* sizes_new,
+                         const Index_t* graph_old,
+                         const Index_t* rev_graph_old,
+                         const int2* sizes_old,
+                         const int width,
+                         const __half* data,
+                         const int data_dim,
+                         ID_t* graph,
+                         DistData_t* dists,
+                         int graph_width,
+                         int* locks,
+                         DistData_t* l2_norms,
+                         cuvs::distance::DistanceType metric,
+                         DistEpilogue_t dist_epilogue)
 {
 #if (__CUDA_ARCH__ >= 700)
   using namespace nvcuda;
@@ -1380,39 +1414,56 @@ void GNND<Data_t, Index_t>::local_join(cudaStream_t stream, DistEpilogue_t dist_
 {
   raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits<float>::max());
   if (d_data_float_.has_value()) {
-    local_join_kernel<<<nrow_, BLOCK_SIZE, 0, stream>>>(graph_.h_graph_new.data_handle(),
-                                                        h_rev_graph_new_.data_handle(),
-                                                        d_list_sizes_new_.data_handle(),
-                                                        h_graph_old_.data_handle(),
-                                                        h_rev_graph_old_.data_handle(),
-                                                        d_list_sizes_old_.data_handle(),
-                                                        NUM_SAMPLES,
-                                                        d_data_float_.value().data_handle(),
-                                                        ndim_,
-                                                        graph_buffer_.data_handle(),
-                                                        dists_buffer_.data_handle(),
-                                                        DEGREE_ON_DEVICE,
-                                                        d_locks_.data_handle(),
-                                                        l2_norms_.data_handle(),
-                                                        build_config_.metric,
-                                                        dist_epilogue);
+    local_join_kernel_simt<<<nrow_, BLOCK_SIZE, 0, stream>>>(graph_.h_graph_new.data_handle(),
+                                                             h_rev_graph_new_.data_handle(),
+                                                             d_list_sizes_new_.data_handle(),
+                                                             h_graph_old_.data_handle(),
+                                                             h_rev_graph_old_.data_handle(),
+                                                             d_list_sizes_old_.data_handle(),
+                                                             NUM_SAMPLES,
+                                                             d_data_float_->data_handle(),
+                                                             ndim_,
+                                                             graph_buffer_.data_handle(),
+                                                             dists_buffer_.data_handle(),
+                                                             DEGREE_ON_DEVICE,
+                                                             d_locks_.data_handle(),
+                                                             l2_norms_.data_handle(),
+                                                             build_config_.metric,
+                                                             dist_epilogue);
+  } else if (build_config_.metric == cuvs::distance::DistanceType::L1) {
+    local_join_kernel_simt<<<nrow_, BLOCK_SIZE, 0, stream>>>(graph_.h_graph_new.data_handle(),
+                                                             h_rev_graph_new_.data_handle(),
+                                                             d_list_sizes_new_.data_handle(),
+                                                             h_graph_old_.data_handle(),
+                                                             h_rev_graph_old_.data_handle(),
+                                                             d_list_sizes_old_.data_handle(),
+                                                             NUM_SAMPLES,
+                                                             d_data_half_.value().data_handle(),
+                                                             ndim_,
+                                                             graph_buffer_.data_handle(),
+                                                             dists_buffer_.data_handle(),
+                                                             DEGREE_ON_DEVICE,
+                                                             d_locks_.data_handle(),
+                                                             l2_norms_.data_handle(),
+                                                             build_config_.metric,
+                                                             dist_epilogue);
   } else {
-    local_join_kernel<<<nrow_, BLOCK_SIZE, 0, stream>>>(graph_.h_graph_new.data_handle(),
-                                                        h_rev_graph_new_.data_handle(),
-                                                        d_list_sizes_new_.data_handle(),
-                                                        h_graph_old_.data_handle(),
-                                                        h_rev_graph_old_.data_handle(),
-                                                        d_list_sizes_old_.data_handle(),
-                                                        NUM_SAMPLES,
-                                                        d_data_half_.value().data_handle(),
-                                                        ndim_,
-                                                        graph_buffer_.data_handle(),
-                                                        dists_buffer_.data_handle(),
-                                                        DEGREE_ON_DEVICE,
-                                                        d_locks_.data_handle(),
-                                                        l2_norms_.data_handle(),
-                                                        build_config_.metric,
-                                                        dist_epilogue);
+    local_join_kernel_wmma<<<nrow_, BLOCK_SIZE, 0, stream>>>(graph_.h_graph_new.data_handle(),
+                                                             h_rev_graph_new_.data_handle(),
+                                                             d_list_sizes_new_.data_handle(),
+                                                             h_graph_old_.data_handle(),
+                                                             h_rev_graph_old_.data_handle(),
+                                                             d_list_sizes_old_.data_handle(),
+                                                             NUM_SAMPLES,
+                                                             d_data_half_.value().data_handle(),
+                                                             ndim_,
+                                                             graph_buffer_.data_handle(),
+                                                             dists_buffer_.data_handle(),
+                                                             DEGREE_ON_DEVICE,
+                                                             d_locks_.data_handle(),
+                                                             l2_norms_.data_handle(),
+                                                             build_config_.metric,
+                                                             dist_epilogue);
   }
 }
 
diff --git a/cpp/src/neighbors/detail/nn_descent_gnnd.hpp b/cpp/src/neighbors/detail/nn_descent_gnnd.hpp
index b0799505f4..a2639e4f43 100644
--- a/cpp/src/neighbors/detail/nn_descent_gnnd.hpp
+++ b/cpp/src/neighbors/detail/nn_descent_gnnd.hpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -216,7 +216,7 @@ class GNND {
                          int2* list_sizes,
                          cudaStream_t stream = 0);
 
-  template <typename DistEpilogue_t = raft::identity_op>
+  template <typename DistEpilogue_t>
   void local_join(cudaStream_t stream = 0, DistEpilogue_t dist_epilogue = DistEpilogue_t{});
 
   raft::resources const& res;
@@ -264,10 +264,11 @@ inline BuildConfig get_build_config(raft::resources const& res,
                          params.metric == cuvs::distance::DistanceType::L2SqrtExpanded ||
                          params.metric == cuvs::distance::DistanceType::CosineExpanded ||
                          params.metric == cuvs::distance::DistanceType::InnerProduct ||
-                         params.metric == cuvs::distance::DistanceType::BitwiseHamming;
+                         params.metric == cuvs::distance::DistanceType::BitwiseHamming ||
+                         params.metric == cuvs::distance::DistanceType::L1;
   RAFT_EXPECTS(allowed_metrics,
                "The metric for NN Descent should be L2Expanded, L2SqrtExpanded, CosineExpanded, "
-               "InnerProduct or BitwiseHamming");
+               "InnerProduct, BitwiseHamming or L1");
   RAFT_EXPECTS(
     metric == params.metric,
     "The metrics set in nn_descent::index_params and nn_descent::index are inconsistent");
diff --git a/cpp/tests/neighbors/ann_nn_descent.cuh b/cpp/tests/neighbors/ann_nn_descent.cuh
index 7db6a523e9..568cebbc23 100644
--- a/cpp/tests/neighbors/ann_nn_descent.cuh
+++ b/cpp/tests/neighbors/ann_nn_descent.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
@@ -471,7 +471,8 @@ const std::vector<AnnNNDescentInputs> inputs =
                                                       cuvs::distance::DistanceType::L2Expanded,
                                                       cuvs::distance::DistanceType::L2SqrtExpanded,
                                                       cuvs::distance::DistanceType::InnerProduct,
-                                                      cuvs::distance::DistanceType::CosineExpanded},
+                                                      cuvs::distance::DistanceType::CosineExpanded,
+                                                      cuvs::distance::DistanceType::L1},
                                                      {false, true},
                                                      {0.90});