diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index 6dd19b6781..62e890fb09 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -897,6 +897,7 @@ struct index : cuvs::neighbors::index { * - L2 * - InnerProduct (currently only supported with IVF-PQ as the build algorithm) * - CosineExpanded + * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm) * * Usage example: * @code{.cpp} @@ -935,6 +936,7 @@ auto build(raft::resources const& res, * - L2 * - InnerProduct (currently only supported with IVF-PQ as the build algorithm) * - CosineExpanded + * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm) * * Usage example: * @code{.cpp} @@ -973,6 +975,7 @@ auto build(raft::resources const& res, * - L2 * - InnerProduct (currently only supported with IVF-PQ as the build algorithm) * - CosineExpanded (dataset norms are computed as float regardless of input data type) + * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm) * * Usage example: * @code{.cpp} @@ -1010,6 +1013,7 @@ auto build(raft::resources const& res, * The following distance metrics are supported: * - L2 * - CosineExpanded (dataset norms are computed as float regardless of input data type) + * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm) * * Usage example: * @code{.cpp} @@ -1047,6 +1051,9 @@ auto build(raft::resources const& res, * The following distance metrics are supported: * - L2 * - CosineExpanded (dataset norms are computed as float regardless of input data type) + * - L1 + * - BitwiseHamming (currently only supported with NN-Descent and Iterative Search as the build + * algorithm, and only for int8_t and uint8_t data types) * * Usage example: * @code{.cpp} @@ -1085,6 +1092,9 @@ auto build(raft::resources const& res, * - L2 * - InnerProduct (currently only supported with IVF-PQ as the build algorithm) * - CosineExpanded (dataset norms are computed as float regardless of input data type) + * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm) + * - BitwiseHamming (currently only supported with NN-Descent and Iterative Search as the build + * algorithm, and only for int8_t and uint8_t data types) * * Usage example: * @code{.cpp} @@ -1123,6 +1133,9 @@ auto build(raft::resources const& res, * - L2 * - InnerProduct (currently only supported with IVF-PQ as the build algorithm) * - CosineExpanded (dataset norms are computed as float regardless of input data type) + * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm) + * - BitwiseHamming (currently only supported with NN-Descent and Iterative Search as the build + * algorithm, and only for int8_t and uint8_t data types) * * Usage example: * @code{.cpp} @@ -1161,6 +1174,9 @@ auto build(raft::resources const& res, * - L2 * - InnerProduct (currently only supported with IVF-PQ as the build algorithm) * - CosineExpanded (dataset norms are computed as float regardless of input data type) + * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm) + * - BitwiseHamming (currently only supported with NN-Descent and Iterative Search as the build + * algorithm, and only for int8_t and uint8_t data types) * * Usage example: * @code{.cpp} diff --git a/cpp/include/cuvs/neighbors/nn_descent.hpp b/cpp/include/cuvs/neighbors/nn_descent.hpp index 44fbaed592..8e94edfaa5 100644 --- a/cpp/include/cuvs/neighbors/nn_descent.hpp +++ b/cpp/include/cuvs/neighbors/nn_descent.hpp @@ -218,6 +218,7 @@ struct index : cuvs::neighbors::index { * - L2SqrtExpanded * - CosineExpanded * - InnerProduct + * - L1 * * Usage example: * @code{.cpp} @@ -254,6 +255,7 @@ auto build(raft::resources const& res, * - L2SqrtExpanded * - CosineExpanded * - InnerProduct + * - L1 * * Usage example: * @code{.cpp} @@ -292,6 +294,7 @@ auto build(raft::resources const& res, * - L2SqrtExpanded * - CosineExpanded * - InnerProduct + * - L1 * * Usage example: * @code{.cpp} @@ -328,6 +331,7 @@ auto build(raft::resources const& res, * - L2SqrtExpanded * - CosineExpanded * - InnerProduct + * - L1 * * Usage example: * @code{.cpp} @@ -366,6 +370,7 @@ auto build(raft::resources const& res, * - L2SqrtExpanded * - CosineExpanded * - InnerProduct + * - L1 * - BitwiseHamming * * Usage example: @@ -403,6 +408,7 @@ auto build(raft::resources const& res, * - L2SqrtExpanded * - CosineExpanded * - InnerProduct + * - L1 * - BitwiseHamming * * Usage example: @@ -442,6 +448,7 @@ auto build(raft::resources const& res, * - L2SqrtExpanded * - CosineExpanded * - InnerProduct + * - L1 * - BitwiseHamming * * Usage example: @@ -479,6 +486,7 @@ auto build(raft::resources const& res, * - L2SqrtExpanded * - CosineExpanded * - InnerProduct + * - L1 * - BitwiseHamming * * Usage example: diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index a1eb829569..a0561c7921 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -122,6 +122,25 @@ constexpr __host__ __device__ __forceinline__ int skew_dim(int ndim) } } +template +struct dtype_traits; + +template <> +struct dtype_traits { + static constexpr int APAD = 4; + static constexpr int BPAD = 4; + static constexpr int TILE_COL_WIDTH = 32; + static __device__ __forceinline__ float to_float(float v) { return v; } +}; + +template <> +struct dtype_traits<__half> { + static constexpr int APAD = 8; + static constexpr int BPAD = 8; + static constexpr int TILE_COL_WIDTH = 64; + static __device__ __forceinline__ float to_float(__half v) { return __half2float(v); } +}; + template __device__ __forceinline__ ResultItem xor_swap(ResultItem x, int mask, int dir) { @@ -272,7 +291,8 @@ RAFT_KERNEL preprocess_data_kernel( for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { int idx = step * raft::warp_size() + threadIdx.x; if (idx < dim) { - if (metric == cuvs::distance::DistanceType::InnerProduct) { + if (metric == cuvs::distance::DistanceType::InnerProduct || + metric == cuvs::distance::DistanceType::L1) { output_data[list_id * dim + idx] = input_data[(size_t)blockIdx.x * dim + idx]; } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { output_data[list_id * dim + idx] = @@ -518,7 +538,8 @@ __device__ __forceinline__ void calculate_metric(float* s_distances, for (int d = 0; d < data_dim; d++) { s_distances[i] += __popc(static_cast(data_n1[d] ^ data_n2[d]) & 0xff); } - } else { // L2Expanded or L2SqrtExpanded + } else if (metric == cuvs::distance::DistanceType::L2Expanded || + metric == cuvs::distance::DistanceType::L2SqrtExpanded) { s_distances[i] = l2_norms[row_neighbors[row_id]] + l2_norms[col_neighbors[col_id]] - 2.0 * s_distances[i]; // for fp32 vs fp16 precision differences resulting in negative distances when distance @@ -535,13 +556,28 @@ __device__ __forceinline__ void calculate_metric(float* s_distances, } } +struct DistAccumulator { + cuvs::distance::DistanceType metric; + __device__ __forceinline__ float operator()(float a, float b) const + { + if (metric == cuvs::distance::DistanceType::L1) { return raft::abs(a - b); } + return a * b; + } +}; + // launch_bounds here denote BLOCK_SIZE = 512 and MIN_BLOCKS_PER_SM = 4 // Per // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications, // MAX_RESIDENT_THREAD_PER_SM = BLOCK_SIZE * BLOCKS_PER_SM = 2048 // For architectures 750 and 860 (890), the values for MAX_RESIDENT_THREAD_PER_SM // is 1024 and 1536 respectively, which means the bounds don't work anymore -template , typename DistEpilogue_t> +// SIMT kernel: scalar element-wise distance computation. +// Used for fp32 data (all metrics) and fp16 data with L1 distance (which cannot use tensor cores). +template , + typename DistEpilogue_t> + requires(std::is_same_v || std::is_same_v) RAFT_KERNEL #ifdef __CUDA_ARCH__ // Use minBlocksPerMultiprocessor = 4 on specific arches @@ -552,32 +588,31 @@ __launch_bounds__(BLOCK_SIZE, 4) __launch_bounds__(BLOCK_SIZE) #endif #endif - local_join_kernel(const Index_t* graph_new, - const Index_t* rev_graph_new, - const int2* sizes_new, - const Index_t* graph_old, - const Index_t* rev_graph_old, - const int2* sizes_old, - const int width, - const float* data, - const int data_dim, - ID_t* graph, - DistData_t* dists, - int graph_width, - int* locks, - DistData_t* l2_norms, - cuvs::distance::DistanceType metric, - DistEpilogue_t dist_epilogue) + local_join_kernel_simt(const Index_t* graph_new, + const Index_t* rev_graph_new, + const int2* sizes_new, + const Index_t* graph_old, + const Index_t* rev_graph_old, + const int2* sizes_old, + const int width, + const Data_t* data, + const int data_dim, + ID_t* graph, + DistData_t* dists, + int graph_width, + int* locks, + DistData_t* l2_norms, + cuvs::distance::DistanceType metric, + DistEpilogue_t dist_epilogue) { #if (__CUDA_ARCH__ >= 700) - using namespace nvcuda; __shared__ int s_list[MAX_NUM_BI_SAMPLES * 2]; - constexpr int APAD = 4; - constexpr int BPAD = 4; - constexpr int TILE_COL_WIDTH = 32; - __shared__ float s_nv[MAX_NUM_BI_SAMPLES][TILE_COL_WIDTH + APAD]; - __shared__ float s_ov[MAX_NUM_BI_SAMPLES][TILE_COL_WIDTH + BPAD]; + constexpr int APAD = dtype_traits::APAD; + constexpr int BPAD = dtype_traits::BPAD; + constexpr int TILE_COL_WIDTH = dtype_traits::TILE_COL_WIDTH; + __shared__ Data_t s_nv[MAX_NUM_BI_SAMPLES][TILE_COL_WIDTH + APAD]; + __shared__ Data_t s_ov[MAX_NUM_BI_SAMPLES][TILE_COL_WIDTH + BPAD]; __shared__ float s_distances[MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES]; // s_distances: MAX_NUM_BI_SAMPLES x SKEWED_MAX_NUM_BI_SAMPLES, reuse the space of s_ov @@ -635,48 +670,49 @@ __launch_bounds__(BLOCK_SIZE) int lane_id = threadIdx.x % raft::warp_size(); constexpr int num_warps = BLOCK_SIZE / raft::warp_size(); - if (metric != cuvs::distance::DistanceType::BitwiseHamming) { - int tid = threadIdx.x; - for (int i = tid; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) - s_distances[i] = 0.0f; + DistAccumulator dist_acc(metric); - __syncthreads(); + int tid = threadIdx.x; + for (int i = tid; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) + s_distances[i] = 0.0f; - for (int step = 0; step < raft::ceildiv(data_dim, TILE_COL_WIDTH); step++) { - int num_load_elems = (step == raft::ceildiv(data_dim, TILE_COL_WIDTH) - 1) - ? data_dim - step * TILE_COL_WIDTH - : TILE_COL_WIDTH; + __syncthreads(); + + for (int step = 0; step < raft::ceildiv(data_dim, TILE_COL_WIDTH); step++) { + int num_load_elems = (step == raft::ceildiv(data_dim, TILE_COL_WIDTH) - 1) + ? data_dim - step * TILE_COL_WIDTH + : TILE_COL_WIDTH; #pragma unroll - for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) { - int idx = i * num_warps + warp_id; - if (idx < list_new_size) { - size_t neighbor_id = new_neighbors[idx]; - size_t idx_in_data = neighbor_id * data_dim; - load_vec(s_nv[idx], - data + idx_in_data + step * TILE_COL_WIDTH, - num_load_elems, - TILE_COL_WIDTH, - lane_id); - } + for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) { + int idx = i * num_warps + warp_id; + if (idx < list_new_size) { + size_t neighbor_id = new_neighbors[idx]; + size_t idx_in_data = neighbor_id * data_dim; + load_vec(s_nv[idx], + data + idx_in_data + step * TILE_COL_WIDTH, + num_load_elems, + TILE_COL_WIDTH, + lane_id); } - __syncthreads(); + } + __syncthreads(); - // this is much faster than a warp-collaborative multiplication because MAX_NUM_BI_SAMPLES is - // fixed and small (64) - for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; - i += blockDim.x) { - int tmp_row = i / SKEWED_MAX_NUM_BI_SAMPLES; - int tmp_col = i % SKEWED_MAX_NUM_BI_SAMPLES; - if (tmp_row < list_new_size && tmp_col < list_new_size) { - float acc = 0.0f; - for (int d = 0; d < num_load_elems; d++) { - acc += s_nv[tmp_row][d] * s_nv[tmp_col][d]; - } - s_distances[i] += acc; + // this is much faster than a warp-collaborative multiplication because MAX_NUM_BI_SAMPLES is + // fixed and small (64) + for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { + int tmp_row = i / SKEWED_MAX_NUM_BI_SAMPLES; + int tmp_col = i % SKEWED_MAX_NUM_BI_SAMPLES; + if (tmp_row < list_new_size && tmp_col < list_new_size) { + float acc = 0.0f; + for (int d = 0; d < num_load_elems; d++) { + float a = dtype_traits::to_float(s_nv[tmp_row][d]); + float b = dtype_traits::to_float(s_nv[tmp_col][d]); + acc += dist_acc(a, b); } + s_distances[i] += acc; } - __syncthreads(); } + __syncthreads(); } __syncthreads(); @@ -706,63 +742,61 @@ __launch_bounds__(BLOCK_SIZE) __syncthreads(); - if (metric != cuvs::distance::DistanceType::BitwiseHamming) { - int tid = threadIdx.x; - for (int i = tid; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) - s_distances[i] = 0.0f; + for (int i = tid; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) + s_distances[i] = 0.0f; - __syncthreads(); + __syncthreads(); - for (int step = 0; step < raft::ceildiv(data_dim, TILE_COL_WIDTH); step++) { - int num_load_elems = (step == raft::ceildiv(data_dim, TILE_COL_WIDTH) - 1) - ? data_dim - step * TILE_COL_WIDTH - : TILE_COL_WIDTH; - if (TILE_COL_WIDTH < data_dim) { -#pragma unroll - for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) { - int idx = i * num_warps + warp_id; - if (idx < list_new_size) { - size_t neighbor_id = new_neighbors[idx]; - size_t idx_in_data = neighbor_id * data_dim; - load_vec(s_nv[idx], - data + idx_in_data + step * TILE_COL_WIDTH, - num_load_elems, - TILE_COL_WIDTH, - lane_id); - } - } - } + for (int step = 0; step < raft::ceildiv(data_dim, TILE_COL_WIDTH); step++) { + int num_load_elems = (step == raft::ceildiv(data_dim, TILE_COL_WIDTH) - 1) + ? data_dim - step * TILE_COL_WIDTH + : TILE_COL_WIDTH; + if (TILE_COL_WIDTH < data_dim) { #pragma unroll for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) { int idx = i * num_warps + warp_id; - if (idx < list_old_size) { - size_t neighbor_id = old_neighbors[idx]; + if (idx < list_new_size) { + size_t neighbor_id = new_neighbors[idx]; size_t idx_in_data = neighbor_id * data_dim; - load_vec(s_ov[idx], + load_vec(s_nv[idx], data + idx_in_data + step * TILE_COL_WIDTH, num_load_elems, TILE_COL_WIDTH, lane_id); } } - __syncthreads(); + } +#pragma unroll + for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) { + int idx = i * num_warps + warp_id; + if (idx < list_old_size) { + size_t neighbor_id = old_neighbors[idx]; + size_t idx_in_data = neighbor_id * data_dim; + load_vec(s_ov[idx], + data + idx_in_data + step * TILE_COL_WIDTH, + num_load_elems, + TILE_COL_WIDTH, + lane_id); + } + } + __syncthreads(); - // this is much faster than a warp-collaborative multiplication because MAX_NUM_BI_SAMPLES is - // fixed and small (64) - for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; - i += blockDim.x) { - int tmp_row = i / SKEWED_MAX_NUM_BI_SAMPLES; - int tmp_col = i % SKEWED_MAX_NUM_BI_SAMPLES; - if (tmp_row < list_new_size && tmp_col < list_old_size) { - float acc = 0.0f; - for (int d = 0; d < num_load_elems; d++) { - acc += s_nv[tmp_row][d] * s_ov[tmp_col][d]; - } - s_distances[i] += acc; + // this is much faster than a warp-collaborative multiplication because MAX_NUM_BI_SAMPLES is + // fixed and small (64) + for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { + int tmp_row = i / SKEWED_MAX_NUM_BI_SAMPLES; + int tmp_col = i % SKEWED_MAX_NUM_BI_SAMPLES; + if (tmp_row < list_new_size && tmp_col < list_old_size) { + float acc = 0.0f; + for (int d = 0; d < num_load_elems; d++) { + float a = dtype_traits::to_float(s_nv[tmp_row][d]); + float b = dtype_traits::to_float(s_ov[tmp_col][d]); + acc += dist_acc(a, b); } + s_distances[i] += acc; } - __syncthreads(); } + __syncthreads(); } __syncthreads(); @@ -820,22 +854,22 @@ __launch_bounds__(BLOCK_SIZE, 4) __launch_bounds__(BLOCK_SIZE) #endif #endif - local_join_kernel(const Index_t* graph_new, - const Index_t* rev_graph_new, - const int2* sizes_new, - const Index_t* graph_old, - const Index_t* rev_graph_old, - const int2* sizes_old, - const int width, - const __half* data, - const int data_dim, - ID_t* graph, - DistData_t* dists, - int graph_width, - int* locks, - DistData_t* l2_norms, - cuvs::distance::DistanceType metric, - DistEpilogue_t dist_epilogue) + local_join_kernel_wmma(const Index_t* graph_new, + const Index_t* rev_graph_new, + const int2* sizes_new, + const Index_t* graph_old, + const Index_t* rev_graph_old, + const int2* sizes_old, + const int width, + const __half* data, + const int data_dim, + ID_t* graph, + DistData_t* dists, + int graph_width, + int* locks, + DistData_t* l2_norms, + cuvs::distance::DistanceType metric, + DistEpilogue_t dist_epilogue) { #if (__CUDA_ARCH__ >= 700) using namespace nvcuda; @@ -1380,39 +1414,56 @@ void GNND::local_join(cudaStream_t stream, DistEpilogue_t dist_ { raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); if (d_data_float_.has_value()) { - local_join_kernel<<>>(graph_.h_graph_new.data_handle(), - h_rev_graph_new_.data_handle(), - d_list_sizes_new_.data_handle(), - h_graph_old_.data_handle(), - h_rev_graph_old_.data_handle(), - d_list_sizes_old_.data_handle(), - NUM_SAMPLES, - d_data_float_.value().data_handle(), - ndim_, - graph_buffer_.data_handle(), - dists_buffer_.data_handle(), - DEGREE_ON_DEVICE, - d_locks_.data_handle(), - l2_norms_.data_handle(), - build_config_.metric, - dist_epilogue); + local_join_kernel_simt<<>>(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), + d_list_sizes_new_.data_handle(), + h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), + d_list_sizes_old_.data_handle(), + NUM_SAMPLES, + d_data_float_->data_handle(), + ndim_, + graph_buffer_.data_handle(), + dists_buffer_.data_handle(), + DEGREE_ON_DEVICE, + d_locks_.data_handle(), + l2_norms_.data_handle(), + build_config_.metric, + dist_epilogue); + } else if (build_config_.metric == cuvs::distance::DistanceType::L1) { + local_join_kernel_simt<<>>(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), + d_list_sizes_new_.data_handle(), + h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), + d_list_sizes_old_.data_handle(), + NUM_SAMPLES, + d_data_half_.value().data_handle(), + ndim_, + graph_buffer_.data_handle(), + dists_buffer_.data_handle(), + DEGREE_ON_DEVICE, + d_locks_.data_handle(), + l2_norms_.data_handle(), + build_config_.metric, + dist_epilogue); } else { - local_join_kernel<<>>(graph_.h_graph_new.data_handle(), - h_rev_graph_new_.data_handle(), - d_list_sizes_new_.data_handle(), - h_graph_old_.data_handle(), - h_rev_graph_old_.data_handle(), - d_list_sizes_old_.data_handle(), - NUM_SAMPLES, - d_data_half_.value().data_handle(), - ndim_, - graph_buffer_.data_handle(), - dists_buffer_.data_handle(), - DEGREE_ON_DEVICE, - d_locks_.data_handle(), - l2_norms_.data_handle(), - build_config_.metric, - dist_epilogue); + local_join_kernel_wmma<<>>(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), + d_list_sizes_new_.data_handle(), + h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), + d_list_sizes_old_.data_handle(), + NUM_SAMPLES, + d_data_half_.value().data_handle(), + ndim_, + graph_buffer_.data_handle(), + dists_buffer_.data_handle(), + DEGREE_ON_DEVICE, + d_locks_.data_handle(), + l2_norms_.data_handle(), + build_config_.metric, + dist_epilogue); } } diff --git a/cpp/src/neighbors/detail/nn_descent_gnnd.hpp b/cpp/src/neighbors/detail/nn_descent_gnnd.hpp index b0799505f4..a2639e4f43 100644 --- a/cpp/src/neighbors/detail/nn_descent_gnnd.hpp +++ b/cpp/src/neighbors/detail/nn_descent_gnnd.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -216,7 +216,7 @@ class GNND { int2* list_sizes, cudaStream_t stream = 0); - template + template void local_join(cudaStream_t stream = 0, DistEpilogue_t dist_epilogue = DistEpilogue_t{}); raft::resources const& res; @@ -264,10 +264,11 @@ inline BuildConfig get_build_config(raft::resources const& res, params.metric == cuvs::distance::DistanceType::L2SqrtExpanded || params.metric == cuvs::distance::DistanceType::CosineExpanded || params.metric == cuvs::distance::DistanceType::InnerProduct || - params.metric == cuvs::distance::DistanceType::BitwiseHamming; + params.metric == cuvs::distance::DistanceType::BitwiseHamming || + params.metric == cuvs::distance::DistanceType::L1; RAFT_EXPECTS(allowed_metrics, "The metric for NN Descent should be L2Expanded, L2SqrtExpanded, CosineExpanded, " - "InnerProduct or BitwiseHamming"); + "InnerProduct, BitwiseHamming or L1"); RAFT_EXPECTS( metric == params.metric, "The metrics set in nn_descent::index_params and nn_descent::index are inconsistent"); diff --git a/cpp/tests/neighbors/ann_nn_descent.cuh b/cpp/tests/neighbors/ann_nn_descent.cuh index 7db6a523e9..568cebbc23 100644 --- a/cpp/tests/neighbors/ann_nn_descent.cuh +++ b/cpp/tests/neighbors/ann_nn_descent.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -471,7 +471,8 @@ const std::vector inputs = cuvs::distance::DistanceType::L2Expanded, cuvs::distance::DistanceType::L2SqrtExpanded, cuvs::distance::DistanceType::InnerProduct, - cuvs::distance::DistanceType::CosineExpanded}, + cuvs::distance::DistanceType::CosineExpanded, + cuvs::distance::DistanceType::L1}, {false, true}, {0.90});