From cce66e66360d2a387b017c4566ce8666d5158580 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Sun, 3 Aug 2025 16:50:24 +0900 Subject: [PATCH 001/119] Add pq_len=8 instances --- cpp/CMakeLists.txt | 12 ++ .../detail/cagra/compute_distance-ext.cuh | 134 +++++++++++++++++- .../detail/cagra/compute_distance.cu | 14 +- .../cagra/compute_distance_00_generate.py | 6 +- ...d_float_uint32_dim128_t8_8pq_8subd_half.cu | 41 ++++++ ..._float_uint32_dim256_t16_8pq_8subd_half.cu | 41 ++++++ ..._float_uint32_dim512_t32_8pq_8subd_half.cu | 41 ++++++ ...ed_half_uint32_dim128_t8_8pq_8subd_half.cu | 41 ++++++ ...d_half_uint32_dim256_t16_8pq_8subd_half.cu | 41 ++++++ ...d_half_uint32_dim512_t32_8pq_8subd_half.cu | 41 ++++++ ...ed_int8_uint32_dim128_t8_8pq_8subd_half.cu | 41 ++++++ ...d_int8_uint32_dim256_t16_8pq_8subd_half.cu | 41 ++++++ ...d_int8_uint32_dim512_t32_8pq_8subd_half.cu | 41 ++++++ ...d_uint8_uint32_dim128_t8_8pq_8subd_half.cu | 41 ++++++ ..._uint8_uint32_dim256_t16_8pq_8subd_half.cu | 41 ++++++ ..._uint8_uint32_dim512_t32_8pq_8subd_half.cu | 41 ++++++ 16 files changed, 653 insertions(+), 5 deletions(-) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu create mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 50ee1a0ce2..ed78fcaca5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -237,28 +237,40 @@ if(BUILD_SHARED_LIBS) src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu + 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh index faca960808..3957abc6b5 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -62,6 +62,15 @@ extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, 
standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec>; @@ -469,50 +589,62 @@ using descriptor_instances = instance_selector< standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, 
standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec>; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu index 4158863465..cbb6f4540b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,50 +34,62 @@ template struct instance_selector< standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, 
standard_descriptor_spec, standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec>; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py index 1c813f1a50..9834ca696f 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import glob template = """/* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ #mxdim_team = [(32, 8), (64, 16), (128, 32)] pq_bits = [8] -pq_lens = [2, 4] +pq_lens = [2, 4, 8] # rblock = [(256, 4), (512, 2), (1024, 1)] # rcandidates = [32] diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu new file mode 100644 index 0000000000..52fb9140fe --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu new file mode 100644 index 0000000000..41cea6b5b1 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu new file mode 100644 index 0000000000..b0f650f45a --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu new file mode 100644 index 0000000000..82d5d738ab --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu new file mode 100644 index 0000000000..a8b1472cff --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu new file mode 100644 index 0000000000..dc56563595 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu new file mode 100644 index 0000000000..4cb61e18ad --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu new file mode 100644 index 0000000000..5463f42ab5 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu new file mode 100644 index 0000000000..64d436115b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu new file mode 100644 index 0000000000..eb9bc63041 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu new file mode 100644 index 0000000000..3825658388 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu new file mode 100644 index 0000000000..f4fc937e25 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail From 43e6145e21ce04013f18c01dafb698ba4414475c Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Fri, 15 Aug 2025 00:46:01 +0900 Subject: [PATCH 002/119] Update CAGRA-Q test --- cpp/tests/neighbors/ann_cagra.cuh | 48 ++++++++++++++++++- cpp/tests/neighbors/ann_utils.cuh | 35 +++++++++++--- cpp/tests/neighbors/vpq_utils.cuh | 77 +++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 9 deletions(-) create mode 100644 cpp/tests/neighbors/vpq_utils.cuh diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh index e10e1300ca..156b55ad24 100644 --- a/cpp/tests/neighbors/ann_cagra.cuh +++ b/cpp/tests/neighbors/ann_cagra.cuh @@ -17,12 +17,14 @@ #include "../test_utils.cuh" #include "ann_utils.cuh" +#include "vpq_utils.cuh" #include #include "naive_knn.cuh" #include #include +#include #include #include #include @@ -446,6 +448,46 @@ class AnnCagraTest : public ::testing::TestWithParam { raft::update_host(indices_Cagra.data(), indices_dev.data(), queries_size, stream_); raft::resource::sync_stream(handle_); + + reference_recall = 1; + if (ps.compression.has_value()) { + auto decoded_dataset = + raft::make_device_matrix(handle_, ps.n_rows, ps.dim); + cuvs::neighbors::decode_vpq_dataset( + decoded_dataset.view(), + dynamic_cast&>(index.data()), + raft::resource::get_cuda_stream(handle_)); + auto indices_out_view = raft::make_device_matrix_view( + indices_dev.data(), ps.n_queries, ps.k); + auto dists_out_view = raft::make_device_matrix_view( + distances_dev.data(), ps.n_queries, ps.k); + + cuvs::neighbors::naive_knn(handle_, + 
dists_out_view.data_handle(), + indices_out_view.data_handle(), + search_queries.data(), + decoded_dataset.data_handle(), + ps.n_queries, + ps.n_rows, + ps.dim, + ps.k, + ps.metric); + std::vector indices_vpq_dataset(queries_size); + std::vector distances_vpq_dataset(queries_size); + raft::update_host( + distances_vpq_dataset.data(), dists_out_view.data_handle(), queries_size, stream_); + raft::update_host( + indices_vpq_dataset.data(), indices_out_view.data_handle(), queries_size, stream_); + + reference_recall = std::get<1>(calc_recall(indices_naive, + indices_vpq_dataset, + distances_naive, + distances_vpq_dataset, + ps.n_queries, + ps.k, + 0)); + printf("reference_recall = %e\n", reference_recall); + } } // for (int i = 0; i < min(ps.n_queries, 10); i++) { @@ -455,7 +497,7 @@ class AnnCagraTest : public ::testing::TestWithParam { // print_vector("T", distances_naive.data() + i * ps.k, ps.k, std::cout); // print_vector("C", distances_Cagra.data() + i * ps.k, ps.k, std::cout); // } - double min_recall = ps.min_recall; + double min_recall = ps.min_recall * reference_recall; EXPECT_TRUE(eval_neighbours(indices_naive, indices_Cagra, distances_naive, @@ -504,6 +546,7 @@ class AnnCagraTest : public ::testing::TestWithParam { AnnCagraInputs ps; rmm::device_uvector database; rmm::device_uvector search_queries; + double reference_recall; }; template @@ -1325,7 +1368,8 @@ inline std::vector generate_inputs() {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL, cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL}); // don't demand high recall // without refinement - for (uint32_t pq_len : {2}) { // for now, only pq_len = 2 is supported, more options coming soon + for (uint32_t pq_len : + {2, 4, 8}) { // for now, only pq_len = 2 is supported, more options coming soon for (uint32_t vq_n_centers : {100, 1000}) { for (auto input : inputs2) { vpq_params ps{}; diff --git a/cpp/tests/neighbors/ann_utils.cuh b/cpp/tests/neighbors/ann_utils.cuh index 0c01c48c9c..dbf5c1f6b3 
100644 --- a/cpp/tests/neighbors/ann_utils.cuh +++ b/cpp/tests/neighbors/ann_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -207,7 +207,7 @@ auto eval_recall(const std::vector& expected_idx, double min_recall, bool test_unique = true) -> testing::AssertionResult { - auto [actual_recall, match_count, total_count] = + auto [actual_recall, index_based_actual_recall, match_count, total_count] = calc_recall(expected_idx, actual_idx, rows, cols); double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps); RAFT_LOG_INFO("Recall = %f (%zu/%zu), the error is %2.1f%% %s the threshold (eps = %f).", @@ -239,8 +239,9 @@ auto calc_recall(const std::vector& expected_idx, size_t cols, double eps) { - size_t match_count = 0; - size_t total_count = static_cast(rows) * static_cast(cols); + size_t match_count = 0; + size_t index_match_count = 0; + size_t total_count = static_cast(rows) * static_cast(cols); for (size_t i = 0; i < rows; ++i) { for (size_t k = 0; k < cols; ++k) { size_t idx_k = i * cols + k; // row major assumption! @@ -259,8 +260,28 @@ auto calc_recall(const std::vector& expected_idx, } } } - return std::make_tuple( - static_cast(match_count) / static_cast(total_count), match_count, total_count); + + // Index based recall + for (size_t i = 0; i < rows; ++i) { + for (size_t k = 0; k < cols; ++k) { + size_t idx_k = i * cols + k; // row major assumption! + auto act_idx = actual_idx[idx_k]; + for (size_t j = 0; j < cols; ++j) { + size_t idx = i * cols + j; // row major assumption! 
+ auto exp_idx = expected_idx[idx]; + + if (act_idx == exp_idx) { + index_match_count++; + break; + } + } + } + } + + return std::make_tuple(static_cast(match_count) / static_cast(total_count), + static_cast(index_match_count) / static_cast(total_count), + match_count, + total_count); } /** same as eval_recall, but in case indices do not match, @@ -277,7 +298,7 @@ auto eval_neighbours(const std::vector& expected_idx, bool test_unique = true, size_t max_duplicates = 0) -> testing::AssertionResult { - auto [actual_recall, match_count, total_count] = + auto [actual_recall, index_based_actual_recall, match_count, total_count] = calc_recall(expected_idx, actual_idx, expected_dist, actual_dist, rows, cols, eps); double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps); diff --git a/cpp/tests/neighbors/vpq_utils.cuh b/cpp/tests/neighbors/vpq_utils.cuh new file mode 100644 index 0000000000..383e5ef063 --- /dev/null +++ b/cpp/tests/neighbors/vpq_utils.cuh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include + +namespace cuvs::neighbors { +template +__global__ void decode_vpq_dataset_kernel(data_t* const decoded_dataset_ptr, + const uint32_t ldd, + const math_t* const vq_codebook_ptr, + const uint32_t ldv, + const math_t* const pq_codebook_ptr, + const uint32_t pq_subspace_dim, + const uint32_t pq_table_size, + const uint32_t dataset_dim, + const size_t dataset_size, + const uint8_t* const data_ptr, + const uint32_t ldi) +{ + constexpr uint32_t warp_size = 32; + const size_t batch_id = (blockIdx.x * blockDim.x + threadIdx.x) / warp_size; + if (batch_id >= dataset_size) { return; } + + const auto local_data_ptr = data_ptr + ldi * batch_id; + const auto vq_code = *reinterpret_cast(local_data_ptr); + const auto pq_code_ptr = local_data_ptr + sizeof(uint32_t); + const auto vq_vec_ptr = vq_codebook_ptr + vq_code * ldv; + auto local_dst_ptr = decoded_dataset_ptr + batch_id * ldd; + + const auto lane_id = threadIdx.x % warp_size; + for (uint32_t i = lane_id; i < dataset_dim; i += warp_size) { + const auto pq_code = pq_code_ptr[i / pq_subspace_dim]; + const auto pq_v = pq_codebook_ptr[pq_code * pq_subspace_dim + (i % pq_subspace_dim)]; + + local_dst_ptr[i] = static_cast(vq_vec_ptr[i]) + static_cast(pq_v); + } +} + +template +void decode_vpq_dataset(raft::device_matrix_view decoded_dataset, + const cuvs::neighbors::vpq_dataset& vpq_dataset, + cudaStream_t cuda_stream) +{ + const auto dataset_size = decoded_dataset.extent(0); + RAFT_EXPECTS(vpq_dataset.data.extent(0) == dataset_size, "Dataset sizes mismatch"); + + constexpr uint32_t block_size = 256; + constexpr uint32_t warp_size = 32; + constexpr int64_t vecs_per_cta = block_size / warp_size; + const auto grid_size = raft::div_rounding_up_safe(decoded_dataset.extent(0), vecs_per_cta); + + decode_vpq_dataset_kernel + <<>>(decoded_dataset.data_handle(), + decoded_dataset.stride(0), + vpq_dataset.vq_code_book.data_handle(), + vpq_dataset.vq_code_book.stride(0), + vpq_dataset.pq_code_book.data_handle(), + 
vpq_dataset.pq_len(), + 1u << vpq_dataset.pq_bits(), + vpq_dataset.dim(), + dataset_size, + vpq_dataset.data.data_handle(), + vpq_dataset.data.stride(0)); +} +} // namespace cuvs::neighbors From 16321bcd48bef9b967abf7dc3859ee53d2db8078 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 3 Sep 2025 12:36:54 +0900 Subject: [PATCH 003/119] Update CAGRA-Q distance kernel --- .../detail/cagra/compute_distance_vpq-impl.cuh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 6caa173f2c..3c11a8d30f 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -234,10 +234,11 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim; constexpr auto PQ_BITS = DescriptorT::kPqBits; constexpr auto PQ_LEN = DescriptorT::kPqLen; + using PQ_CODEBOOK_LOAD_T = uint32_t; const uint32_t query_ptr = pq_codebook_ptr + DescriptorT::kSMemCodeBookSizeInBytes; static_assert(PQ_BITS == 8, "Only pq_bits == 8 is supported at the moment."); - constexpr uint32_t vlen = 4; // **** DO NOT CHANGE **** + constexpr uint32_t vlen = utils::size_of() / utils::size_of(); constexpr uint32_t nelem = raft::div_rounding_up_unsafe(DatasetBlockDim / PQ_LEN, TeamSize * vlen); @@ -250,13 +251,12 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( for (uint32_t elem_offset = 0; elem_offset * PQ_LEN < dim; elem_offset += DatasetBlockDim / PQ_LEN) { // Loading PQ codes - uint32_t pq_codes[nelem]; + PQ_CODEBOOK_LOAD_T pq_codes[nelem]; #pragma unroll for (std::uint32_t e = 0; e < nelem; e++) { const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen; if (k >= n_subspace) break; - // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory) - device::ldg_cg(pq_codes[e], reinterpret_cast(dataset_ptr + 4 + k)); + device::ldg_cg(pq_codes[e], reinterpret_cast(dataset_ptr + 4 + k)); } // if constexpr (PQ_LEN % 2 == 0) { @@ -274,7 +274,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( device::ldg_ca(vq_vals[m], vq_code_book_ptr + d); } // Compute distance - std::uint32_t pq_code = pq_codes[e]; + PQ_CODEBOOK_LOAD_T pq_code = pq_codes[e]; #pragma unroll for (std::uint32_t v = 0; v < vlen; v++) { if (PQ_LEN * (v + k) >= dim) break; From bfdc2d4fc802ec83c4d70ca4c60ad7cf1cff54e7 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 3 Sep 2025 15:35:48 +0900 Subject: [PATCH 004/119] Add DatasetBlockDim check --- cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh | 1 + 1 file changed, 1 insertion(+) diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 3c11a8d30f..cbd8c71b08 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -241,6 +241,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( constexpr uint32_t vlen = utils::size_of() / utils::size_of(); constexpr uint32_t nelem = raft::div_rounding_up_unsafe(DatasetBlockDim / PQ_LEN, TeamSize * vlen); + static_assert(DatasetBlockDim / PQ_LEN >= TeamSize * vlen, "DatasetBlockDim is too small"); constexpr auto kTeamMask = DescriptorT::kTeamSize - 1; constexpr auto kTeamVLen = TeamSize * vlen; From 23c02e16689c53d56580b4ffc90d4ae01f41f3ea Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 3 Sep 2025 16:33:32 +0900 Subject: [PATCH 005/119] Update VPQ compute distance kernel --- cpp/CMakeLists.txt | 40 +- .../detail/cagra/compute_distance-ext.cuh | 649 ++++++++++++------ .../detail/cagra/compute_distance.cu | 104 ++- .../cagra/compute_distance_00_generate.py | 30 +- .../cagra/compute_distance_vpq-impl.cuh | 8 +- ...float_uint32_dim1024_t32_8pq_8subd_half.cu | 41 ++ ..._float_uint32_dim128_t4_8pq_8subd_half.cu} | 2 +- ..._float_uint32_dim256_t8_8pq_8subd_half.cu} | 2 +- ...float_uint32_dim512_t16_8pq_8subd_half.cu} | 2 +- ...ed_float_uint32_dim64_t4_8pq_2subd_half.cu | 41 ++ ...ed_float_uint32_dim64_t4_8pq_4subd_half.cu | 41 ++ ..._half_uint32_dim1024_t32_8pq_8subd_half.cu | 41 ++ ...d_half_uint32_dim128_t4_8pq_8subd_half.cu} | 2 +- ...d_half_uint32_dim256_t8_8pq_8subd_half.cu} | 2 +- ..._half_uint32_dim512_t16_8pq_8subd_half.cu} | 2 +- ...ded_half_uint32_dim64_t4_8pq_2subd_half.cu | 41 ++ ...ded_half_uint32_dim64_t4_8pq_4subd_half.cu | 41 ++ ..._int8_uint32_dim1024_t32_8pq_8subd_half.cu | 41 ++ ...d_int8_uint32_dim128_t4_8pq_8subd_half.cu} | 2 +- ...d_int8_uint32_dim256_t8_8pq_8subd_half.cu} | 2 +- 
..._int8_uint32_dim512_t16_8pq_8subd_half.cu} | 2 +- ...ded_int8_uint32_dim64_t4_8pq_2subd_half.cu | 41 ++ ...ded_int8_uint32_dim64_t4_8pq_4subd_half.cu | 41 ++ ...uint8_uint32_dim1024_t32_8pq_8subd_half.cu | 41 ++ ..._uint8_uint32_dim128_t4_8pq_8subd_half.cu} | 2 +- ..._uint8_uint32_dim256_t8_8pq_8subd_half.cu} | 2 +- ...uint8_uint32_dim512_t16_8pq_8subd_half.cu} | 2 +- ...ed_uint8_uint32_dim64_t4_8pq_2subd_half.cu | 41 ++ ...ed_uint8_uint32_dim64_t4_8pq_4subd_half.cu | 41 ++ ..._L2Expanded_dim128_t8_uint32_t_uint64_t.cu | 41 ++ ...L2Expanded_dim256_t16_uint32_t_uint64_t.cu | 41 ++ ...L2Expanded_dim512_t32_uint32_t_uint64_t.cu | 41 ++ ...q_L2Expanded_dim64_t4_uint32_t_uint64_t.cu | 41 ++ 33 files changed, 1251 insertions(+), 260 deletions(-) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu} (97%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu} (97%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu} (97%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu => 
compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu} (97%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu} (97%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu} (97%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu} (97%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu} (97%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu} (97%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu} (97%) rename 
cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu} (97%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu} (97%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 23aa794004..3d154917bd 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -235,42 +235,58 @@ if(BUILD_SHARED_LIBS) src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu - 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_8subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_8subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_8subd_half.cu + 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_8subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_8subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu - 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_8subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_8subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_8subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu + 
src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu + src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu + src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh index 3957abc6b5..45078a91d6 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh @@ -27,6 +27,7 @@ #include "compute_distance_standard.hpp" #include "compute_distance_vpq.hpp" +#include "compute_distance_vrabitq.hpp" namespace cuvs::neighbors::cagra::detail { @@ -44,6 +45,39 @@ extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern 
template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct standard_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; 
+extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; -extern template struct standard_descriptor_spec; -extern template struct standard_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; +extern template struct vrabitq_descriptor_spec; +extern template struct vrabitq_descriptor_spec; +extern template struct vrabitq_descriptor_spec; +extern template struct vrabitq_descriptor_spec; extern template struct standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, 
standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vrabitq_descriptor_spec, + vrabitq_descriptor_spec, + vrabitq_descriptor_spec, + vrabitq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec>; @@ -587,64 +780,112 @@ extern template struct instance_selector< using descriptor_instances = instance_selector< standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + 
vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vrabitq_descriptor_spec, + vrabitq_descriptor_spec, + vrabitq_descriptor_spec, + vrabitq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec>; 
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu index cbb6f4540b..d3fc849147 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu @@ -32,64 +32,112 @@ using namespace cuvs::distance; template struct instance_selector< standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, 
vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vrabitq_descriptor_spec, + vrabitq_descriptor_spec, + vrabitq_descriptor_spec, + vrabitq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec>; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py index 9834ca696f..0b877f889a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py @@ -50,6 +50,9 @@ """ mxdim_team = [(128, 8), (256, 16), (512, 32)] +vpq_2_4_mxdim_team = [(64, 4), (128, 8), (256, 16), (512, 32)] +vpq_8_mxdim_team = [(128, 4), (256, 8), (512, 16), (1024, 32)] +vrq_mxdim_team = [(64, 4), (128, 8), (256, 16), (512, 32)] #mxdim_team = [(64, 8), (128, 16), (256, 32)] #mxdim_team = [(32, 8), (64, 16), (128, 32)] @@ -98,9 +101,11 @@ f.write(template.format(includes=includes, content=content)) cmake_list.append(f" src/neighbors/detail/cagra/{path}") - # CAGRA-Q - for code_book_t in code_book_types: - for pq_len in pq_lens: + for pq_len in pq_lens: + vpq_mxdim_team = vpq_8_mxdim_team if pq_len == 8 else vpq_2_4_mxdim_team + for (mxdim, team) in vpq_mxdim_team: + # CAGRA-Q + for code_book_t in code_book_types: for pq_bit in pq_bits: 
for metric in ['L2Expanded']: path = f"compute_distance_vpq_{metric}_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}.cu" @@ -113,6 +118,24 @@ f.write(template.format(includes=includes, content=content)) cmake_list.append(f" src/neighbors/detail/cagra/{path}") +# CAGRA-RaBitQ +for (mxdim, team) in vrq_mxdim_team: + for vq_code_t in ['uint32_t']: + for rabitq_code_t in ['uint64_t']: + for metric in ['L2Expanded']: + data_t = 'float' + idx_t = 'uint32_t' + distance_t = 'float' + path = f"compute_distance_vrabitq_{metric}_dim{mxdim}_t{team}_{vq_code_t}_{rabitq_code_t}.cu" + includes = '#include "compute_distance_vrabitq-impl.cuh"' + params = f"{metric_prefix}{metric}, {team}, {mxdim}, {data_t}, {idx_t}, {vq_code_t}, {rabitq_code_t}, float, {distance_t}" + spec = f"vrabitq_descriptor_spec<{params}>" + content = f"""template struct {spec};""" + specs.append(spec) + with open(path, "w") as f: + f.write(template.format(includes=includes, content=content)) + cmake_list.append(f" src/neighbors/detail/cagra/{path}") + # CAGRA (Binary Hamming distance) for (mxdim, team) in mxdim_team: metric = 'BitwiseHamming' @@ -137,6 +160,7 @@ #include "compute_distance_standard.hpp" #include "compute_distance_vpq.hpp" +#include "compute_distance_vrabitq.hpp" ''' newline = "\n" contents = f''' diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index cbd8c71b08..7545c25a2b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -257,7 +257,13 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( for (std::uint32_t e = 0; e < nelem; e++) { const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen; if (k >= n_subspace) break; - device::ldg_cg(pq_codes[e], reinterpret_cast(dataset_ptr + 4 + k)); + + if constexpr (std::is_same_v) { + device::ldg_cg(pq_codes[e], + 
reinterpret_cast(dataset_ptr + 4 + k)); + } else { + pq_codes[e] = *reinterpret_cast(dataset_ptr + 4 + k); + } } // if constexpr (PQ_LEN % 2 == 0) { diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu new file mode 100644 index 0000000000..5c458a281a --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu similarity index 97% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu index 52fb9140fe..d5579a2be0 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_8subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu @@ -29,7 +29,7 @@ namespace cuvs::neighbors::cagra::detail { using namespace cuvs::distance; template struct vpq_descriptor_spec python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu new file mode 100644 index 0000000000..6a01a5c0d5 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu new file mode 100644 index 0000000000..a9766124f8 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu similarity index 97% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu index 82d5d738ab..c5d6c72a6f 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_8subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu @@ -29,7 +29,7 @@ namespace cuvs::neighbors::cagra::detail { using namespace cuvs::distance; template struct vpq_descriptor_spec python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu new file mode 100644 index 0000000000..36592482e1 --- /dev/null +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu new file mode 100644 index 0000000000..e2d68ae772 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu similarity index 97% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu index 4cb61e18ad..65cdfb0998 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_8subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu @@ -29,7 +29,7 @@ namespace cuvs::neighbors::cagra::detail { using namespace cuvs::distance; template struct vpq_descriptor_spec python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu new file mode 100644 index 0000000000..42a56de4a5 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu new file mode 100644 index 0000000000..6217a0047c --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu similarity index 97% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu index eb9bc63041..6d06771052 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_8subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu @@ -29,7 +29,7 @@ namespace cuvs::neighbors::cagra::detail { using namespace cuvs::distance; template struct vpq_descriptor_spec python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace 
cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu new file mode 100644 index 0000000000..6f59b47bbe --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu new file mode 100644 index 0000000000..10ddfc0163 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vrabitq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vrabitq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu new file mode 100644 index 0000000000..e057457a6a --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vrabitq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vrabitq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu new file mode 100644 index 0000000000..c30bd76785 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vrabitq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vrabitq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu new file mode 100644 index 0000000000..472dd9821f --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vrabitq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vrabitq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail From e0f629cb82e157d8c0fbdfd13cf57bac6c4528ef Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Tue, 21 Oct 2025 17:11:20 +0900 Subject: [PATCH 006/119] Add fp_8bit4 --- cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh index 1b098ac5c1..2806e88646 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -111,4 +111,10 @@ struct fp_8bit { } }; +template +struct fp_8bit4 { + fp_8bit x, y, z, w; + HDI fp_8bit4() : x(0), y(0), z(0), w(0) {} +}; + } // namespace cuvs::neighbors::ivf_pq::detail From 0da1aa2bcf2357fb0e6f4470552bdeaaa22a69c3 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Tue, 21 Oct 2025 17:40:03 +0900 Subject: [PATCH 007/119] Fix compilation error --- cpp/CMakeLists.txt | 28 ++--- .../detail/cagra/compute_distance-ext.cuh | 109 ------------------ .../detail/cagra/compute_distance.cu | 36 ------ .../cagra/compute_distance_00_generate.py | 19 --- ...d_CosineExpanded_float_uint32_dim128_t8.cu | 2 +- ..._CosineExpanded_float_uint32_dim256_t16.cu | 2 +- ..._CosineExpanded_float_uint32_dim512_t32.cu | 2 +- ...rd_CosineExpanded_half_uint32_dim128_t8.cu | 2 +- ...d_CosineExpanded_half_uint32_dim256_t16.cu | 2 +- ...d_CosineExpanded_half_uint32_dim512_t32.cu | 2 +- ...rd_CosineExpanded_int8_uint32_dim128_t8.cu | 2 +- ...d_CosineExpanded_int8_uint32_dim256_t16.cu | 2 +- ...d_CosineExpanded_int8_uint32_dim512_t32.cu | 2 +- ...d_CosineExpanded_uint8_uint32_dim128_t8.cu | 2 +- ..._CosineExpanded_uint8_uint32_dim256_t16.cu | 2 +- ..._CosineExpanded_uint8_uint32_dim512_t32.cu | 2 +- 16 files changed, 24 insertions(+), 192 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f4831ef1c7..6b4397307c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -224,12 +224,21 @@ if(NOT BUILD_CPU_ONLY) src/neighbors/detail/cagra/compute_distance_standard_BitwiseHamming_u8_uint32_dim128_t8.cu src/neighbors/detail/cagra/compute_distance_standard_BitwiseHamming_u8_uint32_dim256_t16.cu src/neighbors/detail/cagra/compute_distance_standard_BitwiseHamming_u8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu 
src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim128_t8.cu src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim256_t16.cu src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu @@ -245,15 +254,6 @@ if(NOT BUILD_CPU_ONLY) src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu 
src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim512_t32.cu src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu @@ -308,10 +308,6 @@ if(NOT BUILD_CPU_ONLY) src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu - src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu - src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu - src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu 
src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh index 093bb92730..ce97558f67 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh @@ -27,7 +27,6 @@ #include "compute_distance_standard.hpp" #include "compute_distance_vpq.hpp" -#include "compute_distance_vrabitq.hpp" namespace cuvs::neighbors::cagra::detail { @@ -681,42 +680,6 @@ extern template struct vpq_descriptor_spec; -extern template struct vrabitq_descriptor_spec; -extern template struct vrabitq_descriptor_spec; -extern template struct vrabitq_descriptor_spec; -extern template struct vrabitq_descriptor_spec; extern template struct standard_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, - vrabitq_descriptor_spec, - vrabitq_descriptor_spec, - vrabitq_descriptor_spec, - vrabitq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec>; @@ -946,42 +873,6 @@ using descriptor_instances = instance_selector< vpq_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, - vrabitq_descriptor_spec, - vrabitq_descriptor_spec, - vrabitq_descriptor_spec, - vrabitq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec>; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu index ab39670bc8..13fdb7b832 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu @@ -114,42 +114,6 @@ template struct instance_selector< vpq_descriptor_spec, vpq_descriptor_spec, vpq_descriptor_spec, - vrabitq_descriptor_spec, - vrabitq_descriptor_spec, - vrabitq_descriptor_spec, - vrabitq_descriptor_spec, 
standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec>; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py index 440c04cffa..095b4bb36b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py @@ -119,24 +119,6 @@ f.write(template.format(includes=includes, content=content)) cmake_list.append(f" src/neighbors/detail/cagra/{path}") -# CAGRA-RaBitQ -for (mxdim, team) in vrq_mxdim_team: - for vq_code_t in ['uint32_t']: - for rabitq_code_t in ['uint64_t']: - for metric in ['L2Expanded']: - data_t = 'float' - idx_t = 'uint32_t' - distance_t = 'float' - path = f"compute_distance_vrabitq_{metric}_dim{mxdim}_t{team}_{vq_code_t}_{rabitq_code_t}.cu" - includes = '#include "compute_distance_vrabitq-impl.cuh"' - params = f"{metric_prefix}{metric}, {team}, {mxdim}, {data_t}, {idx_t}, {vq_code_t}, {rabitq_code_t}, float, {distance_t}" - spec = f"vrabitq_descriptor_spec<{params}>" - content = f"""template struct {spec};""" - specs.append(spec) - with open(path, "w") as f: - f.write(template.format(includes=includes, content=content)) - cmake_list.append(f" src/neighbors/detail/cagra/{path}") - # CAGRA (Binary Hamming distance) for (mxdim, team) in mxdim_team: metric = 'BitwiseHamming' @@ -161,7 +143,6 @@ #include "compute_distance_standard.hpp" #include "compute_distance_vpq.hpp" -#include "compute_distance_vrabitq.hpp" ''' newline = "\n" contents = f''' diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim128_t8.cu index c34298040c..c5d3579d25 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim128_t8.cu +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim128_t8.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim256_t16.cu index a8b4c726c2..c1f000700d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim256_t16.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim512_t32.cu index 6c8a090093..ea51d0af5c 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim512_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_float_uint32_dim512_t32.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim128_t8.cu index cfc13a2e5d..c8bc8d46e3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim128_t8.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim256_t16.cu index fc6e5084b2..faa3e5d765 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim256_t16.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim512_t32.cu index bad82cbb4e..1a1eb89630 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim512_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_half_uint32_dim512_t32.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim128_t8.cu index 4babd36f31..104059b9ed 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim128_t8.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim256_t16.cu index 4593798241..4e057ed800 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim256_t16.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim512_t32.cu index 987d7e8f26..5b76a3c17b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim512_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_int8_uint32_dim512_t32.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim128_t8.cu index 02f86c8d74..82d7c39886 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim128_t8.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim128_t8.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim256_t16.cu index 8c01303faa..2a5349bcf5 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim256_t16.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim256_t16.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim512_t32.cu index d39fe01d7c..7951da6233 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim512_t32.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_CosineExpanded_uint8_uint32_dim512_t32.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 62ba0adbacb6f4e7eb62da3709b86a4dd61880aa Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Tue, 21 Oct 2025 17:40:36 +0900 Subject: [PATCH 008/119] Add as_u32 --- cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh index 2806e88646..4395c46408 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh @@ -115,6 +115,9 @@ template struct fp_8bit4 { fp_8bit x, y, z, w; HDI fp_8bit4() : x(0), y(0), z(0), w(0) {} + + HDI uint32_t& as_u32() { return *reinterpret_cast(this); } + HDI uint32_t as_u32() const { return *reinterpret_cast(this); } }; } // namespace cuvs::neighbors::ivf_pq::detail From 7f9f614355256a88561f4a4da38538d8671b0e7b Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Tue, 21 Oct 2025 23:36:45 +0900 Subject: [PATCH 009/119] Update VPQ --- .../cagra/compute_distance_vpq-impl.cuh | 52 ++++++++++++------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 7545c25a2b..abd9146144 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -26,6 +26,11 @@ namespace cuvs::neighbors::cagra::detail { +using pq_val_t = half; +using pq_val_pack_t = half2; +using pq_val_pack_uint_t = uint32_t; +constexpr uint32_t pq_val_pack_num_elements = 2; + template (); + (1 << PQ_BITS) * PQ_LEN * utils::size_of() / pq_val_pack_num_elements; _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(setup_workspace_type* setup_workspace_impl, compute_distance_type* compute_distance_impl, @@ -178,19 +183,22 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, // Copy PQ table for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) { - half2 buf2; - buf2.x = r->pq_code_book_ptr()[i]; - buf2.y = r->pq_code_book_ptr()[i + 1]; - // Change the order of PQ code book array to reduce the // frequency of bank conflicts. 
- constexpr auto num_elements_per_bank = 4 / utils::size_of(); + constexpr auto num_elements_per_bank = + pq_val_pack_num_elements / + (utils::size_of() / utils::size_of()); constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank; const auto j = i / num_elements_per_bank; const auto smem_index = (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS); - device::sts(codebook_buf + smem_index * sizeof(half2), buf2); + if constexpr (std::is_same_v) { + half2 buf2; + buf2.x = r->pq_code_book_ptr()[i]; + buf2.y = r->pq_code_book_ptr()[i + 1]; + device::sts(codebook_buf + smem_index * sizeof(half2), buf2); + } } } @@ -286,22 +294,28 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( for (std::uint32_t v = 0; v < vlen; v++) { if (PQ_LEN * (v + k) >= dim) break; #pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN / 2; m++) { + for (std::uint32_t m = 0; m < PQ_LEN / pq_val_pack_num_elements; m++) { constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN); - const std::uint32_t d1 = m + (PQ_LEN / 2) * v; - const std::uint32_t d = - d1 * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId; - half2 q2, c2; - // Loading query vector from smem - device::lds(q2, query_ptr + sizeof(half2) * d); + const std::uint32_t d1 = m + (PQ_LEN / pq_val_pack_num_elements) * v; + const std::uint32_t d = d1 * kQueryBlock + + elem_offset * (PQ_LEN / pq_val_pack_num_elements) + + e * TeamSize + laneId; + half2 q2; + pq_val_pack_t c2; // Loading PQ code book from smem device::lds(c2, pq_codebook_ptr + - sizeof(CODE_BOOK_T) * ((1 << PQ_BITS) * 2 * m + (2 * (pq_code & 0xff)))); - // L2 distance - auto dist = q2 - c2 - reinterpret_cast(vq_vals)[d1]; - dist = dist * dist; - norm += static_cast(dist.x + dist.y); + sizeof(pq_val_pack_uint_t) * ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); + + if constexpr (std::is_same_v) { + // Loading query vector from smem + device::lds(q2, query_ptr + sizeof(half2) * d); + // L2 
distance + auto dist = q2 - c2 - reinterpret_cast(vq_vals)[d1]; + dist = dist * dist; + norm += static_cast(dist.x + dist.y); + } else { + } } pq_code >>= 8; } From 058abbb6e56a016efe21d32132a55227605cdd8a Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 22 Oct 2025 01:38:39 +0900 Subject: [PATCH 010/119] Fix fp_8bit4 constructor --- cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh index 4395c46408..359b638a41 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh @@ -114,7 +114,7 @@ struct fp_8bit { template struct fp_8bit4 { fp_8bit x, y, z, w; - HDI fp_8bit4() : x(0), y(0), z(0), w(0) {} + HDI fp_8bit4() : x(0.f), y(0.f), z(0.f), w(0.f) {} HDI uint32_t& as_u32() { return *reinterpret_cast(this); } HDI uint32_t as_u32() const { return *reinterpret_cast(this); } From e64f8c67af9c8c975fa611a48d986810eca61cfd Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 22 Oct 2025 01:39:09 +0900 Subject: [PATCH 011/119] Add sts for u32 --- cpp/src/neighbors/detail/cagra/device_common.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 882928add0..ed583d6f5a 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -310,6 +310,11 @@ RAFT_DEVICE_INLINE_FUNCTION void lds(uint4& x, const uint4* addr) lds(x, uint32_t(__cvta_generic_to_shared(addr))); } +RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const uint32_t& x) +{ + asm volatile("st.shared.u32 [%0], %1;" : : "r"(addr), "r"(reinterpret_cast(x))); +} + RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const half2& x) { asm volatile("st.shared.v2.u16 [%0], {%1, %2};" From 77492dd7439374fc08841538a0347ed7c4beb245 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 22 Oct 2025 01:40:08 +0900 Subject: [PATCH 012/119] Add f8 --- .../cagra/compute_distance_vpq-impl.cuh | 143 ++++++++++++------ 1 file changed, 93 insertions(+), 50 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index abd9146144..f5e66830ae 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -16,6 +16,7 @@ #pragma once +#include "../../ivf_pq/ivf_pq_fp_8bit.cuh" #include "compute_distance_vpq.hpp" #include @@ -26,10 +27,10 @@ namespace cuvs::neighbors::cagra::detail { -using pq_val_t = half; -using pq_val_pack_t = half2; +using pq_val_t = ivf_pq::detail::fp_8bit<5, true>; +using pq_val_pack_t = ivf_pq::detail::fp_8bit4<5, true>; using pq_val_pack_uint_t = uint32_t; -constexpr uint32_t pq_val_pack_num_elements = 2; +constexpr uint32_t pq_val_pack_num_elements = 4; template () / utils::size_of()); - constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank; - const auto j = i / num_elements_per_bank; - const auto smem_index = - (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS); - - if constexpr (std::is_same_v) { - half2 buf2; - buf2.x = r->pq_code_book_ptr()[i]; - buf2.y = r->pq_code_book_ptr()[i + 1]; - device::sts(codebook_buf + smem_index * sizeof(half2), buf2); + + if constexpr (PQ_LEN > num_elements_per_bank) { // safety + 
constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank; + const auto j = i / num_elements_per_bank; + const auto smem_index = + (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS); + + if constexpr (std::is_same_v) { + pq_val_pack_t buf2; + buf2.x = r->pq_code_book_ptr()[i]; + buf2.y = r->pq_code_book_ptr()[i + 1]; + device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2); + } else { + pq_val_pack_t buf4; + buf4.x = static_cast(static_cast(r->pq_code_book_ptr()[i])); + buf4.y = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); + buf4.z = static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); + buf4.w = static_cast(static_cast(r->pq_code_book_ptr()[i + 3])); + device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf4.as_u32()); + } } } } @@ -275,49 +286,81 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( } // if constexpr (PQ_LEN % 2 == 0) { - // **** Use half2 for distance computation **** + if constexpr (PQ_LEN >= pq_val_pack_num_elements) { // safety + // **** Use half2 for distance computation **** #pragma unroll - for (std::uint32_t e = 0; e < nelem; e++) { - const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen; - if (k >= n_subspace) break; - // Loading VQ code-book - half2 vq_vals[PQ_LEN][vlen / 2]; + for (std::uint32_t e = 0; e < nelem; e++) { + const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen; + if (k >= n_subspace) break; + // Loading VQ code-book + half2 vq_vals[PQ_LEN][vlen / 2]; #pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN; m++) { - const uint32_t d = (vlen * m) + (PQ_LEN * k); - if (d >= dim) break; - device::ldg_ca(vq_vals[m], vq_code_book_ptr + d); - } - // Compute distance - PQ_CODEBOOK_LOAD_T pq_code = pq_codes[e]; + for (std::uint32_t m = 0; m < PQ_LEN; m++) { + const uint32_t d = (vlen * m) + (PQ_LEN * k); + if (d >= dim) break; + device::ldg_ca(vq_vals[m], vq_code_book_ptr + d); + } + // Compute distance 
+ PQ_CODEBOOK_LOAD_T pq_code = pq_codes[e]; #pragma unroll - for (std::uint32_t v = 0; v < vlen; v++) { - if (PQ_LEN * (v + k) >= dim) break; + for (std::uint32_t v = 0; v < vlen; v++) { + if (PQ_LEN * (v + k) >= dim) break; #pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN / pq_val_pack_num_elements; m++) { - constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN); - const std::uint32_t d1 = m + (PQ_LEN / pq_val_pack_num_elements) * v; - const std::uint32_t d = d1 * kQueryBlock + - elem_offset * (PQ_LEN / pq_val_pack_num_elements) + - e * TeamSize + laneId; - half2 q2; - pq_val_pack_t c2; - // Loading PQ code book from smem - device::lds(c2, - pq_codebook_ptr + - sizeof(pq_val_pack_uint_t) * ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); - - if constexpr (std::is_same_v) { - // Loading query vector from smem - device::lds(q2, query_ptr + sizeof(half2) * d); - // L2 distance - auto dist = q2 - c2 - reinterpret_cast(vq_vals)[d1]; - dist = dist * dist; - norm += static_cast(dist.x + dist.y); - } else { + for (std::uint32_t m = 0; m < PQ_LEN / pq_val_pack_num_elements; m++) { + constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN); + std::uint32_t d1 = + m * (pq_val_pack_num_elements / 2) + (PQ_LEN / pq_val_pack_num_elements) * v; + std::uint32_t d = d1 * kQueryBlock + + elem_offset * (PQ_LEN / pq_val_pack_num_elements) + e * TeamSize + + laneId; + half2 q2; + // if constexpr (false) { + if constexpr (std::is_same_v) { + half2 c2; + // Loading PQ code book from smem + device::lds(c2, + pq_codebook_ptr + sizeof(pq_val_pack_uint_t) * + ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); + + // Loading query vector from smem + device::lds(q2, query_ptr + sizeof(half2) * d); + half2 c2_ = c2; + // L2 distance + auto dist = q2 - c2_ - reinterpret_cast(vq_vals)[d1]; + dist = dist * dist; + norm += static_cast(dist.x + dist.y); + } else { + pq_val_pack_t c2; + // Loading PQ code book from smem + device::lds(c2.as_u32(), + pq_codebook_ptr + sizeof(pq_val_pack_uint_t) 
* + ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); + + half2 c2_; + + // Loading query vector from smem + device::lds(q2, query_ptr + sizeof(half2) * d); + c2_.x = static_cast(c2.x); + c2_.y = static_cast(c2.y); + // L2 distance + auto dist = q2 - c2_ - reinterpret_cast(vq_vals)[d1]; + dist = dist * dist; + norm += static_cast(dist.x + dist.y); + + d1 += 1; + d += kQueryBlock; + + device::lds(q2, query_ptr + sizeof(half2) * d); + c2_.x = static_cast(c2.z); + c2_.y = static_cast(c2.w); + // L2 distance + dist = q2 - c2_ - reinterpret_cast(vq_vals)[d1]; + dist = dist * dist; + norm += static_cast(dist.x + dist.y); + } } + pq_code >>= 8; } - pq_code >>= 8; } } } else { From 4638eb28c716be8922accc1cb14ae18f0db8fab0 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 22 Oct 2025 11:00:39 +0900 Subject: [PATCH 013/119] Fix a bug --- .../detail/cagra/compute_distance_vpq-impl.cuh | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index f5e66830ae..088af6c01b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -27,10 +27,17 @@ namespace cuvs::neighbors::cagra::detail { +#if 1 using pq_val_t = ivf_pq::detail::fp_8bit<5, true>; using pq_val_pack_t = ivf_pq::detail::fp_8bit4<5, true>; using pq_val_pack_uint_t = uint32_t; constexpr uint32_t pq_val_pack_num_elements = 4; +#else +using pq_val_t = half; +using pq_val_pack_t = half2; +using pq_val_pack_uint_t = uint32_t; +constexpr uint32_t pq_val_pack_num_elements = 2; +#endif template ) { From a64f2642e941458704497e07e11e32ead919e33a Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 22 Oct 2025 13:01:16 +0900 Subject: [PATCH 014/119] Add native f8 support --- .../detail/cagra/compute_distance_vpq-impl.cuh | 4 ++-- cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh | 16 ++++++++++++++-- 2 files 
changed, 16 insertions(+), 4 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 088af6c01b..21078e73da 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -28,8 +28,8 @@ namespace cuvs::neighbors::cagra::detail { #if 1 -using pq_val_t = ivf_pq::detail::fp_8bit<5, true>; -using pq_val_pack_t = ivf_pq::detail::fp_8bit4<5, true>; +using pq_val_pack_t = ivf_pq::detail::fp_8bit4<5, true, false>; +using pq_val_t = typename pq_val_pack_t::unit_t; using pq_val_pack_uint_t = uint32_t; constexpr uint32_t pq_val_pack_num_elements = 4; #else diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh index 359b638a41..c4e1c08c01 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh @@ -32,6 +32,7 @@ #include #include +#include namespace cuvs::neighbors::ivf_pq::detail { @@ -111,9 +112,20 @@ struct fp_8bit { } }; -template +template struct fp_8bit4 { - fp_8bit x, y, z, w; + using unit_t = fp_8bit; + unit_t x, y, z, w; + HDI fp_8bit4() : x(0.f), y(0.f), z(0.f), w(0.f) {} + + HDI uint32_t& as_u32() { return *reinterpret_cast(this); } + HDI uint32_t as_u32() const { return *reinterpret_cast(this); } +}; + +template <> +struct fp_8bit4<5, true, false> { + using unit_t = __nv_fp8_e5m2; + unit_t x, y, z, w; HDI fp_8bit4() : x(0.f), y(0.f), z(0.f), w(0.f) {} HDI uint32_t& as_u32() { return *reinterpret_cast(this); } From 60ba5e997397c010e7e6d3fc6cc25081a75f3c9b Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 22 Oct 2025 15:47:27 +0900 Subject: [PATCH 015/119] Fix VPQ init --- cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 21078e73da..afb9d0bbcc 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -198,7 +198,7 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, pq_val_pack_num_elements / (utils::size_of() / utils::size_of()); - if constexpr (PQ_LEN > num_elements_per_bank) { // safety + if constexpr (PQ_LEN >= num_elements_per_bank) { // safety constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank; const auto j = i / num_elements_per_bank; const auto smem_index = From f37a13160838fb9c26e0029fa9b2f759a187cd37 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 27 Aug 2025 01:14:46 +0900 Subject: [PATCH 016/119] Update clock measure --- .../neighbors/detail/cagra/device_common.hpp | 11 + .../cagra/search_multi_cta_kernel-inl.cuh | 1156 +++++++++-------- .../cagra/search_single_cta_kernel-inl.cuh | 30 +- 3 files changed, 610 insertions(+), 587 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index ed583d6f5a..9715a0473f 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -15,6 +15,8 @@ */ #pragma once +// #define _CLK_BREAKDOWN + #include "hashmap.hpp" #include "utils.hpp" @@ -186,6 +188,9 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( const IndexT* __restrict__ parent_indices, const IndexT* __restrict__ internal_topk_list, const uint32_t search_width, +#ifdef _CLK_BREAKDOWN + std::uint64_t& clk_compute_actual_distance, +#endif int* __restrict__ result_position = nullptr, const int max_result_position = 0) { @@ -238,11 +243,17 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( // > const auto child_dist = dataset_desc.compute_distance(child_id, child_id != invalid_index); // Instead, we manually inline this 
function for performance reasons. // This allows us to move the fetching of the arguments from shared memory out of the loop. +#ifdef _CLK_BREAKDOWN + const auto start_clock = clock64(); +#endif const DistanceT child_dist = device::team_sum( (child_id != invalid_index) ? compute_distance(args, child_id) : (lead_lane ? raft::upper_bound() : 0), team_size_bits); __syncwarp(); +#ifdef _CLK_BREAKDOWN + clk_compute_actual_distance += clock64() - start_clock; +#endif // Store the distance if (valid_i && lead_lane) { result_child_distances_ptr[j] = child_dist; } diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh index ea738b137b..7860a32eba 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -1,575 +1,581 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include "search_multi_cta_kernel.cuh" - -#include "bitonic.hpp" -#include "compute_distance-ext.cuh" -#include "device_common.hpp" -#include "hashmap.hpp" -#include "search_plan.cuh" -#include "topk_for_cagra/topk.h" // TODO replace with raft topk if possible -#include "utils.hpp" - -#include -#include -#include -#include -#include - -#include - -#include - -// TODO: This shouldn't be invoking anything from spatial/knn -#include "../ann_utils.cuh" - -#include -#include // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp - -#include -#include -#include -#include -#include -#include -#include - -namespace cuvs::neighbors::cagra::detail { -namespace multi_cta_search { - -// #define _CLK_BREAKDOWN - -template -RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parent( - INDEX_T* const next_parent_indices, - INDEX_T* const itopk_indices, // [itopk_size * 2] - DISTANCE_T* const itopk_distances, // [itopk_size * 2] - INDEX_T* const hash_ptr, - const uint32_t hash_bitlen) -{ - constexpr uint32_t itopk_size = 32; - constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; - constexpr INDEX_T invalid_index = ~static_cast(0); - - const unsigned warp_id = threadIdx.x / 32; - if (warp_id > 0) { return; } - if (threadIdx.x == 0) { next_parent_indices[0] = invalid_index; } - __syncwarp(); - - int j = -1; - for (unsigned i = threadIdx.x; i < itopk_size * 2; i += 32) { - INDEX_T index = itopk_indices[i]; - int is_invalid = 0; - int is_candidate = 0; - if (index == invalid_index) { - is_invalid = 1; - } else if (index & index_msb_1_mask) { - } else { - is_candidate = 1; - } - - const auto ballot_mask = __ballot_sync(0xffffffff, is_candidate); - const auto candidate_id = __popc(ballot_mask & ((1 << threadIdx.x) - 1)); - for (int k = 0; k < __popc(ballot_mask); k++) { - int flag_done = 0; - if (is_candidate && candidate_id == k) { - is_candidate = 0; - if (hashmap::insert(hash_ptr, hash_bitlen, index)) { - // 
Use this candidate as next parent - index |= index_msb_1_mask; // set most significant bit as used node - if (i < itopk_size) { - next_parent_indices[0] = i; - itopk_indices[i] = index; - } else { - next_parent_indices[0] = j; - // Move the next parent node from i-th position to j-th position - itopk_indices[j] = index; - itopk_distances[j] = itopk_distances[i]; - itopk_indices[i] = invalid_index; - itopk_distances[i] = utils::get_max_value(); - } - flag_done = 1; - } else { - // Deactivate the node since it has been used by other CTA. - itopk_indices[i] = invalid_index; - itopk_distances[i] = utils::get_max_value(); - is_invalid = 1; - } - } - if (__any_sync(0xffffffff, (flag_done > 0))) { return; } - } - if (i < itopk_size) { - j = 31 - __clz(__ballot_sync(0xffffffff, is_invalid)); - if (j < 0) { return; } - } - } -} - -template -RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort(float* distances, // [num_elements] - INDEX_T* indices, // [num_elements] - const uint32_t num_elements) -{ - const unsigned warp_id = threadIdx.x / 32; - if (warp_id > 0) { return; } - const unsigned lane_id = threadIdx.x % 32; - constexpr unsigned N = (MAX_ELEMENTS + 31) / 32; - float key[N]; - INDEX_T val[N]; - for (unsigned i = 0; i < N; i++) { - unsigned j = lane_id + (32 * i); - if (j < num_elements) { - key[i] = distances[j]; - val[i] = indices[j]; - } else { - key[i] = utils::get_max_value(); - val[i] = ~static_cast(0); - } - } - /* Warp Sort */ - bitonic::warp_sort(key, val); - /* Store sorted results */ - for (unsigned i = 0; i < N; i++) { - unsigned j = (N * lane_id) + i; - if (j < num_elements) { - distances[j] = key[i]; - indices[j] = val[i]; - } - } -} - -// -// multiple CTAs per single query -// -template -RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( - typename DATASET_DESCRIPTOR_T::INDEX_T* const - result_indices_ptr, // [num_queries, num_cta_per_query, itopk_size] - typename DATASET_DESCRIPTOR_T::DISTANCE_T* const - result_distances_ptr, // [num_queries, 
num_cta_per_query, itopk_size] - const DATASET_DESCRIPTOR_T* dataset_desc, - const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] - const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph, // [dataset_size, graph_degree] - const uint32_t graph_degree, - const unsigned num_distilation, - const uint64_t rand_xor_mask, - const typename DATASET_DESCRIPTOR_T::INDEX_T* seed_ptr, // [num_queries, num_seeds] - const uint32_t num_seeds, - const uint32_t visited_hash_bitlen, - typename DATASET_DESCRIPTOR_T::INDEX_T* const - traversed_hashmap_ptr, // [num_queries, 1 << traversed_hash_bitlen] - const uint32_t traversed_hash_bitlen, - const uint32_t itopk_size, - const uint32_t min_iteration, - const uint32_t max_iteration, - uint32_t* const num_executed_iterations, /* stats */ - SAMPLE_FILTER_T sample_filter) -{ - using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; - using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; - using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; - - const auto num_queries = gridDim.y; - const auto query_id = blockIdx.y; - const auto num_cta_per_query = gridDim.x; - const auto cta_id = blockIdx.x; // local CTA ID - -#ifdef _CLK_BREAKDOWN - uint64_t clk_init = 0; - uint64_t clk_compute_1st_distance = 0; - uint64_t clk_topk = 0; - uint64_t clk_pickup_parents = 0; - uint64_t clk_compute_distance = 0; - uint64_t clk_start; -#define _CLK_START() clk_start = clock64() -#define _CLK_REC(V) V += clock64() - clk_start; -#else -#define _CLK_START() -#define _CLK_REC(V) -#endif - _CLK_START(); - - extern __shared__ uint8_t smem[]; - - // Layout of result_buffer - // +----------------+---------+---------------------------+ - // | internal_top_k | padding | neighbors of parent nodes | - // | | upto 32 | | - // +----------------+---------+---------------------------+ - // |<--- result_buffer_size_32 --->| - const auto result_buffer_size = itopk_size + graph_degree; - const auto result_buffer_size_32 = 
raft::round_up_safe(result_buffer_size, 32); - assert(result_buffer_size_32 <= MAX_ELEMENTS); - - // Set smem working buffer for the distance calculation - dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id); - - auto* __restrict__ result_indices_buffer = - reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes()); - auto* __restrict__ result_distances_buffer = - reinterpret_cast(result_indices_buffer + result_buffer_size_32); - auto* __restrict__ local_visited_hashmap_ptr = - reinterpret_cast(result_distances_buffer + result_buffer_size_32); - auto* __restrict__ parent_indices_buffer = - reinterpret_cast(local_visited_hashmap_ptr + hashmap::get_size(visited_hash_bitlen)); - auto* __restrict__ result_position = reinterpret_cast(parent_indices_buffer + 1); - - INDEX_T* const local_traversed_hashmap_ptr = - traversed_hashmap_ptr + (hashmap::get_size(traversed_hash_bitlen) * query_id); - - constexpr INDEX_T invalid_index = ~static_cast(0); - constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; - - for (unsigned i = threadIdx.x; i < result_buffer_size_32; i += blockDim.x) { - result_indices_buffer[i] = invalid_index; - result_distances_buffer[i] = utils::get_max_value(); - } - hashmap::init(local_visited_hashmap_ptr, visited_hash_bitlen); - __syncthreads(); - _CLK_REC(clk_init); - - // compute distance to randomly selecting nodes - _CLK_START(); - const INDEX_T* const local_seed_ptr = seed_ptr ? 
seed_ptr + (num_seeds * query_id) : nullptr; - uint32_t block_id = cta_id + (num_cta_per_query * query_id); - uint32_t num_blocks = num_cta_per_query * num_queries; - - device::compute_distance_to_random_nodes(result_indices_buffer, - result_distances_buffer, - *dataset_desc, - graph_degree, - num_distilation, - rand_xor_mask, - local_seed_ptr, - num_seeds, - local_visited_hashmap_ptr, - visited_hash_bitlen, - local_traversed_hashmap_ptr, - traversed_hash_bitlen, - block_id, - num_blocks); - __syncthreads(); - _CLK_REC(clk_compute_1st_distance); - - uint32_t iter = 0; - while (1) { - _CLK_START(); - if (threadIdx.x < 32) { - // [1st warp] Topk with bitonic sort - topk_by_bitonic_sort( - result_distances_buffer, result_indices_buffer, result_buffer_size_32); - } - __syncthreads(); - _CLK_REC(clk_topk); - - if (iter + 1 >= max_iteration) { break; } - - _CLK_START(); - if (threadIdx.x < 32) { - // [1st warp] Pick up a next parent - pickup_next_parent(parent_indices_buffer, - result_indices_buffer, - result_distances_buffer, - local_traversed_hashmap_ptr, - traversed_hash_bitlen); - } else { - // [Other warps] Reset visited hashmap - hashmap::init(local_visited_hashmap_ptr, visited_hash_bitlen, 32); - } - __syncthreads(); - _CLK_REC(clk_pickup_parents); - - if ((parent_indices_buffer[0] == invalid_index) && (iter >= min_iteration)) { break; } - - _CLK_START(); - for (unsigned i = threadIdx.x; i < result_buffer_size_32; i += blockDim.x) { - INDEX_T index = result_indices_buffer[i]; - if (index == invalid_index) { continue; } - if ((i >= itopk_size) && (index & index_msb_1_mask)) { - // Remove nodes kicked out of the itopk list from the traversed hash table. - hashmap::remove( - local_traversed_hashmap_ptr, traversed_hash_bitlen, index & ~index_msb_1_mask); - result_indices_buffer[i] = invalid_index; - result_distances_buffer[i] = utils::get_max_value(); - } else { - // Restore visited hashmap by putting nodes on result buffer in it. 
- index &= ~index_msb_1_mask; - hashmap::insert(local_visited_hashmap_ptr, visited_hash_bitlen, index); - } - } - // Initialize buffer for compute_distance_to_child_nodes. - if (threadIdx.x == blockDim.x - 1) { result_position[0] = result_buffer_size_32; } - __syncthreads(); - - // Compute the norms between child nodes and query node - device::compute_distance_to_child_nodes( - result_indices_buffer, - result_distances_buffer, - *dataset_desc, - knn_graph, - graph_degree, - local_visited_hashmap_ptr, - visited_hash_bitlen, - local_traversed_hashmap_ptr, - traversed_hash_bitlen, - parent_indices_buffer, - result_indices_buffer, - 1, - result_position, - result_buffer_size_32); - // __syncthreads(); - - // Check the state of the nodes in the result buffer which were not updated - // by the compute_distance_to_child_nodes above, and if it cannot be used as - // a parent node, it is deactivated. - for (uint32_t i = threadIdx.x; i < result_position[0]; i += blockDim.x) { - INDEX_T index = result_indices_buffer[i]; - if (index == invalid_index || index & index_msb_1_mask) { continue; } - if (hashmap::search(local_traversed_hashmap_ptr, traversed_hash_bitlen, index)) { - result_indices_buffer[i] = invalid_index; - result_distances_buffer[i] = utils::get_max_value(); - } - } - __syncthreads(); - _CLK_REC(clk_compute_distance); - - // Filtering - if constexpr (!std::is_same::value) { - for (unsigned p = threadIdx.x; p < 1; p += blockDim.x) { - if (parent_indices_buffer[p] != invalid_index) { - const auto parent_id = - result_indices_buffer[parent_indices_buffer[p]] & ~index_msb_1_mask; - if (!sample_filter(query_id, parent_id)) { - // If the parent must not be in the resulting top-k list, remove from the parent list - result_distances_buffer[parent_indices_buffer[p]] = utils::get_max_value(); - result_indices_buffer[parent_indices_buffer[p]] = invalid_index; - } - } - } - __syncthreads(); - } - - iter++; - } - - // Filtering - if constexpr (!std::is_same::value) { - for 
(uint32_t i = threadIdx.x; i < result_buffer_size_32; i += blockDim.x) { - INDEX_T index = result_indices_buffer[i]; - if (index == invalid_index) { continue; } - index &= ~index_msb_1_mask; - if (!sample_filter(query_id, index)) { - result_indices_buffer[i] = invalid_index; - result_distances_buffer[i] = utils::get_max_value(); - } - } - __syncthreads(); - } - - // Output search results (1st warp only). - if (threadIdx.x < 32) { - uint32_t offset = 0; - for (uint32_t i = threadIdx.x; i < result_buffer_size_32; i += 32) { - INDEX_T index = result_indices_buffer[i]; - bool is_valid = false; - if (index != invalid_index) { - if (index & index_msb_1_mask) { - is_valid = true; - index &= ~index_msb_1_mask; - } else if ((offset < itopk_size) && - hashmap::insert( - local_traversed_hashmap_ptr, traversed_hash_bitlen, index)) { - // If a node that is not used as a parent can be inserted into - // the traversed hash table, it is considered a valid result. - is_valid = true; - } - } - const auto mask = __ballot_sync(0xffffffff, is_valid); - if (is_valid) { - const auto j = offset + __popc(mask & ((1 << threadIdx.x) - 1)); - if (j < itopk_size) { - uint32_t k = j + (itopk_size * (cta_id + (num_cta_per_query * query_id))); - result_indices_ptr[k] = index & ~index_msb_1_mask; - if (result_distances_ptr != nullptr) { - result_distances_ptr[k] = result_distances_buffer[i]; - } - } else { - // If it is valid and registered in the traversed hash table but is - // not output as a result, it is removed from the hash table. - hashmap::remove(local_traversed_hashmap_ptr, traversed_hash_bitlen, index); - } - } - offset += __popc(mask); - } - // If the number of outputs is insufficient, fill in with invalid results. 
- for (uint32_t i = offset + threadIdx.x; i < itopk_size; i += 32) { - uint32_t k = i + (itopk_size * (cta_id + (num_cta_per_query * query_id))); - result_indices_ptr[k] = invalid_index; - if (result_distances_ptr != nullptr) { - result_distances_ptr[k] = utils::get_max_value(); - } - } - } - - if (threadIdx.x == 0 && cta_id == 0 && num_executed_iterations != nullptr) { - num_executed_iterations[query_id] = iter + 1; - } - -#ifdef _CLK_BREAKDOWN - if ((threadIdx.x == 0 || threadIdx.x == blockDim.x - 1) && (blockIdx.x == 0) && - ((query_id * 3) % gridDim.y < 3)) { - printf( - "%s:%d " - "query, %d, thread, %d" - ", init, %lu" - ", 1st_distance, %lu" - ", topk, %lu" - ", pickup_parents, %lu" - ", distance, %lu" - "\n", - __FILE__, - __LINE__, - query_id, - threadIdx.x, - clk_init, - clk_compute_1st_distance, - clk_topk, - clk_pickup_parents, - clk_compute_distance); - } -#endif -} - -template -RAFT_KERNEL set_value_batch_kernel(T* const dev_ptr, - const std::size_t ld, - const T val, - const std::size_t count, - const std::size_t batch_size) -{ - const auto tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid >= count * batch_size) { return; } - const auto batch_id = tid / count; - const auto elem_id = tid % count; - dev_ptr[elem_id + ld * batch_id] = val; -} - -template -void set_value_batch(T* const dev_ptr, - const std::size_t ld, - const T val, - const std::size_t count, - const std::size_t batch_size, - cudaStream_t cuda_stream) -{ - constexpr std::uint32_t block_size = 256; - const auto grid_size = (count * batch_size + block_size - 1) / block_size; - set_value_batch_kernel - <<>>(dev_ptr, ld, val, count, batch_size); -} - -template -struct search_kernel_config { - // Search kernel function type. Note that the actual values for the template value - // parameters do not matter, because they are not part of the function signature. The - // second to fourth value parameters will be selected by the choose_* functions below. 
- using kernel_t = decltype(&search_kernel<128, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>); - - static auto choose_buffer_size(unsigned result_buffer_size, unsigned block_size) -> kernel_t - { - if (result_buffer_size <= 64) { - return search_kernel<64, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>; - } else if (result_buffer_size <= 128) { - return search_kernel<128, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>; - } else if (result_buffer_size <= 256) { - return search_kernel<256, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>; - } - THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256); - } -}; - -template -void select_and_run(const dataset_descriptor_host& dataset_desc, - raft::device_matrix_view graph, - IndexT* topk_indices_ptr, // [num_queries, topk] - DistanceT* topk_distances_ptr, // [num_queries, topk] - const DataT* queries_ptr, // [num_queries, dataset_dim] - uint32_t num_queries, - const IndexT* dev_seed_ptr, // [num_queries, num_seeds] - uint32_t* num_executed_iterations, // [num_queries,] - const search_params& ps, - uint32_t topk, - // multi_cta_search (params struct) - uint32_t block_size, // - uint32_t result_buffer_size, - uint32_t smem_size, - uint32_t visited_hash_bitlen, - int64_t traversed_hash_bitlen, - IndexT* traversed_hashmap_ptr, - uint32_t num_cta_per_query, - uint32_t num_seeds, - SampleFilterT sample_filter, - cudaStream_t stream) -{ - auto kernel = - search_kernel_config, - SampleFilterT>::choose_buffer_size(result_buffer_size, block_size); - - RAFT_CUDA_TRY( - cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); - // Initialize hash table - const uint32_t traversed_hash_size = hashmap::get_size(traversed_hash_bitlen); - set_value_batch(traversed_hashmap_ptr, - traversed_hash_size, - ~static_cast(0), - traversed_hash_size, - num_queries, - stream); - - dim3 block_dims(block_size, 1, 1); - dim3 grid_dims(num_cta_per_query, num_queries, 1); - RAFT_LOG_DEBUG("Launching kernel with %u threads, 
(%u, %u) blocks %u smem", - block_size, - num_cta_per_query, - num_queries, - smem_size); - - kernel<<>>(topk_indices_ptr, - topk_distances_ptr, - dataset_desc.dev_ptr(stream), - queries_ptr, - graph.data_handle(), - graph.extent(1), - ps.num_random_samplings, - ps.rand_xor_mask, - dev_seed_ptr, - num_seeds, - visited_hash_bitlen, - traversed_hashmap_ptr, - traversed_hash_bitlen, - ps.itopk_size, - ps.min_iterations, - ps.max_iterations, - num_executed_iterations, - sample_filter); -} - -} // namespace multi_cta_search -} // namespace cuvs::neighbors::cagra::detail +/* + * Copyright (c) 2023-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "search_multi_cta_kernel.cuh" + +#include "bitonic.hpp" +#include "compute_distance-ext.cuh" +#include "device_common.hpp" +#include "hashmap.hpp" +#include "search_plan.cuh" +#include "topk_for_cagra/topk.h" // TODO replace with raft topk if possible +#include "utils.hpp" + +#include +#include +#include +#include +#include + +#include + +#include + +// TODO: This shouldn't be invoking anything from spatial/knn +#include "../ann_utils.cuh" + +#include +#include // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp + +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::neighbors::cagra::detail { +namespace multi_cta_search { + +// #define _CLK_BREAKDOWN + +template +RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parent( + INDEX_T* const next_parent_indices, + INDEX_T* const itopk_indices, // [itopk_size * 2] + DISTANCE_T* const itopk_distances, // [itopk_size * 2] + INDEX_T* const hash_ptr, + const uint32_t hash_bitlen) +{ + constexpr uint32_t itopk_size = 32; + constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; + constexpr INDEX_T invalid_index = ~static_cast(0); + + const unsigned warp_id = threadIdx.x / 32; + if (warp_id > 0) { return; } + if (threadIdx.x == 0) { next_parent_indices[0] = invalid_index; } + __syncwarp(); + + int j = -1; + for (unsigned i = threadIdx.x; i < itopk_size * 2; i += 32) { + INDEX_T index = itopk_indices[i]; + int is_invalid = 0; + int is_candidate = 0; + if (index == invalid_index) { + is_invalid = 1; + } else if (index & index_msb_1_mask) { + } else { + is_candidate = 1; + } + + const auto ballot_mask = __ballot_sync(0xffffffff, is_candidate); + const auto candidate_id = __popc(ballot_mask & ((1 << threadIdx.x) - 1)); + for (int k = 0; k < __popc(ballot_mask); k++) { + int flag_done = 0; + if (is_candidate && candidate_id == k) { + is_candidate = 0; + if (hashmap::insert(hash_ptr, hash_bitlen, index)) { + // 
Use this candidate as next parent + index |= index_msb_1_mask; // set most significant bit as used node + if (i < itopk_size) { + next_parent_indices[0] = i; + itopk_indices[i] = index; + } else { + next_parent_indices[0] = j; + // Move the next parent node from i-th position to j-th position + itopk_indices[j] = index; + itopk_distances[j] = itopk_distances[i]; + itopk_indices[i] = invalid_index; + itopk_distances[i] = utils::get_max_value(); + } + flag_done = 1; + } else { + // Deactivate the node since it has been used by other CTA. + itopk_indices[i] = invalid_index; + itopk_distances[i] = utils::get_max_value(); + is_invalid = 1; + } + } + if (__any_sync(0xffffffff, (flag_done > 0))) { return; } + } + if (i < itopk_size) { + j = 31 - __clz(__ballot_sync(0xffffffff, is_invalid)); + if (j < 0) { return; } + } + } +} + +template +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort(float* distances, // [num_elements] + INDEX_T* indices, // [num_elements] + const uint32_t num_elements) +{ + const unsigned warp_id = threadIdx.x / 32; + if (warp_id > 0) { return; } + const unsigned lane_id = threadIdx.x % 32; + constexpr unsigned N = (MAX_ELEMENTS + 31) / 32; + float key[N]; + INDEX_T val[N]; + for (unsigned i = 0; i < N; i++) { + unsigned j = lane_id + (32 * i); + if (j < num_elements) { + key[i] = distances[j]; + val[i] = indices[j]; + } else { + key[i] = utils::get_max_value(); + val[i] = ~static_cast(0); + } + } + /* Warp Sort */ + bitonic::warp_sort(key, val); + /* Store sorted results */ + for (unsigned i = 0; i < N; i++) { + unsigned j = (N * lane_id) + i; + if (j < num_elements) { + distances[j] = key[i]; + indices[j] = val[i]; + } + } +} + +// +// multiple CTAs per single query +// +template +RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( + typename DATASET_DESCRIPTOR_T::INDEX_T* const + result_indices_ptr, // [num_queries, num_cta_per_query, itopk_size] + typename DATASET_DESCRIPTOR_T::DISTANCE_T* const + result_distances_ptr, // [num_queries, 
num_cta_per_query, itopk_size] + const DATASET_DESCRIPTOR_T* dataset_desc, + const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr, // [num_queries, dataset_dim] + const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph, // [dataset_size, graph_degree] + const uint32_t graph_degree, + const unsigned num_distilation, + const uint64_t rand_xor_mask, + const typename DATASET_DESCRIPTOR_T::INDEX_T* seed_ptr, // [num_queries, num_seeds] + const uint32_t num_seeds, + const uint32_t visited_hash_bitlen, + typename DATASET_DESCRIPTOR_T::INDEX_T* const + traversed_hashmap_ptr, // [num_queries, 1 << traversed_hash_bitlen] + const uint32_t traversed_hash_bitlen, + const uint32_t itopk_size, + const uint32_t min_iteration, + const uint32_t max_iteration, + uint32_t* const num_executed_iterations, /* stats */ + SAMPLE_FILTER_T sample_filter) +{ + using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; + using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; + using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; + + const auto num_queries = gridDim.y; + const auto query_id = blockIdx.y; + const auto num_cta_per_query = gridDim.x; + const auto cta_id = blockIdx.x; // local CTA ID + +#ifdef _CLK_BREAKDOWN + uint64_t clk_init = 0; + uint64_t clk_compute_1st_distance = 0; + uint64_t clk_topk = 0; + uint64_t clk_pickup_parents = 0; + uint64_t clk_compute_distance = 0; + uint64_t clk_compute_actual_distance = 0; + uint64_t clk_start; +#define _CLK_START() clk_start = clock64() +#define _CLK_REC(V) V += clock64() - clk_start; +#else +#define _CLK_START() +#define _CLK_REC(V) +#endif + _CLK_START(); + + extern __shared__ uint8_t smem[]; + + // Layout of result_buffer + // +----------------+---------+---------------------------+ + // | internal_top_k | padding | neighbors of parent nodes | + // | | upto 32 | | + // +----------------+---------+---------------------------+ + // |<--- result_buffer_size_32 --->| + const auto result_buffer_size = itopk_size + 
graph_degree; + const auto result_buffer_size_32 = raft::round_up_safe(result_buffer_size, 32); + assert(result_buffer_size_32 <= MAX_ELEMENTS); + + // Set smem working buffer for the distance calculation + dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id); + + auto* __restrict__ result_indices_buffer = + reinterpret_cast(smem + dataset_desc->smem_ws_size_in_bytes()); + auto* __restrict__ result_distances_buffer = + reinterpret_cast(result_indices_buffer + result_buffer_size_32); + auto* __restrict__ local_visited_hashmap_ptr = + reinterpret_cast(result_distances_buffer + result_buffer_size_32); + auto* __restrict__ parent_indices_buffer = + reinterpret_cast(local_visited_hashmap_ptr + hashmap::get_size(visited_hash_bitlen)); + auto* __restrict__ result_position = reinterpret_cast(parent_indices_buffer + 1); + + INDEX_T* const local_traversed_hashmap_ptr = + traversed_hashmap_ptr + (hashmap::get_size(traversed_hash_bitlen) * query_id); + + constexpr INDEX_T invalid_index = ~static_cast(0); + constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; + + for (unsigned i = threadIdx.x; i < result_buffer_size_32; i += blockDim.x) { + result_indices_buffer[i] = invalid_index; + result_distances_buffer[i] = utils::get_max_value(); + } + hashmap::init(local_visited_hashmap_ptr, visited_hash_bitlen); + __syncthreads(); + _CLK_REC(clk_init); + + // compute distance to randomly selecting nodes + _CLK_START(); + const INDEX_T* const local_seed_ptr = seed_ptr ? 
seed_ptr + (num_seeds * query_id) : nullptr; + uint32_t block_id = cta_id + (num_cta_per_query * query_id); + uint32_t num_blocks = num_cta_per_query * num_queries; + + device::compute_distance_to_random_nodes(result_indices_buffer, + result_distances_buffer, + *dataset_desc, + graph_degree, + num_distilation, + rand_xor_mask, + local_seed_ptr, + num_seeds, + local_visited_hashmap_ptr, + visited_hash_bitlen, + local_traversed_hashmap_ptr, + traversed_hash_bitlen, + block_id, + num_blocks); + __syncthreads(); + _CLK_REC(clk_compute_1st_distance); + + uint32_t iter = 0; + while (1) { + _CLK_START(); + if (threadIdx.x < 32) { + // [1st warp] Topk with bitonic sort + topk_by_bitonic_sort( + result_distances_buffer, result_indices_buffer, result_buffer_size_32); + } + __syncthreads(); + _CLK_REC(clk_topk); + + if (iter + 1 >= max_iteration) { break; } + + _CLK_START(); + if (threadIdx.x < 32) { + // [1st warp] Pick up a next parent + pickup_next_parent(parent_indices_buffer, + result_indices_buffer, + result_distances_buffer, + local_traversed_hashmap_ptr, + traversed_hash_bitlen); + } else { + // [Other warps] Reset visited hashmap + hashmap::init(local_visited_hashmap_ptr, visited_hash_bitlen, 32); + } + __syncthreads(); + _CLK_REC(clk_pickup_parents); + + if ((parent_indices_buffer[0] == invalid_index) && (iter >= min_iteration)) { break; } + + _CLK_START(); + for (unsigned i = threadIdx.x; i < result_buffer_size_32; i += blockDim.x) { + INDEX_T index = result_indices_buffer[i]; + if (index == invalid_index) { continue; } + if ((i >= itopk_size) && (index & index_msb_1_mask)) { + // Remove nodes kicked out of the itopk list from the traversed hash table. + hashmap::remove( + local_traversed_hashmap_ptr, traversed_hash_bitlen, index & ~index_msb_1_mask); + result_indices_buffer[i] = invalid_index; + result_distances_buffer[i] = utils::get_max_value(); + } else { + // Restore visited hashmap by putting nodes on result buffer in it. 
+ index &= ~index_msb_1_mask; + hashmap::insert(local_visited_hashmap_ptr, visited_hash_bitlen, index); + } + } + // Initialize buffer for compute_distance_to_child_nodes. + if (threadIdx.x == blockDim.x - 1) { result_position[0] = result_buffer_size_32; } + __syncthreads(); + + // Compute the norms between child nodes and query node + device::compute_distance_to_child_nodes( + result_indices_buffer, + result_distances_buffer, + *dataset_desc, + knn_graph, + graph_degree, + local_visited_hashmap_ptr, + visited_hash_bitlen, + local_traversed_hashmap_ptr, + traversed_hash_bitlen, + parent_indices_buffer, + result_indices_buffer, + 1, +#ifdef _CLK_BREAKDOWN + clk_compute_actual_distance, +#endif + result_position, + result_buffer_size_32); + // __syncthreads(); + + // Check the state of the nodes in the result buffer which were not updated + // by the compute_distance_to_child_nodes above, and if it cannot be used as + // a parent node, it is deactivated. + for (uint32_t i = threadIdx.x; i < result_position[0]; i += blockDim.x) { + INDEX_T index = result_indices_buffer[i]; + if (index == invalid_index || index & index_msb_1_mask) { continue; } + if (hashmap::search(local_traversed_hashmap_ptr, traversed_hash_bitlen, index)) { + result_indices_buffer[i] = invalid_index; + result_distances_buffer[i] = utils::get_max_value(); + } + } + __syncthreads(); + _CLK_REC(clk_compute_distance); + + // Filtering + if constexpr (!std::is_same::value) { + for (unsigned p = threadIdx.x; p < 1; p += blockDim.x) { + if (parent_indices_buffer[p] != invalid_index) { + const auto parent_id = + result_indices_buffer[parent_indices_buffer[p]] & ~index_msb_1_mask; + if (!sample_filter(query_id, parent_id)) { + // If the parent must not be in the resulting top-k list, remove from the parent list + result_distances_buffer[parent_indices_buffer[p]] = utils::get_max_value(); + result_indices_buffer[parent_indices_buffer[p]] = invalid_index; + } + } + } + __syncthreads(); + } + + iter++; + } + + 
// Filtering + if constexpr (!std::is_same::value) { + for (uint32_t i = threadIdx.x; i < result_buffer_size_32; i += blockDim.x) { + INDEX_T index = result_indices_buffer[i]; + if (index == invalid_index) { continue; } + index &= ~index_msb_1_mask; + if (!sample_filter(query_id, index)) { + result_indices_buffer[i] = invalid_index; + result_distances_buffer[i] = utils::get_max_value(); + } + } + __syncthreads(); + } + + // Output search results (1st warp only). + if (threadIdx.x < 32) { + uint32_t offset = 0; + for (uint32_t i = threadIdx.x; i < result_buffer_size_32; i += 32) { + INDEX_T index = result_indices_buffer[i]; + bool is_valid = false; + if (index != invalid_index) { + if (index & index_msb_1_mask) { + is_valid = true; + index &= ~index_msb_1_mask; + } else if ((offset < itopk_size) && + hashmap::insert( + local_traversed_hashmap_ptr, traversed_hash_bitlen, index)) { + // If a node that is not used as a parent can be inserted into + // the traversed hash table, it is considered a valid result. + is_valid = true; + } + } + const auto mask = __ballot_sync(0xffffffff, is_valid); + if (is_valid) { + const auto j = offset + __popc(mask & ((1 << threadIdx.x) - 1)); + if (j < itopk_size) { + uint32_t k = j + (itopk_size * (cta_id + (num_cta_per_query * query_id))); + result_indices_ptr[k] = index & ~index_msb_1_mask; + if (result_distances_ptr != nullptr) { + result_distances_ptr[k] = result_distances_buffer[i]; + } + } else { + // If it is valid and registered in the traversed hash table but is + // not output as a result, it is removed from the hash table. + hashmap::remove(local_traversed_hashmap_ptr, traversed_hash_bitlen, index); + } + } + offset += __popc(mask); + } + // If the number of outputs is insufficient, fill in with invalid results. 
+ for (uint32_t i = offset + threadIdx.x; i < itopk_size; i += 32) { + uint32_t k = i + (itopk_size * (cta_id + (num_cta_per_query * query_id))); + result_indices_ptr[k] = invalid_index; + if (result_distances_ptr != nullptr) { + result_distances_ptr[k] = utils::get_max_value(); + } + } + } + + if (threadIdx.x == 0 && cta_id == 0 && num_executed_iterations != nullptr) { + num_executed_iterations[query_id] = iter + 1; + } + +#ifdef _CLK_BREAKDOWN + if ((threadIdx.x == 0 || threadIdx.x == blockDim.x - 1) && (blockIdx.x == 0) && + ((query_id * 3) % gridDim.y < 3)) { + printf( + "%s:%d " + "query, %d, thread, %d" + ", init, %lu" + ", 1st_distance, %lu" + ", topk, %lu" + ", pickup_parents, %lu" + ", distance, %lu" + ", hash, %lu" + "\n", + __FILE__, + __LINE__, + query_id, + threadIdx.x, + clk_init, + clk_compute_1st_distance, + clk_topk, + clk_pickup_parents, + clk_compute_actual_distance, + clk_compute_distance - clk_compute_actual_distance); + } +#endif +} + +template +RAFT_KERNEL set_value_batch_kernel(T* const dev_ptr, + const std::size_t ld, + const T val, + const std::size_t count, + const std::size_t batch_size) +{ + const auto tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= count * batch_size) { return; } + const auto batch_id = tid / count; + const auto elem_id = tid % count; + dev_ptr[elem_id + ld * batch_id] = val; +} + +template +void set_value_batch(T* const dev_ptr, + const std::size_t ld, + const T val, + const std::size_t count, + const std::size_t batch_size, + cudaStream_t cuda_stream) +{ + constexpr std::uint32_t block_size = 256; + const auto grid_size = (count * batch_size + block_size - 1) / block_size; + set_value_batch_kernel + <<>>(dev_ptr, ld, val, count, batch_size); +} + +template +struct search_kernel_config { + // Search kernel function type. Note that the actual values for the template value + // parameters do not matter, because they are not part of the function signature. 
The + // second to fourth value parameters will be selected by the choose_* functions below. + using kernel_t = decltype(&search_kernel<128, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>); + + static auto choose_buffer_size(unsigned result_buffer_size, unsigned block_size) -> kernel_t + { + if (result_buffer_size <= 64) { + return search_kernel<64, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>; + } else if (result_buffer_size <= 128) { + return search_kernel<128, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>; + } else if (result_buffer_size <= 256) { + return search_kernel<256, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>; + } + THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256); + } +}; + +template +void select_and_run(const dataset_descriptor_host& dataset_desc, + raft::device_matrix_view graph, + IndexT* topk_indices_ptr, // [num_queries, topk] + DistanceT* topk_distances_ptr, // [num_queries, topk] + const DataT* queries_ptr, // [num_queries, dataset_dim] + uint32_t num_queries, + const IndexT* dev_seed_ptr, // [num_queries, num_seeds] + uint32_t* num_executed_iterations, // [num_queries,] + const search_params& ps, + uint32_t topk, + // multi_cta_search (params struct) + uint32_t block_size, // + uint32_t result_buffer_size, + uint32_t smem_size, + uint32_t visited_hash_bitlen, + int64_t traversed_hash_bitlen, + IndexT* traversed_hashmap_ptr, + uint32_t num_cta_per_query, + uint32_t num_seeds, + SampleFilterT sample_filter, + cudaStream_t stream) +{ + auto kernel = + search_kernel_config, + SampleFilterT>::choose_buffer_size(result_buffer_size, block_size); + + RAFT_CUDA_TRY( + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + // Initialize hash table + const uint32_t traversed_hash_size = hashmap::get_size(traversed_hash_bitlen); + set_value_batch(traversed_hashmap_ptr, + traversed_hash_size, + ~static_cast(0), + traversed_hash_size, + num_queries, + stream); + + dim3 block_dims(block_size, 1, 1); + dim3 
grid_dims(num_cta_per_query, num_queries, 1); + RAFT_LOG_DEBUG("Launching kernel with %u threads, (%u, %u) blocks %u smem", + block_size, + num_cta_per_query, + num_queries, + smem_size); + + kernel<<>>(topk_indices_ptr, + topk_distances_ptr, + dataset_desc.dev_ptr(stream), + queries_ptr, + graph.data_handle(), + graph.extent(1), + ps.num_random_samplings, + ps.rand_xor_mask, + dev_seed_ptr, + num_seeds, + visited_hash_bitlen, + traversed_hashmap_ptr, + traversed_hash_bitlen, + ps.itopk_size, + ps.min_iterations, + ps.max_iterations, + num_executed_iterations, + sample_filter); +} + +} // namespace multi_cta_search +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 21c88fd607..93a1048bb3 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -66,8 +66,6 @@ namespace cuvs::neighbors::cagra::detail { namespace single_cta_search { -// #define _CLK_BREAKDOWN - template RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parents(std::uint32_t* const terminate_flag, INDEX_T* const next_parent_indices, @@ -581,13 +579,14 @@ __device__ void search_core( using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T; #ifdef _CLK_BREAKDOWN - std::uint64_t clk_init = 0; - std::uint64_t clk_compute_1st_distance = 0; - std::uint64_t clk_topk = 0; - std::uint64_t clk_reset_hash = 0; - std::uint64_t clk_pickup_parents = 0; - std::uint64_t clk_restore_hash = 0; - std::uint64_t clk_compute_distance = 0; + std::uint64_t clk_init = 0; + std::uint64_t clk_compute_1st_distance = 0; + std::uint64_t clk_topk = 0; + std::uint64_t clk_reset_hash = 0; + std::uint64_t clk_pickup_parents = 0; + std::uint64_t clk_restore_hash = 0; + std::uint64_t clk_compute_distance = 0; + std::uint64_t clk_compute_actual_distance = 0; std::uint64_t clk_start; #define _CLK_START() clk_start = clock64() #define _CLK_REC(V) V += clock64() - clk_start; @@ -788,7 +787,12 @@ __device__ void search_core( 0, parent_list_buffer, result_indices_buffer, - search_width); + search_width +#ifdef _CLK_BREAKDOWN + , + clk_compute_actual_distance +#endif + ); __syncthreads(); _CLK_REC(clk_compute_distance); @@ -945,6 +949,7 @@ __device__ void search_core( ", pickup_parents, %lu" ", restore_hash, %lu" ", distance, %lu" + ", hash, %lu" "\n", __FILE__, __LINE__, @@ -956,7 +961,8 @@ __device__ void search_core( clk_reset_hash, clk_pickup_parents, clk_restore_hash, - clk_compute_distance); + clk_compute_actual_distance, + clk_compute_distance - clk_compute_actual_distance); } #endif } From 3b3c20ba5c6cd3371b4152e32708f3d766849364 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Fri, 24 Oct 2025 08:19:07 +0900 Subject: [PATCH 017/119] Add fp8x8 --- .../cagra/compute_distance_vpq-impl.cuh | 86 ++++++++++++------- .../neighbors/detail/cagra/device_common.hpp | 5 ++ 
cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh | 20 +++++ 3 files changed, 81 insertions(+), 30 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index afb9d0bbcc..5f8e9850a6 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -28,10 +28,10 @@ namespace cuvs::neighbors::cagra::detail { #if 1 -using pq_val_pack_t = ivf_pq::detail::fp_8bit4<5, true, false>; +using pq_val_pack_t = ivf_pq::detail::fp_8bit8<5, true, false>; using pq_val_t = typename pq_val_pack_t::unit_t; -using pq_val_pack_uint_t = uint32_t; -constexpr uint32_t pq_val_pack_num_elements = 4; +using pq_val_pack_uint_t = uint64_t; +constexpr uint32_t pq_val_pack_num_elements = 8; #else using pq_val_t = half; using pq_val_pack_t = half2; @@ -205,17 +205,21 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS); if constexpr (std::is_same_v) { - pq_val_pack_t buf2; - buf2.x = r->pq_code_book_ptr()[i]; - buf2.y = r->pq_code_book_ptr()[i + 1]; - device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2); + // pq_val_pack_t buf2; + // buf2.x = r->pq_code_book_ptr()[i]; + // buf2.y = r->pq_code_book_ptr()[i + 1]; + // device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2); } else { - pq_val_pack_t buf4; - buf4.x = static_cast(static_cast(r->pq_code_book_ptr()[i])); - buf4.y = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); - buf4.z = static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); - buf4.w = static_cast(static_cast(r->pq_code_book_ptr()[i + 3])); - device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf4.as_u32()); + pq_val_pack_t buf8; + buf8.x0 = static_cast(static_cast(r->pq_code_book_ptr()[i])); + buf8.x1 = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); + buf8.x2 
= static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); + buf8.x3 = static_cast(static_cast(r->pq_code_book_ptr()[i + 3])); + buf8.x4 = static_cast(static_cast(r->pq_code_book_ptr()[i + 4])); + buf8.x5 = static_cast(static_cast(r->pq_code_book_ptr()[i + 5])); + buf8.x6 = static_cast(static_cast(r->pq_code_book_ptr()[i + 6])); + buf8.x7 = static_cast(static_cast(r->pq_code_book_ptr()[i + 7])); + device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf8.as_u64()); } } } @@ -322,43 +326,65 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( half2 q2; // if constexpr (false) { if constexpr (std::is_same_v) { - half2 c2; + // half2 c2; + //// Loading PQ code book from smem + // device::lds(c2, + // pq_codebook_ptr + sizeof(pq_val_pack_uint_t) * + // ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); + + //// Loading query vector from smem + // device::lds(q2, query_ptr + sizeof(half2) * d); + // half2 c2_ = c2; + //// L2 distance + // auto dist = q2 - c2_ - reinterpret_cast(vq_vals)[d1]; dist = dist * dist; norm += static_cast(dist.x + // + dist.y); + } else { + pq_val_pack_t c_vec; // Loading PQ code book from smem - device::lds(c2, + device::lds(c_vec.as_u64(), pq_codebook_ptr + sizeof(pq_val_pack_uint_t) * ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); + half2 c2_; + // Loading query vector from smem device::lds(q2, query_ptr + sizeof(half2) * d); - half2 c2_ = c2; + c2_.x = static_cast(c_vec.x0); + c2_.y = static_cast(c_vec.x1); // L2 distance auto dist = q2 - c2_ - reinterpret_cast(vq_vals)[d1]; dist = dist * dist; norm += static_cast(dist.x + dist.y); - } else { - pq_val_pack_t c2; - // Loading PQ code book from smem - device::lds(c2.as_u32(), - pq_codebook_ptr + sizeof(pq_val_pack_uint_t) * - ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); - half2 c2_; + d1 += 1; + d += kQueryBlock; - // Loading query vector from smem device::lds(q2, query_ptr + sizeof(half2) * d); - c2_.x = static_cast(c2.x); - c2_.y = static_cast(c2.y); + c2_.x = 
static_cast(c_vec.x2); + c2_.y = static_cast(c_vec.x3); // L2 distance - auto dist = q2 - c2_ - reinterpret_cast(vq_vals)[d1]; - dist = dist * dist; + dist = q2 - c2_ - reinterpret_cast(vq_vals)[d1]; + dist = dist * dist; + norm += static_cast(dist.x + dist.y); + + d1 += 1; + d += kQueryBlock; + + device::lds(q2, query_ptr + sizeof(half2) * d); + c2_.x = static_cast(c_vec.x4); + c2_.y = static_cast(c_vec.x5); + // L2 distance + dist = q2 - c2_ - reinterpret_cast(vq_vals)[d1]; + dist = dist * dist; norm += static_cast(dist.x + dist.y); d1 += 1; d += kQueryBlock; device::lds(q2, query_ptr + sizeof(half2) * d); - c2_.x = static_cast(c2.z); - c2_.y = static_cast(c2.w); + c2_.x = static_cast(c_vec.x6); + c2_.y = static_cast(c_vec.x7); // L2 distance dist = q2 - c2_ - reinterpret_cast(vq_vals)[d1]; dist = dist * dist; diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 9715a0473f..552164fc56 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -304,6 +304,11 @@ RAFT_DEVICE_INLINE_FUNCTION void lds(uint32_t& x, uint32_t addr) asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "r"(addr)); } +RAFT_DEVICE_INLINE_FUNCTION void lds(uint64_t& x, uint32_t addr) +{ + asm volatile("ld.shared.u64 {%0}, [%1];" : "=l"(x) : "r"(addr)); +} + RAFT_DEVICE_INLINE_FUNCTION void lds(uint32_t& x, const uint32_t* addr) { lds(x, uint32_t(__cvta_generic_to_shared(addr))); diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh index c4e1c08c01..73660f4fef 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh @@ -132,4 +132,24 @@ struct fp_8bit4<5, true, false> { HDI uint32_t as_u32() const { return *reinterpret_cast(this); } }; +template +struct fp_8bit8 { + using unit_t = fp_8bit; + unit_t x0, x1, x2, x3, x4, x5, x6, x7; + HDI fp_8bit8() : x0(0.f), x1(0.f), 
x2(0.f), x3(0.f), x4(0.f), x5(0.f), x6(0.f), x7(0.f) {} + + HDI uint64_t& as_u64() { return *reinterpret_cast(this); } + HDI uint64_t as_u64() const { return *reinterpret_cast(this); } +}; + +template <> +struct fp_8bit8<5, true, false> { + using unit_t = __nv_fp8_e5m2; + unit_t x0, x1, x2, x3, x4, x5, x6, x7; + HDI fp_8bit8() : x0(0.f), x1(0.f), x2(0.f), x3(0.f), x4(0.f), x5(0.f), x6(0.f), x7(0.f) {} + + HDI uint64_t& as_u64() { return *reinterpret_cast(this); } + HDI uint64_t as_u64() const { return *reinterpret_cast(this); } +}; + } // namespace cuvs::neighbors::ivf_pq::detail From bc572e03fb65dd60e77fd495e9e5ca5cfbba2586 Mon Sep 17 00:00:00 2001 From: Hiroyuki Ootomo Date: Fri, 24 Oct 2025 00:06:54 -0700 Subject: [PATCH 018/119] Fix a bug --- .../cagra/compute_distance_vpq-impl.cuh | 48 +++++++++++-------- .../neighbors/detail/cagra/device_common.hpp | 5 ++ 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 5f8e9850a6..3e2db41234 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -219,7 +219,7 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, buf8.x5 = static_cast(static_cast(r->pq_code_book_ptr()[i + 5])); buf8.x6 = static_cast(static_cast(r->pq_code_book_ptr()[i + 6])); buf8.x7 = static_cast(static_cast(r->pq_code_book_ptr()[i + 7])); - device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf8.as_u64()); + device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf8.as_u64()); } } } @@ -319,10 +319,13 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( if (PQ_LEN * (v + k) >= dim) break; #pragma unroll for (std::uint32_t m = 0; m < PQ_LEN / pq_val_pack_num_elements; m++) { - constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN); - 
std::uint32_t d1 = m * (pq_val_pack_num_elements / 2) + (PQ_LEN / 2) * v; - std::uint32_t d = - d1 * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId; + constexpr uint32_t vq_val_pack_num_elements = 2; + constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN); + std::uint32_t vq_half2_index = + m * (pq_val_pack_num_elements / vq_val_pack_num_elements) + + (PQ_LEN / vq_val_pack_num_elements) * v; + std::uint32_t query_val_index = + vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId; half2 q2; // if constexpr (false) { if constexpr (std::is_same_v) { @@ -345,48 +348,51 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( device::lds(c_vec.as_u64(), pq_codebook_ptr + sizeof(pq_val_pack_uint_t) * ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); - half2 c2_; // Loading query vector from smem - device::lds(q2, query_ptr + sizeof(half2) * d); + device::lds(q2, query_ptr + sizeof(half2) * query_val_index); c2_.x = static_cast(c_vec.x0); c2_.y = static_cast(c_vec.x1); // L2 distance - auto dist = q2 - c2_ - reinterpret_cast(vq_vals)[d1]; - dist = dist * dist; + auto dist = + q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; + dist = dist * dist; norm += static_cast(dist.x + dist.y); - d1 += 1; - d += kQueryBlock; + vq_half2_index += 1; + query_val_index += kQueryBlock; - device::lds(q2, query_ptr + sizeof(half2) * d); + device::lds(q2, query_ptr + sizeof(half2) * query_val_index); c2_.x = static_cast(c_vec.x2); c2_.y = static_cast(c_vec.x3); // L2 distance - dist = q2 - c2_ - reinterpret_cast(vq_vals)[d1]; + dist = + q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; dist = dist * dist; norm += static_cast(dist.x + dist.y); - d1 += 1; - d += kQueryBlock; + vq_half2_index += 1; + query_val_index += kQueryBlock; - device::lds(q2, query_ptr + sizeof(half2) * d); + device::lds(q2, query_ptr + sizeof(half2) * query_val_index); c2_.x = static_cast(c_vec.x4); c2_.y = static_cast(c_vec.x5); // L2 
distance - dist = q2 - c2_ - reinterpret_cast(vq_vals)[d1]; + dist = + q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; dist = dist * dist; norm += static_cast(dist.x + dist.y); - d1 += 1; - d += kQueryBlock; + vq_half2_index += 1; + query_val_index += kQueryBlock; - device::lds(q2, query_ptr + sizeof(half2) * d); + device::lds(q2, query_ptr + sizeof(half2) * query_val_index); c2_.x = static_cast(c_vec.x6); c2_.y = static_cast(c_vec.x7); // L2 distance - dist = q2 - c2_ - reinterpret_cast(vq_vals)[d1]; + dist = + q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; dist = dist * dist; norm += static_cast(dist.x + dist.y); } diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 552164fc56..83d4bdf161 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -331,6 +331,11 @@ RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const uint32_t& x) asm volatile("st.shared.u32 [%0], %1;" : : "r"(addr), "r"(reinterpret_cast(x))); } +RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const uint64_t& x) +{ + asm volatile("st.shared.u64 [%0], %1;" : : "r"(addr), "l"(reinterpret_cast(x))); +} + RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const half2& x) { asm volatile("st.shared.v2.u16 [%0], {%1, %2};" From ec275d47891ff83501bed1e074b36bfc44bada27 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Sat, 25 Oct 2025 08:29:51 +0900 Subject: [PATCH 019/119] Update 2, 4, 8 configs --- .../cagra/compute_distance_vpq-impl.cuh | 138 +++++++++++++----- cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh | 32 ++-- 2 files changed, 120 insertions(+), 50 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 3e2db41234..06d5fcd46b 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -27,17 +27,29 @@ namespace cuvs::neighbors::cagra::detail { -#if 1 -using pq_val_pack_t = ivf_pq::detail::fp_8bit8<5, true, false>; -using pq_val_t = typename pq_val_pack_t::unit_t; -using pq_val_pack_uint_t = uint64_t; -constexpr uint32_t pq_val_pack_num_elements = 8; -#else -using pq_val_t = half; -using pq_val_pack_t = half2; -using pq_val_pack_uint_t = uint32_t; -constexpr uint32_t pq_val_pack_num_elements = 2; -#endif +template +struct pq_val_type_t {}; +template <> +struct pq_val_type_t<2> { + using pq_val_pack_t = half2; + using pq_val_t = half; + using pq_val_pack_uint_t = uint32_t; + static constexpr uint32_t pq_val_pack_num_elements = 2; +}; +template <> +struct pq_val_type_t<4> { + using pq_val_pack_t = ivf_pq::detail::fp_8bit8<5, true, false>; + using pq_val_t = typename pq_val_pack_t::unit_t; + using pq_val_pack_uint_t = typename pq_val_pack_t::uint_t; + static constexpr uint32_t pq_val_pack_num_elements = pq_val_pack_t::num_elements; +}; +template <> +struct pq_val_type_t<8> { + using pq_val_pack_t = ivf_pq::detail::fp_8bit8<5, true, false>; + using pq_val_t = typename pq_val_pack_t::unit_t; + using pq_val_pack_uint_t = typename pq_val_pack_t::uint_t; + static constexpr uint32_t pq_val_pack_num_elements = pq_val_pack_t::num_elements; +}; template () / pq_val_pack_num_elements; + (1 << PQ_BITS) * PQ_LEN * utils::size_of::pq_val_pack_uint_t>() / + pq_val_type_t::pq_val_pack_num_elements; _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(setup_workspace_type* setup_workspace_impl, compute_distance_type* compute_distance_impl, @@ -169,6 +182,10 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; constexpr auto PQ_BITS = DescriptorT::kPqBits; constexpr auto PQ_LEN = DescriptorT::kPqLen; + using pq_val_config = pq_val_type_t; + using pq_val_t = typename pq_val_config::pq_val_t; + using pq_val_pack_uint_t 
= typename pq_val_config::pq_val_pack_uint_t; + using pq_val_pack_t = typename pq_val_config::pq_val_pack_t; auto* r = reinterpret_cast(smem_ptr); @@ -190,13 +207,14 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, __syncthreads(); // Copy PQ table - for (unsigned i = threadIdx.x * pq_val_pack_num_elements; i < (1 << PQ_BITS) * PQ_LEN; - i += blockDim.x * pq_val_pack_num_elements) { + for (unsigned i = threadIdx.x * pq_val_config::pq_val_pack_num_elements; + i < (1 << PQ_BITS) * PQ_LEN; + i += blockDim.x * pq_val_config::pq_val_pack_num_elements) { // Change the order of PQ code book array to reduce the // frequency of bank conflicts. constexpr auto num_elements_per_bank = - pq_val_pack_num_elements / - (utils::size_of() / utils::size_of()); + pq_val_config::pq_val_pack_num_elements / + (utils::size_of() / utils::size_of()); if constexpr (PQ_LEN >= num_elements_per_bank) { // safety constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank; @@ -204,12 +222,21 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, const auto smem_index = (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS); - if constexpr (std::is_same_v) { - // pq_val_pack_t buf2; - // buf2.x = r->pq_code_book_ptr()[i]; - // buf2.y = r->pq_code_book_ptr()[i + 1]; - // device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2); + if constexpr (PQ_LEN == 2) { + half2 buf2; + buf2.x = r->pq_code_book_ptr()[i]; + buf2.y = r->pq_code_book_ptr()[i + 1]; + device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2); + } else if constexpr (PQ_LEN == 4) { + using pq_val_pack_t = ivf_pq::detail::fp_8bit4<5, true, false>; + pq_val_pack_t buf4; + buf4.x = static_cast(static_cast(r->pq_code_book_ptr()[i])); + buf4.y = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); + buf4.z = static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); + buf4.w = static_cast(static_cast(r->pq_code_book_ptr()[i + 
3])); + device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf4.as_uint()); } else { + using pq_val_pack_t = ivf_pq::detail::fp_8bit8<5, true, false>; pq_val_pack_t buf8; buf8.x0 = static_cast(static_cast(r->pq_code_book_ptr()[i])); buf8.x1 = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); @@ -219,7 +246,7 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, buf8.x5 = static_cast(static_cast(r->pq_code_book_ptr()[i + 5])); buf8.x6 = static_cast(static_cast(r->pq_code_book_ptr()[i + 6])); buf8.x7 = static_cast(static_cast(r->pq_code_book_ptr()[i + 7])); - device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf8.as_u64()); + device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf8.as_uint()); } } } @@ -267,6 +294,12 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( constexpr auto PQ_LEN = DescriptorT::kPqLen; using PQ_CODEBOOK_LOAD_T = uint32_t; + using pq_val_config = pq_val_type_t; + using pq_val_t = typename pq_val_config::pq_val_t; + using pq_val_pack_uint_t = typename pq_val_config::pq_val_pack_uint_t; + using pq_val_pack_t = typename pq_val_config::pq_val_pack_t; + constexpr uint32_t pq_val_pack_num_elements = pq_val_config::pq_val_pack_num_elements; + const uint32_t query_ptr = pq_codebook_ptr + DescriptorT::kSMemCodeBookSizeInBytes; static_assert(PQ_BITS == 8, "Only pq_bits == 8 is supported at the moment."); constexpr uint32_t vlen = utils::size_of() / utils::size_of(); @@ -328,24 +361,53 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId; half2 q2; // if constexpr (false) { - if constexpr (std::is_same_v) { - // half2 c2; - //// Loading PQ code book from smem - // device::lds(c2, - // pq_codebook_ptr + sizeof(pq_val_pack_uint_t) * - // ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); - - //// Loading query vector from smem - // device::lds(q2, 
query_ptr + sizeof(half2) * d); - // half2 c2_ = c2; - //// L2 distance - // auto dist = q2 - c2_ - reinterpret_cast(vq_vals)[d1]; dist = dist * dist; norm += static_cast(dist.x - // + dist.y); - } else { + if constexpr (PQ_LEN == 2) { + pq_val_pack_t c2; + // Loading PQ code book from smem + device::lds(c2, + pq_codebook_ptr + sizeof(pq_val_pack_uint_t) * + ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); + + // Loading query vector from smem + device::lds(q2, query_ptr + sizeof(half2) * query_val_index); + // L2 distance + auto dist = + q2 - c2 - reinterpret_cast(vq_vals)[vq_half2_index]; + dist = dist * dist; + norm += static_cast(dist.x + dist.y); + } else if constexpr (PQ_LEN == 4) { + pq_val_pack_t c_vec; + // Loading PQ code book from smem + device::lds(c_vec.as_uint(), + pq_codebook_ptr + sizeof(pq_val_pack_uint_t) * + ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); + half2 c2_; + + // Loading query vector from smem + device::lds(q2, query_ptr + sizeof(half2) * query_val_index); + c2_.x = static_cast(c_vec.x); + c2_.y = static_cast(c_vec.y); + // L2 distance + auto dist = + q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; + dist = dist * dist; + norm += static_cast(dist.x + dist.y); + + vq_half2_index += 1; + query_val_index += kQueryBlock; + + device::lds(q2, query_ptr + sizeof(half2) * query_val_index); + c2_.x = static_cast(c_vec.z); + c2_.y = static_cast(c_vec.w); + // L2 distance + dist = + q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; + dist = dist * dist; + norm += static_cast(dist.x + dist.y); + } else if constexpr (PQ_LEN == 8) { pq_val_pack_t c_vec; // Loading PQ code book from smem - device::lds(c_vec.as_u64(), + device::lds(c_vec.as_uint(), pq_codebook_ptr + sizeof(pq_val_pack_uint_t) * ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); half2 c2_; diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh index 73660f4fef..10d817dfd3 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh +++ 
b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh @@ -114,42 +114,50 @@ struct fp_8bit { template struct fp_8bit4 { - using unit_t = fp_8bit; + using unit_t = fp_8bit; + using uint_t = uint32_t; + static constexpr uint32_t num_elements = 4; unit_t x, y, z, w; HDI fp_8bit4() : x(0.f), y(0.f), z(0.f), w(0.f) {} - HDI uint32_t& as_u32() { return *reinterpret_cast(this); } - HDI uint32_t as_u32() const { return *reinterpret_cast(this); } + HDI uint_t& as_uint() { return *reinterpret_cast(this); } + HDI uint_t as_uint() const { return *reinterpret_cast(this); } }; template <> struct fp_8bit4<5, true, false> { - using unit_t = __nv_fp8_e5m2; + using unit_t = __nv_fp8_e5m2; + using uint_t = uint32_t; + static constexpr uint32_t num_elements = 4; unit_t x, y, z, w; HDI fp_8bit4() : x(0.f), y(0.f), z(0.f), w(0.f) {} - HDI uint32_t& as_u32() { return *reinterpret_cast(this); } - HDI uint32_t as_u32() const { return *reinterpret_cast(this); } + HDI uint_t& as_uint() { return *reinterpret_cast(this); } + HDI uint_t as_uint() const { return *reinterpret_cast(this); } }; template struct fp_8bit8 { - using unit_t = fp_8bit; + using unit_t = fp_8bit; + using uint_t = uint64_t; + static constexpr uint32_t num_elements = 8; unit_t x0, x1, x2, x3, x4, x5, x6, x7; HDI fp_8bit8() : x0(0.f), x1(0.f), x2(0.f), x3(0.f), x4(0.f), x5(0.f), x6(0.f), x7(0.f) {} - HDI uint64_t& as_u64() { return *reinterpret_cast(this); } - HDI uint64_t as_u64() const { return *reinterpret_cast(this); } + HDI uint_t& as_uint() { return *reinterpret_cast(this); } + HDI uint_t as_uint() const { return *reinterpret_cast(this); } }; template <> struct fp_8bit8<5, true, false> { - using unit_t = __nv_fp8_e5m2; + using unit_t = __nv_fp8_e5m2; + using uint_t = uint64_t; + static constexpr uint32_t num_elements = 8; unit_t x0, x1, x2, x3, x4, x5, x6, x7; HDI fp_8bit8() : x0(0.f), x1(0.f), x2(0.f), x3(0.f), x4(0.f), x5(0.f), x6(0.f), x7(0.f) {} - HDI uint64_t& as_u64() { return *reinterpret_cast(this); } - HDI uint64_t 
as_u64() const { return *reinterpret_cast(this); } + HDI uint_t& as_uint() { return *reinterpret_cast(this); } + HDI uint_t as_uint() const { return *reinterpret_cast(this); } }; } // namespace cuvs::neighbors::ivf_pq::detail From a85d8a3af212d6092ecf6664617476985297e8ac Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Sun, 26 Oct 2025 10:11:24 +0900 Subject: [PATCH 020/119] Fix a bug --- .../neighbors/detail/cagra/compute_distance_vpq-impl.cuh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 06d5fcd46b..259c348aa2 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -38,7 +38,7 @@ struct pq_val_type_t<2> { }; template <> struct pq_val_type_t<4> { - using pq_val_pack_t = ivf_pq::detail::fp_8bit8<5, true, false>; + using pq_val_pack_t = ivf_pq::detail::fp_8bit4<5, true, false>; using pq_val_t = typename pq_val_pack_t::unit_t; using pq_val_pack_uint_t = typename pq_val_pack_t::uint_t; static constexpr uint32_t pq_val_pack_num_elements = pq_val_pack_t::num_elements; @@ -228,15 +228,13 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, buf2.y = r->pq_code_book_ptr()[i + 1]; device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2); } else if constexpr (PQ_LEN == 4) { - using pq_val_pack_t = ivf_pq::detail::fp_8bit4<5, true, false>; pq_val_pack_t buf4; buf4.x = static_cast(static_cast(r->pq_code_book_ptr()[i])); buf4.y = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); buf4.z = static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); buf4.w = static_cast(static_cast(r->pq_code_book_ptr()[i + 3])); device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf4.as_uint()); - } else { - using pq_val_pack_t = ivf_pq::detail::fp_8bit8<5, true, false>; + } else if constexpr (PQ_LEN == 8) 
{ pq_val_pack_t buf8; buf8.x0 = static_cast(static_cast(r->pq_code_book_ptr()[i])); buf8.x1 = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); From d1f628c74e149006b5e3e60a0838117aebce4006 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Mon, 27 Oct 2025 22:52:28 +0900 Subject: [PATCH 021/119] Add F8 query support --- .../cagra/compute_distance_vpq-impl.cuh | 157 ++++++++++++------ 1 file changed, 102 insertions(+), 55 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 259c348aa2..47fbcc0155 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -156,7 +156,9 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(dim, DatasetBlockDim) * sizeof(QUERY_T); + raft::round_up_safe(dim, DatasetBlockDim) * + utils::size_of::pq_val_pack_uint_t>() / + pq_val_type_t::pq_val_pack_num_elements; } }; @@ -176,16 +178,17 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, const typename DescriptorT::DATA_T* queries_ptr, uint32_t query_id) -> const DescriptorT* { - using QUERY_T = typename DescriptorT::QUERY_T; - using CODE_BOOK_T = typename DescriptorT::CODE_BOOK_T; - using word_type = uint32_t; - constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; - constexpr auto PQ_BITS = DescriptorT::kPqBits; - constexpr auto PQ_LEN = DescriptorT::kPqLen; - using pq_val_config = pq_val_type_t; - using pq_val_t = typename pq_val_config::pq_val_t; - using pq_val_pack_uint_t = typename pq_val_config::pq_val_pack_uint_t; - using pq_val_pack_t = typename pq_val_config::pq_val_pack_t; + using QUERY_T = typename DescriptorT::QUERY_T; + using CODE_BOOK_T = typename DescriptorT::CODE_BOOK_T; + using word_type = uint32_t; + constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; + constexpr auto PQ_BITS = DescriptorT::kPqBits; + constexpr auto 
PQ_LEN = DescriptorT::kPqLen; + using pq_val_config = pq_val_type_t; + using pq_val_t = typename pq_val_config::pq_val_t; + using pq_val_pack_uint_t = typename pq_val_config::pq_val_pack_uint_t; + using pq_val_pack_t = typename pq_val_config::pq_val_pack_t; + constexpr auto pq_val_pack_num_elements = pq_val_config::pq_val_pack_num_elements; auto* r = reinterpret_cast(smem_ptr); @@ -257,18 +260,59 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, auto smem_query_ptr = reinterpret_cast(reinterpret_cast(smem_ptr) + sizeof(DescriptorT) + DescriptorT::kSMemCodeBookSizeInBytes); - for (unsigned i = threadIdx.x * 2; i < dim; i += blockDim.x * 2) { - half2 buf2{0, 0}; - if (i < dim) { buf2.x = mapping(queries_ptr[i]); } - if (i + 1 < dim) { buf2.y = mapping(queries_ptr[i + 1]); } - if constexpr ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) { + for (unsigned i = threadIdx.x * pq_val_config::pq_val_pack_num_elements; i < dim; + i += blockDim.x * pq_val_config::pq_val_pack_num_elements) { + pq_val_pack_t buf; + if constexpr (PQ_LEN == 2) { + if (i < dim) { static_cast(static_cast(buf.x = mapping(queries_ptr[i]))); } + if (i + 1 < dim) { + static_cast(static_cast(buf.y = mapping(queries_ptr[i + 1]))); + } + } else if constexpr (PQ_LEN == 4) { + if (i < dim) { buf.x = static_cast(static_cast(mapping(queries_ptr[i]))); } + if (i + 1 < dim) { + buf.y = static_cast(static_cast(mapping(queries_ptr[i + 1]))); + } + if (i + 2 < dim) { + buf.z = static_cast(static_cast(mapping(queries_ptr[i + 2]))); + } + if (i + 3 < dim) { + buf.w = static_cast(static_cast(mapping(queries_ptr[i + 3]))); + } + } else if constexpr (PQ_LEN == 8) { + if (i < dim) { buf.x0 = static_cast(static_cast(mapping(queries_ptr[i]))); } + if (i + 1 < dim) { + buf.x1 = static_cast(static_cast(mapping(queries_ptr[i + 1]))); + } + if (i + 2 < dim) { + buf.x2 = static_cast(static_cast(mapping(queries_ptr[i + 2]))); + } + if (i + 3 < dim) { + buf.x3 = static_cast(static_cast(mapping(queries_ptr[i + 
3]))); + } + if (i + 4 < dim) { + buf.x4 = static_cast(static_cast(mapping(queries_ptr[i + 4]))); + } + if (i + 5 < dim) { + buf.x5 = static_cast(static_cast(mapping(queries_ptr[i + 5]))); + } + if (i + 6 < dim) { + buf.x6 = static_cast(static_cast(mapping(queries_ptr[i + 6]))); + } + if (i + 7 < dim) { + buf.x7 = static_cast(static_cast(mapping(queries_ptr[i + 7]))); + } + } + + if constexpr ((PQ_BITS == 8) && (PQ_LEN % pq_val_pack_num_elements == 0)) { // Transpose the queries buffer to avoid bank conflicts in compute_distance. constexpr uint32_t vlen = 4; // **** DO NOT CHANGE **** - constexpr auto kStride = vlen * PQ_LEN / 2; - reinterpret_cast(smem_query_ptr)[transpose(i / 2)] = - buf2; + constexpr auto kStride = vlen * PQ_LEN / pq_val_pack_num_elements; + reinterpret_cast( + smem_query_ptr)[transpose( + i / pq_val_pack_num_elements)] = buf; } else { - (reinterpret_cast(smem_query_ptr + i))[0] = buf2; + (reinterpret_cast(smem_query_ptr + i))[0] = buf; } } @@ -355,104 +399,107 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( std::uint32_t vq_half2_index = m * (pq_val_pack_num_elements / vq_val_pack_num_elements) + (PQ_LEN / vq_val_pack_num_elements) * v; - std::uint32_t query_val_index = - vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId; - half2 q2; + std::uint32_t query_val_index = vq_half2_index * kQueryBlock + + elem_offset * (PQ_LEN / pq_val_pack_num_elements) + + e * TeamSize + laneId; // Index in pack_t // if constexpr (false) { if constexpr (PQ_LEN == 2) { - pq_val_pack_t c2; + pq_val_pack_t c2, q2; // Loading PQ code book from smem device::lds(c2, pq_codebook_ptr + sizeof(pq_val_pack_uint_t) * ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); // Loading query vector from smem - device::lds(q2, query_ptr + sizeof(half2) * query_val_index); + device::lds(q2, query_ptr + sizeof(pq_val_pack_t) * query_val_index); // L2 distance auto dist = q2 - c2 - reinterpret_cast(vq_vals)[vq_half2_index]; dist = 
dist * dist; norm += static_cast(dist.x + dist.y); } else if constexpr (PQ_LEN == 4) { - pq_val_pack_t c_vec; + pq_val_pack_t c_vec, q_vec; // Loading PQ code book from smem device::lds(c_vec.as_uint(), pq_codebook_ptr + sizeof(pq_val_pack_uint_t) * ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); - half2 c2_; + device::lds(q_vec.as_uint(), + query_ptr + sizeof(pq_val_pack_uint_t) * query_val_index); + + half2 c2_, q2_; // Loading query vector from smem - device::lds(q2, query_ptr + sizeof(half2) * query_val_index); c2_.x = static_cast(c_vec.x); c2_.y = static_cast(c_vec.y); + q2_.x = static_cast(q_vec.x); + q2_.y = static_cast(q_vec.y); // L2 distance - auto dist = - q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; + auto dist = q2_ - c2_ - + reinterpret_cast(vq_vals)[vq_half2_index]; dist = dist * dist; norm += static_cast(dist.x + dist.y); vq_half2_index += 1; - query_val_index += kQueryBlock; - - device::lds(q2, query_ptr + sizeof(half2) * query_val_index); c2_.x = static_cast(c_vec.z); c2_.y = static_cast(c_vec.w); + q2_.x = static_cast(q_vec.z); + q2_.y = static_cast(q_vec.w); // L2 distance - dist = - q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; + dist = q2_ - c2_ - + reinterpret_cast(vq_vals)[vq_half2_index]; dist = dist * dist; norm += static_cast(dist.x + dist.y); } else if constexpr (PQ_LEN == 8) { - pq_val_pack_t c_vec; + pq_val_pack_t c_vec, q_vec; // Loading PQ code book from smem device::lds(c_vec.as_uint(), pq_codebook_ptr + sizeof(pq_val_pack_uint_t) * ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); - half2 c2_; + device::lds(q_vec.as_uint(), + query_ptr + sizeof(pq_val_pack_uint_t) * query_val_index); + half2 c2_, q2_; // Loading query vector from smem - device::lds(q2, query_ptr + sizeof(half2) * query_val_index); c2_.x = static_cast(c_vec.x0); c2_.y = static_cast(c_vec.x1); + q2_.x = static_cast(q_vec.x0); + q2_.y = static_cast(q_vec.x1); // L2 distance - auto dist = - q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; + auto dist = q2_ 
- c2_ - + reinterpret_cast(vq_vals)[vq_half2_index]; dist = dist * dist; norm += static_cast(dist.x + dist.y); vq_half2_index += 1; - query_val_index += kQueryBlock; - - device::lds(q2, query_ptr + sizeof(half2) * query_val_index); c2_.x = static_cast(c_vec.x2); c2_.y = static_cast(c_vec.x3); + q2_.x = static_cast(q_vec.x2); + q2_.y = static_cast(q_vec.x3); // L2 distance - dist = - q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; + dist = q2_ - c2_ - + reinterpret_cast(vq_vals)[vq_half2_index]; dist = dist * dist; norm += static_cast(dist.x + dist.y); vq_half2_index += 1; - query_val_index += kQueryBlock; - - device::lds(q2, query_ptr + sizeof(half2) * query_val_index); c2_.x = static_cast(c_vec.x4); c2_.y = static_cast(c_vec.x5); + q2_.x = static_cast(q_vec.x4); + q2_.y = static_cast(q_vec.x5); // L2 distance - dist = - q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; + dist = q2_ - c2_ - + reinterpret_cast(vq_vals)[vq_half2_index]; dist = dist * dist; norm += static_cast(dist.x + dist.y); vq_half2_index += 1; - query_val_index += kQueryBlock; - - device::lds(q2, query_ptr + sizeof(half2) * query_val_index); c2_.x = static_cast(c_vec.x6); c2_.y = static_cast(c_vec.x7); + q2_.x = static_cast(q_vec.x6); + q2_.y = static_cast(q_vec.x7); // L2 distance - dist = - q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; + dist = q2_ - c2_ - + reinterpret_cast(vq_vals)[vq_half2_index]; dist = dist * dist; norm += static_cast(dist.x + dist.y); } From e05dfb00d2faba492d6d84f82679155d4767a2c3 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Thu, 30 Oct 2025 22:55:18 +0900 Subject: [PATCH 022/119] Fix query vec id calc --- .../cagra/compute_distance_vpq-impl.cuh | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 47fbcc0155..8c68b00e99 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh 
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -258,10 +258,10 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, constexpr cuvs::spatial::knn::detail::utils::mapping mapping{}; auto smem_query_ptr = - reinterpret_cast(reinterpret_cast(smem_ptr) + sizeof(DescriptorT) + - DescriptorT::kSMemCodeBookSizeInBytes); - for (unsigned i = threadIdx.x * pq_val_config::pq_val_pack_num_elements; i < dim; - i += blockDim.x * pq_val_config::pq_val_pack_num_elements) { + reinterpret_cast(reinterpret_cast(smem_ptr) + sizeof(DescriptorT) + + DescriptorT::kSMemCodeBookSizeInBytes); + for (unsigned i = threadIdx.x * pq_val_pack_num_elements; i < dim; + i += blockDim.x * pq_val_pack_num_elements) { pq_val_pack_t buf; if constexpr (PQ_LEN == 2) { if (i < dim) { static_cast(static_cast(buf.x = mapping(queries_ptr[i]))); } @@ -399,10 +399,14 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( std::uint32_t vq_half2_index = m * (pq_val_pack_num_elements / vq_val_pack_num_elements) + (PQ_LEN / vq_val_pack_num_elements) * v; - std::uint32_t query_val_index = vq_half2_index * kQueryBlock + - elem_offset * (PQ_LEN / pq_val_pack_num_elements) + - e * TeamSize + laneId; // Index in pack_t - // if constexpr (false) { + const uint32_t query_vec_element_id = + (elem_offset + e * vlen * TeamSize + v + laneId * vlen) * PQ_LEN; + + constexpr auto kStride = vlen * PQ_LEN / pq_val_pack_num_elements; + const auto query_val_index = + transpose( + query_vec_element_id / pq_val_pack_num_elements); + if constexpr (PQ_LEN == 2) { pq_val_pack_t c2, q2; // Loading PQ code book from smem From 5f2b78f57dfd3bbf5d23f26c3b67f20ea288e892 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Fri, 31 Oct 2025 01:18:31 +0900 Subject: [PATCH 023/119] Improve performance --- .../detail/cagra/compute_distance_vpq-impl.cuh | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 8c68b00e99..dbe1e7c3c3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -399,13 +399,19 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( std::uint32_t vq_half2_index = m * (pq_val_pack_num_elements / vq_val_pack_num_elements) + (PQ_LEN / vq_val_pack_num_elements) * v; - const uint32_t query_vec_element_id = - (elem_offset + e * vlen * TeamSize + v + laneId * vlen) * PQ_LEN; - constexpr auto kStride = vlen * PQ_LEN / pq_val_pack_num_elements; - const auto query_val_index = - transpose( - query_vec_element_id / pq_val_pack_num_elements); + uint32_t query_val_index; + if constexpr (PQ_LEN == 2) { + query_val_index = + vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId; + } else { + const uint32_t query_vec_element_id = + (elem_offset + e * vlen * TeamSize + v + laneId * vlen) * PQ_LEN / + pq_val_pack_num_elements; + constexpr auto kStride = vlen * PQ_LEN / pq_val_pack_num_elements; + query_val_index = transpose( + query_vec_element_id); + } if constexpr (PQ_LEN == 2) { pq_val_pack_t c2, q2; From 484f9e6279ec6a784ce6c3e4e8816d822634616b Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Fri, 31 Oct 2025 12:29:49 +0900 Subject: [PATCH 024/119] Improve performance --- .../cagra/compute_distance_vpq-impl.cuh | 42 ++++++------- cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh | 62 ++++++++++--------- 2 files changed, 51 insertions(+), 53 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 259c348aa2..448f4c3252 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -229,21 +229,21 @@ _RAFT_DEVICE __noinline__ auto 
setup_workspace_vpq(const DescriptorT* that, device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2); } else if constexpr (PQ_LEN == 4) { pq_val_pack_t buf4; - buf4.x = static_cast(static_cast(r->pq_code_book_ptr()[i])); - buf4.y = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); - buf4.z = static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); - buf4.w = static_cast(static_cast(r->pq_code_book_ptr()[i + 3])); + buf4.data.x1[0] = static_cast(static_cast(r->pq_code_book_ptr()[i])); + buf4.data.x1[1] = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); + buf4.data.x1[2] = static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); + buf4.data.x1[3] = static_cast(static_cast(r->pq_code_book_ptr()[i + 3])); device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf4.as_uint()); } else if constexpr (PQ_LEN == 8) { pq_val_pack_t buf8; - buf8.x0 = static_cast(static_cast(r->pq_code_book_ptr()[i])); - buf8.x1 = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); - buf8.x2 = static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); - buf8.x3 = static_cast(static_cast(r->pq_code_book_ptr()[i + 3])); - buf8.x4 = static_cast(static_cast(r->pq_code_book_ptr()[i + 4])); - buf8.x5 = static_cast(static_cast(r->pq_code_book_ptr()[i + 5])); - buf8.x6 = static_cast(static_cast(r->pq_code_book_ptr()[i + 6])); - buf8.x7 = static_cast(static_cast(r->pq_code_book_ptr()[i + 7])); + buf8.data.x1[0] = static_cast(static_cast(r->pq_code_book_ptr()[i])); + buf8.data.x1[1] = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); + buf8.data.x1[2] = static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); + buf8.data.x1[3] = static_cast(static_cast(r->pq_code_book_ptr()[i + 3])); + buf8.data.x1[4] = static_cast(static_cast(r->pq_code_book_ptr()[i + 4])); + buf8.data.x1[5] = static_cast(static_cast(r->pq_code_book_ptr()[i + 5])); + buf8.data.x1[6] = static_cast(static_cast(r->pq_code_book_ptr()[i + 6])); + buf8.data.x1[7] = 
static_cast(static_cast(r->pq_code_book_ptr()[i + 7])); device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf8.as_uint()); } } @@ -383,8 +383,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( // Loading query vector from smem device::lds(q2, query_ptr + sizeof(half2) * query_val_index); - c2_.x = static_cast(c_vec.x); - c2_.y = static_cast(c_vec.y); + c2_ = c_vec.as_half2(0); // L2 distance auto dist = q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; @@ -395,8 +394,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( query_val_index += kQueryBlock; device::lds(q2, query_ptr + sizeof(half2) * query_val_index); - c2_.x = static_cast(c_vec.z); - c2_.y = static_cast(c_vec.w); + c2_ = c_vec.as_half2(1); // L2 distance dist = q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; @@ -412,8 +410,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( // Loading query vector from smem device::lds(q2, query_ptr + sizeof(half2) * query_val_index); - c2_.x = static_cast(c_vec.x0); - c2_.y = static_cast(c_vec.x1); + c2_ = c_vec.as_half2(0); // L2 distance auto dist = q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; @@ -424,8 +421,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( query_val_index += kQueryBlock; device::lds(q2, query_ptr + sizeof(half2) * query_val_index); - c2_.x = static_cast(c_vec.x2); - c2_.y = static_cast(c_vec.x3); + c2_ = c_vec.as_half2(1); // L2 distance dist = q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; @@ -436,8 +432,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( query_val_index += kQueryBlock; device::lds(q2, query_ptr + sizeof(half2) * query_val_index); - c2_.x = static_cast(c_vec.x4); - c2_.y = static_cast(c_vec.x5); + c2_ = c_vec.as_half2(2); // L2 distance dist = q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; @@ -448,8 +443,7 @@ _RAFT_DEVICE 
RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( query_val_index += kQueryBlock; device::lds(q2, query_ptr + sizeof(half2) * query_val_index); - c2_.x = static_cast(c_vec.x6); - c2_.y = static_cast(c_vec.x7); + c2_ = c_vec.as_half2(3); // L2 distance dist = q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh index 10d817dfd3..78c8db6fff 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh @@ -112,52 +112,56 @@ struct fp_8bit { } }; -template -struct fp_8bit4 { - using unit_t = fp_8bit; - using uint_t = uint32_t; - static constexpr uint32_t num_elements = 4; - unit_t x, y, z, w; - HDI fp_8bit4() : x(0.f), y(0.f), z(0.f), w(0.f) {} - - HDI uint_t& as_uint() { return *reinterpret_cast(this); } - HDI uint_t as_uint() const { return *reinterpret_cast(this); } -}; +template +struct fp_8bit4 {}; template <> struct fp_8bit4<5, true, false> { using unit_t = __nv_fp8_e5m2; + using x2_t = __nv_fp8x2_storage_t; using uint_t = uint32_t; static constexpr uint32_t num_elements = 4; - unit_t x, y, z, w; - HDI fp_8bit4() : x(0.f), y(0.f), z(0.f), w(0.f) {} - HDI uint_t& as_uint() { return *reinterpret_cast(this); } - HDI uint_t as_uint() const { return *reinterpret_cast(this); } -}; + union { + unit_t x1[4]; + x2_t x2[2]; + uint_t u; + } data; -template -struct fp_8bit8 { - using unit_t = fp_8bit; - using uint_t = uint64_t; - static constexpr uint32_t num_elements = 8; - unit_t x0, x1, x2, x3, x4, x5, x6, x7; - HDI fp_8bit8() : x0(0.f), x1(0.f), x2(0.f), x3(0.f), x4(0.f), x5(0.f), x6(0.f), x7(0.f) {} + HDI fp_8bit4() { data.u = 0; } - HDI uint_t& as_uint() { return *reinterpret_cast(this); } - HDI uint_t as_uint() const { return *reinterpret_cast(this); } + HDI uint_t& as_uint() { return data.u; } + HDI uint_t as_uint() const { return data.u; } + HDI half2 as_half2(const uint32_t i) const + { + return 
__nv_cvt_fp8x2_to_halfraw2(data.x2[i], __NV_E5M2); + } }; +template +struct fp_8bit8 {}; + template <> struct fp_8bit8<5, true, false> { using unit_t = __nv_fp8_e5m2; + using x2_t = __nv_fp8x2_storage_t; using uint_t = uint64_t; static constexpr uint32_t num_elements = 8; - unit_t x0, x1, x2, x3, x4, x5, x6, x7; - HDI fp_8bit8() : x0(0.f), x1(0.f), x2(0.f), x3(0.f), x4(0.f), x5(0.f), x6(0.f), x7(0.f) {} - HDI uint_t& as_uint() { return *reinterpret_cast(this); } - HDI uint_t as_uint() const { return *reinterpret_cast(this); } + union { + unit_t x1[8]; + x2_t x2[4]; + uint_t u; + } data; + + HDI fp_8bit8() { data.u = 0; } + + HDI uint_t& as_uint() { return data.u; } + HDI uint_t as_uint() const { return data.u; } + HDI half2 as_half2(const uint32_t i) const + { + return __nv_cvt_fp8x2_to_halfraw2(data.x2[i], __NV_E5M2); + } }; } // namespace cuvs::neighbors::ivf_pq::detail From 6bcb0e6d83a073d084760543170722ab8970d29a Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Sun, 2 Nov 2025 13:05:02 +0900 Subject: [PATCH 025/119] Fix template switch --- .../detail/cagra/compute_distance_vpq-impl.cuh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 2333f2e905..b7bcd9e5f2 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -214,19 +214,19 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, const auto smem_index = (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS); - if constexpr (PQ_LEN == 2) { + if constexpr (pq_val_config::pq_val_pack_num_elements == 2) { half2 buf2; buf2.x = r->pq_code_book_ptr()[i]; buf2.y = r->pq_code_book_ptr()[i + 1]; device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2); - } else if constexpr (PQ_LEN == 4) { + } else if constexpr 
(pq_val_config::pq_val_pack_num_elements == 4) { pq_val_pack_t buf4; buf4.data.x1[0] = static_cast(static_cast(r->pq_code_book_ptr()[i])); buf4.data.x1[1] = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); buf4.data.x1[2] = static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); buf4.data.x1[3] = static_cast(static_cast(r->pq_code_book_ptr()[i + 3])); device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf4.as_uint()); - } else if constexpr (PQ_LEN == 8) { + } else if constexpr (pq_val_config::pq_val_pack_num_elements == 8) { pq_val_pack_t buf8; buf8.data.x1[0] = static_cast(static_cast(r->pq_code_book_ptr()[i])); buf8.data.x1[1] = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); @@ -252,12 +252,12 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, for (unsigned i = threadIdx.x * pq_val_pack_num_elements; i < dim; i += blockDim.x * pq_val_pack_num_elements) { pq_val_pack_t buf; - if constexpr (PQ_LEN == 2) { + if constexpr (pq_val_config::pq_val_pack_num_elements == 2) { if (i < dim) { static_cast(static_cast(buf.x = mapping(queries_ptr[i]))); } if (i + 1 < dim) { static_cast(static_cast(buf.y = mapping(queries_ptr[i + 1]))); } - } else if constexpr (PQ_LEN == 4) { + } else if constexpr (pq_val_config::pq_val_pack_num_elements == 4) { if (i < dim) { buf.data.x1[0] = static_cast(static_cast(mapping(queries_ptr[i]))); } @@ -270,7 +270,7 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, if (i + 3 < dim) { buf.data.x1[3] = static_cast(static_cast(mapping(queries_ptr[i + 3]))); } - } else if constexpr (PQ_LEN == 8) { + } else if constexpr (pq_val_config::pq_val_pack_num_elements == 8) { if (i < dim) { buf.data.x1[0] = static_cast(static_cast(mapping(queries_ptr[i]))); } @@ -406,7 +406,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( query_vec_element_id); } - if constexpr (PQ_LEN == 2) { + if constexpr (pq_val_pack_num_elements == 2) { pq_val_pack_t c2, 
q2; // Loading PQ code book from smem device::lds(c2, @@ -420,7 +420,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( q2 - c2 - reinterpret_cast(vq_vals)[vq_half2_index]; dist = dist * dist; norm += static_cast(dist.x + dist.y); - } else if constexpr (PQ_LEN == 4) { + } else if constexpr (pq_val_pack_num_elements == 4) { pq_val_pack_t c_vec, q_vec; // Loading PQ code book from smem device::lds(c_vec.as_uint(), @@ -448,7 +448,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( reinterpret_cast(vq_vals)[vq_half2_index]; dist = dist * dist; norm += static_cast(dist.x + dist.y); - } else if constexpr (PQ_LEN == 8) { + } else if constexpr (pq_val_pack_num_elements == 8) { pq_val_pack_t c_vec, q_vec; // Loading PQ code book from smem device::lds(c_vec.as_uint(), From 581eba1ea34ae8855c8bbf38ab03532201710f48 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Sun, 2 Nov 2025 15:38:53 +0900 Subject: [PATCH 026/119] Fix pq_val_config --- .../cagra/compute_distance_vpq-impl.cuh | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 48b49073d1..e9e536bbfb 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -17,9 +17,7 @@ namespace cuvs::neighbors::cagra::detail { template -struct pq_val_type_t {}; -template <> -struct pq_val_type_t<2> { +struct pq_val_type_t { using pq_val_pack_t = half2; using pq_val_t = half; using pq_val_pack_uint_t = uint32_t; @@ -165,16 +163,17 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, const typename DescriptorT::DATA_T* queries_ptr, uint32_t query_id) -> const DescriptorT* { - using QUERY_T = typename DescriptorT::QUERY_T; - using CODE_BOOK_T = typename DescriptorT::CODE_BOOK_T; - using word_type = uint32_t; - 
constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; - constexpr auto PQ_BITS = DescriptorT::kPqBits; - constexpr auto PQ_LEN = DescriptorT::kPqLen; - using pq_val_config = pq_val_type_t; - using pq_val_t = typename pq_val_config::pq_val_t; - using pq_val_pack_uint_t = typename pq_val_config::pq_val_pack_uint_t; - using pq_val_pack_t = typename pq_val_config::pq_val_pack_t; + using QUERY_T = typename DescriptorT::QUERY_T; + using CODE_BOOK_T = typename DescriptorT::CODE_BOOK_T; + using word_type = uint32_t; + constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; + constexpr auto PQ_BITS = DescriptorT::kPqBits; + constexpr auto PQ_LEN = DescriptorT::kPqLen; + using pq_val_config = pq_val_type_t; + using pq_val_t = typename pq_val_config::pq_val_t; + using pq_val_pack_uint_t = typename pq_val_config::pq_val_pack_uint_t; + using pq_val_pack_t = typename pq_val_config::pq_val_pack_t; + constexpr auto pq_val_pack_num_elements = pq_val_config::pq_val_pack_num_elements; auto* r = reinterpret_cast(smem_ptr); @@ -211,19 +210,19 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, const auto smem_index = (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS); - if constexpr (PQ_LEN == 2) { + if constexpr (pq_val_pack_num_elements == 2) { half2 buf2; buf2.x = r->pq_code_book_ptr()[i]; buf2.y = r->pq_code_book_ptr()[i + 1]; device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2); - } else if constexpr (PQ_LEN == 4) { + } else if constexpr (pq_val_pack_num_elements == 4) { pq_val_pack_t buf4; buf4.data.x1[0] = static_cast(static_cast(r->pq_code_book_ptr()[i])); buf4.data.x1[1] = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); buf4.data.x1[2] = static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); buf4.data.x1[3] = static_cast(static_cast(r->pq_code_book_ptr()[i + 3])); device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf4.as_uint()); - } else if constexpr (PQ_LEN == 
8) { + } else if constexpr (pq_val_pack_num_elements == 8) { pq_val_pack_t buf8; buf8.data.x1[0] = static_cast(static_cast(r->pq_code_book_ptr()[i])); buf8.data.x1[1] = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); @@ -348,7 +347,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId; half2 q2; // if constexpr (false) { - if constexpr (PQ_LEN == 2) { + if constexpr (pq_val_pack_num_elements == 2) { pq_val_pack_t c2; // Loading PQ code book from smem device::lds(c2, @@ -362,7 +361,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( q2 - c2 - reinterpret_cast(vq_vals)[vq_half2_index]; dist = dist * dist; norm += static_cast(dist.x + dist.y); - } else if constexpr (PQ_LEN == 4) { + } else if constexpr (pq_val_pack_num_elements == 4) { pq_val_pack_t c_vec; // Loading PQ code book from smem device::lds(c_vec.as_uint(), @@ -389,7 +388,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( q2 - c2_ - reinterpret_cast(vq_vals)[vq_half2_index]; dist = dist * dist; norm += static_cast(dist.x + dist.y); - } else if constexpr (PQ_LEN == 8) { + } else if constexpr (pq_val_pack_num_elements == 8) { pq_val_pack_t c_vec; // Loading PQ code book from smem device::lds(c_vec.as_uint(), From a088cfd3a40df04cb53c1a516a0b8c2a0f043e51 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Mon, 3 Nov 2025 14:52:26 +0900 Subject: [PATCH 027/119] Improve smem index calculation --- cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index c63476f3c3..0a3cd9e4df 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -395,6 +395,10 @@ _RAFT_DEVICE 
RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( if constexpr (PQ_LEN == 2) { query_val_index = vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId; + } else if constexpr (PQ_LEN == pq_val_pack_num_elements) { + query_val_index = elem_offset + + v * (DatasetBlockDim / (pq_val_pack_num_elements * vlen)) + + e * TeamSize + laneId; } else { const uint32_t query_vec_element_id = (elem_offset + e * vlen * TeamSize + v + laneId * vlen) * PQ_LEN / From 12809f3633bdc948facaa1c5aea7023c7e04bbb7 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Tue, 11 Nov 2025 12:27:40 +0900 Subject: [PATCH 028/119] Update fp8 pack dtype --- .../cagra/compute_distance_vpq-impl.cuh | 234 +++++++++--------- .../neighbors/detail/cagra/device_common.hpp | 37 +++ cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh | 53 ---- 3 files changed, 155 insertions(+), 169 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 0a3cd9e4df..d626fbb422 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -5,7 +5,6 @@ #pragma once -#include "../../ivf_pq/ivf_pq_fp_8bit.cuh" #include "compute_distance_vpq.hpp" #include @@ -17,25 +16,18 @@ namespace cuvs::neighbors::cagra::detail { template -struct pq_val_type_t { - using pq_val_pack_t = half2; - using pq_val_t = half; - using pq_val_pack_uint_t = uint32_t; - static constexpr uint32_t pq_val_pack_num_elements = 2; +struct smem_val_type_t { + using smem_val_pack_t = device::fp8xN; + using smem_val_t = typename smem_val_pack_t::unit_t; + using smem_val_pack_uint_t = typename smem_val_pack_t::uint_t; + static constexpr uint32_t num_packed_elements = smem_val_pack_t::num_elements; }; template <> -struct pq_val_type_t<4> { - using pq_val_pack_t = ivf_pq::detail::fp_8bit4<5, true, false>; - using pq_val_t = typename pq_val_pack_t::unit_t; - using 
pq_val_pack_uint_t = typename pq_val_pack_t::uint_t; - static constexpr uint32_t pq_val_pack_num_elements = pq_val_pack_t::num_elements; -}; -template <> -struct pq_val_type_t<8> { - using pq_val_pack_t = ivf_pq::detail::fp_8bit8<5, true, false>; - using pq_val_t = typename pq_val_pack_t::unit_t; - using pq_val_pack_uint_t = typename pq_val_pack_t::uint_t; - static constexpr uint32_t pq_val_pack_num_elements = pq_val_pack_t::num_elements; +struct smem_val_type_t<2> { + using smem_val_pack_t = half2; + using smem_val_t = half; + using smem_val_pack_uint_t = uint32_t; + static constexpr uint32_t num_packed_elements = 2; }; template ::pq_val_pack_uint_t>() / - pq_val_type_t::pq_val_pack_num_elements; + (1 << PQ_BITS) * PQ_LEN * + utils::size_of::smem_val_pack_uint_t>() / + smem_val_type_t::num_packed_elements; _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(setup_workspace_type* setup_workspace_impl, compute_distance_type* compute_distance_impl, @@ -144,8 +137,8 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(dim, DatasetBlockDim) * - utils::size_of::pq_val_pack_uint_t>() / - pq_val_type_t::pq_val_pack_num_elements; + utils::size_of::smem_val_pack_uint_t>() / + smem_val_type_t::num_packed_elements; } }; @@ -165,17 +158,17 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, const typename DescriptorT::DATA_T* queries_ptr, uint32_t query_id) -> const DescriptorT* { - using QUERY_T = typename DescriptorT::QUERY_T; - using CODE_BOOK_T = typename DescriptorT::CODE_BOOK_T; - using word_type = uint32_t; - constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; - constexpr auto PQ_BITS = DescriptorT::kPqBits; - constexpr auto PQ_LEN = DescriptorT::kPqLen; - using pq_val_config = pq_val_type_t; - using pq_val_t = typename pq_val_config::pq_val_t; - using pq_val_pack_uint_t = typename pq_val_config::pq_val_pack_uint_t; - using pq_val_pack_t = typename pq_val_config::pq_val_pack_t; - constexpr auto 
pq_val_pack_num_elements = pq_val_config::pq_val_pack_num_elements; + using QUERY_T = typename DescriptorT::QUERY_T; + using CODE_BOOK_T = typename DescriptorT::CODE_BOOK_T; + using word_type = uint32_t; + constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; + constexpr auto PQ_BITS = DescriptorT::kPqBits; + constexpr auto PQ_LEN = DescriptorT::kPqLen; + using smem_val_config = smem_val_type_t; + using smem_val_t = typename smem_val_config::smem_val_t; + using smem_val_pack_uint_t = typename smem_val_config::smem_val_pack_uint_t; + using smem_val_pack_t = typename smem_val_config::smem_val_pack_t; + constexpr auto num_packed_elements = smem_val_config::num_packed_elements; auto* r = reinterpret_cast(smem_ptr); @@ -197,14 +190,14 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, __syncthreads(); // Copy PQ table - for (unsigned i = threadIdx.x * pq_val_config::pq_val_pack_num_elements; + for (unsigned i = threadIdx.x * smem_val_config::num_packed_elements; i < (1 << PQ_BITS) * PQ_LEN; - i += blockDim.x * pq_val_config::pq_val_pack_num_elements) { + i += blockDim.x * smem_val_config::num_packed_elements) { // Change the order of PQ code book array to reduce the // frequency of bank conflicts. 
constexpr auto num_elements_per_bank = - pq_val_config::pq_val_pack_num_elements / - (utils::size_of() / utils::size_of()); + smem_val_config::num_packed_elements / + (utils::size_of() / utils::size_of()); if constexpr (PQ_LEN >= num_elements_per_bank) { // safety constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank; @@ -212,29 +205,39 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, const auto smem_index = (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS); - if constexpr (pq_val_pack_num_elements == 2) { + if constexpr (num_packed_elements == 2) { half2 buf2; buf2.x = r->pq_code_book_ptr()[i]; buf2.y = r->pq_code_book_ptr()[i + 1]; - device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_t), buf2); - } else if constexpr (pq_val_pack_num_elements == 4) { - pq_val_pack_t buf4; - buf4.data.x1[0] = static_cast(static_cast(r->pq_code_book_ptr()[i])); - buf4.data.x1[1] = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); - buf4.data.x1[2] = static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); - buf4.data.x1[3] = static_cast(static_cast(r->pq_code_book_ptr()[i + 3])); - device::sts(codebook_buf + smem_index * sizeof(pq_val_pack_uint_t), buf4.as_uint()); - } else if constexpr (pq_val_pack_num_elements == 8) { - pq_val_pack_t buf8; - buf8.data.x1[0] = static_cast(static_cast(r->pq_code_book_ptr()[i])); - buf8.data.x1[1] = static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); - buf8.data.x1[2] = static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); - buf8.data.x1[3] = static_cast(static_cast(r->pq_code_book_ptr()[i + 3])); - buf8.data.x1[4] = static_cast(static_cast(r->pq_code_book_ptr()[i + 4])); - buf8.data.x1[5] = static_cast(static_cast(r->pq_code_book_ptr()[i + 5])); - buf8.data.x1[6] = static_cast(static_cast(r->pq_code_book_ptr()[i + 6])); - buf8.data.x1[7] = static_cast(static_cast(r->pq_code_book_ptr()[i + 7])); - device::sts(codebook_buf + smem_index * 
sizeof(pq_val_pack_uint_t), buf8.as_uint()); + device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_t), buf2); + } else if constexpr (num_packed_elements == 4) { + smem_val_pack_t buf4; + buf4.data.x1[0] = static_cast(static_cast(r->pq_code_book_ptr()[i])); + buf4.data.x1[1] = + static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); + buf4.data.x1[2] = + static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); + buf4.data.x1[3] = + static_cast(static_cast(r->pq_code_book_ptr()[i + 3])); + device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_uint_t), buf4.as_uint()); + } else if constexpr (num_packed_elements == 8) { + smem_val_pack_t buf8; + buf8.data.x1[0] = static_cast(static_cast(r->pq_code_book_ptr()[i])); + buf8.data.x1[1] = + static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); + buf8.data.x1[2] = + static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); + buf8.data.x1[3] = + static_cast(static_cast(r->pq_code_book_ptr()[i + 3])); + buf8.data.x1[4] = + static_cast(static_cast(r->pq_code_book_ptr()[i + 4])); + buf8.data.x1[5] = + static_cast(static_cast(r->pq_code_book_ptr()[i + 5])); + buf8.data.x1[6] = + static_cast(static_cast(r->pq_code_book_ptr()[i + 6])); + buf8.data.x1[7] = + static_cast(static_cast(r->pq_code_book_ptr()[i + 7])); + device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_uint_t), buf8.as_uint()); } } } @@ -245,65 +248,65 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, constexpr cuvs::spatial::knn::detail::utils::mapping mapping{}; auto smem_query_ptr = - reinterpret_cast(reinterpret_cast(smem_ptr) + sizeof(DescriptorT) + - DescriptorT::kSMemCodeBookSizeInBytes); - for (unsigned i = threadIdx.x * pq_val_pack_num_elements; i < dim; - i += blockDim.x * pq_val_pack_num_elements) { - pq_val_pack_t buf; - if constexpr (pq_val_config::pq_val_pack_num_elements == 2) { - if (i < dim) { static_cast(static_cast(buf.x = mapping(queries_ptr[i]))); } + reinterpret_cast(reinterpret_cast(smem_ptr) 
+ sizeof(DescriptorT) + + DescriptorT::kSMemCodeBookSizeInBytes); + for (unsigned i = threadIdx.x * num_packed_elements; i < dim; + i += blockDim.x * num_packed_elements) { + smem_val_pack_t buf; + if constexpr (smem_val_config::num_packed_elements == 2) { + if (i < dim) { static_cast(static_cast(buf.x = mapping(queries_ptr[i]))); } if (i + 1 < dim) { - static_cast(static_cast(buf.y = mapping(queries_ptr[i + 1]))); + static_cast(static_cast(buf.y = mapping(queries_ptr[i + 1]))); } - } else if constexpr (pq_val_config::pq_val_pack_num_elements == 4) { + } else if constexpr (smem_val_config::num_packed_elements == 4) { if (i < dim) { - buf.data.x1[0] = static_cast(static_cast(mapping(queries_ptr[i]))); + buf.data.x1[0] = static_cast(static_cast(mapping(queries_ptr[i]))); } if (i + 1 < dim) { - buf.data.x1[1] = static_cast(static_cast(mapping(queries_ptr[i + 1]))); + buf.data.x1[1] = static_cast(static_cast(mapping(queries_ptr[i + 1]))); } if (i + 2 < dim) { - buf.data.x1[2] = static_cast(static_cast(mapping(queries_ptr[i + 2]))); + buf.data.x1[2] = static_cast(static_cast(mapping(queries_ptr[i + 2]))); } if (i + 3 < dim) { - buf.data.x1[3] = static_cast(static_cast(mapping(queries_ptr[i + 3]))); + buf.data.x1[3] = static_cast(static_cast(mapping(queries_ptr[i + 3]))); } - } else if constexpr (pq_val_config::pq_val_pack_num_elements == 8) { + } else if constexpr (smem_val_config::num_packed_elements == 8) { if (i < dim) { - buf.data.x1[0] = static_cast(static_cast(mapping(queries_ptr[i]))); + buf.data.x1[0] = static_cast(static_cast(mapping(queries_ptr[i]))); } if (i + 1 < dim) { - buf.data.x1[1] = static_cast(static_cast(mapping(queries_ptr[i + 1]))); + buf.data.x1[1] = static_cast(static_cast(mapping(queries_ptr[i + 1]))); } if (i + 2 < dim) { - buf.data.x1[2] = static_cast(static_cast(mapping(queries_ptr[i + 2]))); + buf.data.x1[2] = static_cast(static_cast(mapping(queries_ptr[i + 2]))); } if (i + 3 < dim) { - buf.data.x1[3] = 
static_cast(static_cast(mapping(queries_ptr[i + 3]))); + buf.data.x1[3] = static_cast(static_cast(mapping(queries_ptr[i + 3]))); } if (i + 4 < dim) { - buf.data.x1[4] = static_cast(static_cast(mapping(queries_ptr[i + 4]))); + buf.data.x1[4] = static_cast(static_cast(mapping(queries_ptr[i + 4]))); } if (i + 5 < dim) { - buf.data.x1[5] = static_cast(static_cast(mapping(queries_ptr[i + 5]))); + buf.data.x1[5] = static_cast(static_cast(mapping(queries_ptr[i + 5]))); } if (i + 6 < dim) { - buf.data.x1[6] = static_cast(static_cast(mapping(queries_ptr[i + 6]))); + buf.data.x1[6] = static_cast(static_cast(mapping(queries_ptr[i + 6]))); } if (i + 7 < dim) { - buf.data.x1[7] = static_cast(static_cast(mapping(queries_ptr[i + 7]))); + buf.data.x1[7] = static_cast(static_cast(mapping(queries_ptr[i + 7]))); } } - if constexpr ((PQ_BITS == 8) && (PQ_LEN % pq_val_pack_num_elements == 0)) { + if constexpr ((PQ_BITS == 8) && (PQ_LEN % num_packed_elements == 0)) { // Transpose the queries buffer to avoid bank conflicts in compute_distance. 
constexpr uint32_t vlen = 4; // **** DO NOT CHANGE **** - constexpr auto kStride = vlen * PQ_LEN / pq_val_pack_num_elements; - reinterpret_cast( - smem_query_ptr)[transpose( - i / pq_val_pack_num_elements)] = buf; + constexpr auto kStride = vlen * PQ_LEN / num_packed_elements; + reinterpret_cast( + smem_query_ptr)[transpose( + i / num_packed_elements)] = buf; } else { - (reinterpret_cast(smem_query_ptr + i))[0] = buf; + (reinterpret_cast(smem_query_ptr + i))[0] = buf; } } @@ -327,11 +330,11 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( constexpr auto PQ_LEN = DescriptorT::kPqLen; using PQ_CODEBOOK_LOAD_T = uint32_t; - using pq_val_config = pq_val_type_t; - using pq_val_t = typename pq_val_config::pq_val_t; - using pq_val_pack_uint_t = typename pq_val_config::pq_val_pack_uint_t; - using pq_val_pack_t = typename pq_val_config::pq_val_pack_t; - constexpr uint32_t pq_val_pack_num_elements = pq_val_config::pq_val_pack_num_elements; + using smem_val_config = smem_val_type_t; + using smem_val_t = typename smem_val_config::smem_val_t; + using smem_val_pack_uint_t = typename smem_val_config::smem_val_pack_uint_t; + using smem_val_pack_t = typename smem_val_config::smem_val_pack_t; + constexpr uint32_t num_packed_elements = smem_val_config::num_packed_elements; const uint32_t query_ptr = pq_codebook_ptr + DescriptorT::kSMemCodeBookSizeInBytes; static_assert(PQ_BITS == 8, "Only pq_bits == 8 is supported at the moment."); @@ -364,7 +367,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( } // if constexpr (PQ_LEN % 2 == 0) { - if constexpr (PQ_LEN >= pq_val_pack_num_elements) { // safety + if constexpr (PQ_LEN >= num_packed_elements) { // safety // **** Use half2 for distance computation **** #pragma unroll for (std::uint32_t e = 0; e < nelem; e++) { @@ -384,52 +387,51 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( for (std::uint32_t v = 0; v < vlen; v++) { if (PQ_LEN * (v + k) >= dim) 
break; #pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN / pq_val_pack_num_elements; m++) { + for (std::uint32_t m = 0; m < PQ_LEN / num_packed_elements; m++) { constexpr uint32_t vq_val_pack_num_elements = 2; constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN); - std::uint32_t vq_half2_index = - m * (pq_val_pack_num_elements / vq_val_pack_num_elements) + - (PQ_LEN / vq_val_pack_num_elements) * v; + std::uint32_t vq_half2_index = m * (num_packed_elements / vq_val_pack_num_elements) + + (PQ_LEN / vq_val_pack_num_elements) * v; uint32_t query_val_index; if constexpr (PQ_LEN == 2) { query_val_index = vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId; - } else if constexpr (PQ_LEN == pq_val_pack_num_elements) { + } else if constexpr (PQ_LEN == num_packed_elements) { query_val_index = elem_offset + - v * (DatasetBlockDim / (pq_val_pack_num_elements * vlen)) + + v * (DatasetBlockDim / (num_packed_elements * vlen)) + e * TeamSize + laneId; } else { const uint32_t query_vec_element_id = (elem_offset + e * vlen * TeamSize + v + laneId * vlen) * PQ_LEN / - pq_val_pack_num_elements; - constexpr auto kStride = vlen * PQ_LEN / pq_val_pack_num_elements; - query_val_index = transpose( - query_vec_element_id); + num_packed_elements; + constexpr auto kStride = vlen * PQ_LEN / num_packed_elements; + query_val_index = + transpose(query_vec_element_id); } - if constexpr (pq_val_pack_num_elements == 2) { - pq_val_pack_t c2, q2; + if constexpr (num_packed_elements == 2) { + smem_val_pack_t c2, q2; // Loading PQ code book from smem device::lds(c2, - pq_codebook_ptr + sizeof(pq_val_pack_uint_t) * + pq_codebook_ptr + sizeof(smem_val_pack_uint_t) * ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); // Loading query vector from smem - device::lds(q2, query_ptr + sizeof(pq_val_pack_t) * query_val_index); + device::lds(q2, query_ptr + sizeof(smem_val_pack_t) * query_val_index); // L2 distance auto dist = q2 - c2 - 
reinterpret_cast(vq_vals)[vq_half2_index]; dist = dist * dist; norm += static_cast(dist.x + dist.y); - } else if constexpr (pq_val_pack_num_elements == 4) { - pq_val_pack_t c_vec, q_vec; + } else if constexpr (num_packed_elements == 4) { + smem_val_pack_t c_vec, q_vec; // Loading PQ code book from smem device::lds(c_vec.as_uint(), - pq_codebook_ptr + sizeof(pq_val_pack_uint_t) * + pq_codebook_ptr + sizeof(smem_val_pack_uint_t) * ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); device::lds(q_vec.as_uint(), - query_ptr + sizeof(pq_val_pack_uint_t) * query_val_index); + query_ptr + sizeof(smem_val_pack_uint_t) * query_val_index); half2 c2_, q2_; @@ -450,14 +452,14 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( reinterpret_cast(vq_vals)[vq_half2_index]; dist = dist * dist; norm += static_cast(dist.x + dist.y); - } else if constexpr (pq_val_pack_num_elements == 8) { - pq_val_pack_t c_vec, q_vec; + } else if constexpr (num_packed_elements == 8) { + smem_val_pack_t c_vec, q_vec; // Loading PQ code book from smem device::lds(c_vec.as_uint(), - pq_codebook_ptr + sizeof(pq_val_pack_uint_t) * + pq_codebook_ptr + sizeof(smem_val_pack_uint_t) * ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); device::lds(q_vec.as_uint(), - query_ptr + sizeof(pq_val_pack_uint_t) * query_val_index); + query_ptr + sizeof(smem_val_pack_uint_t) * query_val_index); half2 c2_, q2_; // Loading query vector from smem @@ -521,8 +523,8 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( #pragma unroll for (std::uint32_t v = 0; v < vlen; v++) { if (PQ_LEN * (v + k) >= dim) break; - CODE_BOOK_T pq_vals[PQ_LEN]; - device::lds(pq_vals, pq_codebook_ptr + sizeof(CODE_BOOK_T) * PQ_LEN * (pq_code & 0xff)); + CODE_BOOK_T smem_vals[PQ_LEN]; + device::lds(smem_vals, pq_codebook_ptr + sizeof(CODE_BOOK_T) * PQ_LEN * (pq_code & 0xff)); #pragma unroll for (std::uint32_t m = 0; m < PQ_LEN; m++) { const std::uint32_t d1 = m + (PQ_LEN * v); @@ -530,7 +532,7 @@ _RAFT_DEVICE 
RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( // if (d >= dataset_dim) break; DISTANCE_T diff; device::lds(diff, query_ptr + sizeof(QUERY_T) * d); - diff -= static_cast(pq_vals[m]); + diff -= static_cast(smem_vals[m]); diff -= static_cast(reinterpret_cast(vq_vals)[d1]); norm += diff * diff; diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index aa852803b0..aa0e3847b5 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -249,6 +249,43 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes( } } +template +struct uintN_t {}; +template <> +struct uintN_t<32> { + using type = uint32_t; +}; +template <> +struct uintN_t<64> { + using type = uint64_t; +}; + +template +struct fp8xN {}; + +template +struct fp8xN { + using uint_t = typename uintN_t<8 * NumPacked>::type; + using unit_t = __nv_fp8_e5m2; + using x2_t = __nv_fp8x2_storage_t; + static constexpr uint32_t num_elements = NumPacked; + + union { + unit_t x1[num_elements]; + x2_t x2[num_elements / 2]; + uint_t u; + } data; + + HDI fp8xN() { data.u = 0; } + + HDI uint_t& as_uint() { return data.u; } + HDI uint_t as_uint() const { return data.u; } + HDI half2 as_half2(const uint32_t i) const + { + return __nv_cvt_fp8x2_to_halfraw2(data.x2[i], __NV_E5M2); + } +}; + RAFT_DEVICE_INLINE_FUNCTION void lds(float& x, uint32_t addr) { asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "r"(addr)); diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh index 2ebf562488..61b9e595fc 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh @@ -100,57 +100,4 @@ struct fp_8bit { return r; } }; - -template -struct fp_8bit4 {}; - -template <> -struct fp_8bit4<5, true, false> { - using unit_t = __nv_fp8_e5m2; - using x2_t = __nv_fp8x2_storage_t; - using uint_t = uint32_t; - static 
constexpr uint32_t num_elements = 4; - - union { - unit_t x1[4]; - x2_t x2[2]; - uint_t u; - } data; - - HDI fp_8bit4() { data.u = 0; } - - HDI uint_t& as_uint() { return data.u; } - HDI uint_t as_uint() const { return data.u; } - HDI half2 as_half2(const uint32_t i) const - { - return __nv_cvt_fp8x2_to_halfraw2(data.x2[i], __NV_E5M2); - } -}; - -template -struct fp_8bit8 {}; - -template <> -struct fp_8bit8<5, true, false> { - using unit_t = __nv_fp8_e5m2; - using x2_t = __nv_fp8x2_storage_t; - using uint_t = uint64_t; - static constexpr uint32_t num_elements = 8; - - union { - unit_t x1[8]; - x2_t x2[4]; - uint_t u; - } data; - - HDI fp_8bit8() { data.u = 0; } - - HDI uint_t& as_uint() { return data.u; } - HDI uint_t as_uint() const { return data.u; } - HDI half2 as_half2(const uint32_t i) const - { - return __nv_cvt_fp8x2_to_halfraw2(data.x2[i], __NV_E5M2); - } -}; - } // namespace cuvs::neighbors::ivf_pq::detail From f91d0413becb4fffd37c1aea9b63f2a89eebb8c8 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Tue, 11 Nov 2025 12:41:57 +0900 Subject: [PATCH 029/119] Refactoring --- .../cagra/compute_distance_vpq-impl.cuh | 158 ++++-------------- 1 file changed, 29 insertions(+), 129 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index d626fbb422..b0b022c5d3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -210,34 +210,13 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, buf2.x = r->pq_code_book_ptr()[i]; buf2.y = r->pq_code_book_ptr()[i + 1]; device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_t), buf2); - } else if constexpr (num_packed_elements == 4) { - smem_val_pack_t buf4; - buf4.data.x1[0] = static_cast(static_cast(r->pq_code_book_ptr()[i])); - buf4.data.x1[1] = - static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); - 
buf4.data.x1[2] = - static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); - buf4.data.x1[3] = - static_cast(static_cast(r->pq_code_book_ptr()[i + 3])); - device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_uint_t), buf4.as_uint()); - } else if constexpr (num_packed_elements == 8) { - smem_val_pack_t buf8; - buf8.data.x1[0] = static_cast(static_cast(r->pq_code_book_ptr()[i])); - buf8.data.x1[1] = - static_cast(static_cast(r->pq_code_book_ptr()[i + 1])); - buf8.data.x1[2] = - static_cast(static_cast(r->pq_code_book_ptr()[i + 2])); - buf8.data.x1[3] = - static_cast(static_cast(r->pq_code_book_ptr()[i + 3])); - buf8.data.x1[4] = - static_cast(static_cast(r->pq_code_book_ptr()[i + 4])); - buf8.data.x1[5] = - static_cast(static_cast(r->pq_code_book_ptr()[i + 5])); - buf8.data.x1[6] = - static_cast(static_cast(r->pq_code_book_ptr()[i + 6])); - buf8.data.x1[7] = - static_cast(static_cast(r->pq_code_book_ptr()[i + 7])); - device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_uint_t), buf8.as_uint()); + } else if constexpr (num_packed_elements == 4 || num_packed_elements == 8) { + smem_val_pack_t buf; +#pragma unroll + for (uint32_t k = 0; k < num_packed_elements; k++) { + buf.data.x1[k] = static_cast(static_cast(r->pq_code_book_ptr()[k])); + } + device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_uint_t), buf.as_uint()); } } } @@ -253,48 +232,17 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, for (unsigned i = threadIdx.x * num_packed_elements; i < dim; i += blockDim.x * num_packed_elements) { smem_val_pack_t buf; - if constexpr (smem_val_config::num_packed_elements == 2) { + if constexpr (num_packed_elements == 2) { if (i < dim) { static_cast(static_cast(buf.x = mapping(queries_ptr[i]))); } if (i + 1 < dim) { static_cast(static_cast(buf.y = mapping(queries_ptr[i + 1]))); } - } else if constexpr (smem_val_config::num_packed_elements == 4) { - if (i < dim) { - buf.data.x1[0] = 
static_cast(static_cast(mapping(queries_ptr[i]))); - } - if (i + 1 < dim) { - buf.data.x1[1] = static_cast(static_cast(mapping(queries_ptr[i + 1]))); - } - if (i + 2 < dim) { - buf.data.x1[2] = static_cast(static_cast(mapping(queries_ptr[i + 2]))); - } - if (i + 3 < dim) { - buf.data.x1[3] = static_cast(static_cast(mapping(queries_ptr[i + 3]))); - } - } else if constexpr (smem_val_config::num_packed_elements == 8) { - if (i < dim) { - buf.data.x1[0] = static_cast(static_cast(mapping(queries_ptr[i]))); - } - if (i + 1 < dim) { - buf.data.x1[1] = static_cast(static_cast(mapping(queries_ptr[i + 1]))); - } - if (i + 2 < dim) { - buf.data.x1[2] = static_cast(static_cast(mapping(queries_ptr[i + 2]))); - } - if (i + 3 < dim) { - buf.data.x1[3] = static_cast(static_cast(mapping(queries_ptr[i + 3]))); - } - if (i + 4 < dim) { - buf.data.x1[4] = static_cast(static_cast(mapping(queries_ptr[i + 4]))); - } - if (i + 5 < dim) { - buf.data.x1[5] = static_cast(static_cast(mapping(queries_ptr[i + 5]))); - } - if (i + 6 < dim) { - buf.data.x1[6] = static_cast(static_cast(mapping(queries_ptr[i + 6]))); - } - if (i + 7 < dim) { - buf.data.x1[7] = static_cast(static_cast(mapping(queries_ptr[i + 7]))); + } else if constexpr (num_packed_elements == 4 || num_packed_elements == 8) { +#pragma unroll + for (uint32_t k = 0; k < num_packed_elements; k++) { + if (i + k < dim) { + buf.data.x1[k] = static_cast(static_cast(mapping(queries_ptr[i + k]))); + } } } @@ -424,7 +372,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( q2 - c2 - reinterpret_cast(vq_vals)[vq_half2_index]; dist = dist * dist; norm += static_cast(dist.x + dist.y); - } else if constexpr (num_packed_elements == 4) { + } else if constexpr (num_packed_elements == 4 || num_packed_elements == 8) { smem_val_pack_t c_vec, q_vec; // Loading PQ code book from smem device::lds(c_vec.as_uint(), @@ -435,68 +383,20 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( half2 c2_, q2_; - // 
Loading query vector from smem - c2_ = c_vec.as_half2(0); - q2_ = q_vec.as_half2(0); - // L2 distance - auto dist = q2_ - c2_ - - reinterpret_cast(vq_vals)[vq_half2_index]; - dist = dist * dist; - norm += static_cast(dist.x + dist.y); - - vq_half2_index += 1; - c2_ = c_vec.as_half2(1); - q2_ = q_vec.as_half2(1); - // L2 distance - dist = q2_ - c2_ - - reinterpret_cast(vq_vals)[vq_half2_index]; - dist = dist * dist; - norm += static_cast(dist.x + dist.y); - } else if constexpr (num_packed_elements == 8) { - smem_val_pack_t c_vec, q_vec; - // Loading PQ code book from smem - device::lds(c_vec.as_uint(), - pq_codebook_ptr + sizeof(smem_val_pack_uint_t) * - ((1 << PQ_BITS) * m + ((pq_code & 0xff)))); - device::lds(q_vec.as_uint(), - query_ptr + sizeof(smem_val_pack_uint_t) * query_val_index); - half2 c2_, q2_; - - // Loading query vector from smem - c2_ = c_vec.as_half2(0); - q2_ = q_vec.as_half2(0); - // L2 distance - auto dist = q2_ - c2_ - - reinterpret_cast(vq_vals)[vq_half2_index]; - dist = dist * dist; - norm += static_cast(dist.x + dist.y); - - vq_half2_index += 1; - c2_ = c_vec.as_half2(1); - q2_ = q_vec.as_half2(1); - // L2 distance - dist = q2_ - c2_ - - reinterpret_cast(vq_vals)[vq_half2_index]; - dist = dist * dist; - norm += static_cast(dist.x + dist.y); - - vq_half2_index += 1; - c2_ = c_vec.as_half2(2); - q2_ = q_vec.as_half2(2); - // L2 distance - dist = q2_ - c2_ - - reinterpret_cast(vq_vals)[vq_half2_index]; - dist = dist * dist; - norm += static_cast(dist.x + dist.y); - - vq_half2_index += 1; - c2_ = c_vec.as_half2(3); - q2_ = q_vec.as_half2(3); - // L2 distance - dist = q2_ - c2_ - - reinterpret_cast(vq_vals)[vq_half2_index]; - dist = dist * dist; - norm += static_cast(dist.x + dist.y); +#pragma unroll + for (uint32_t bi = 0; bi < num_packed_elements / 2; bi++) { + // Loading query vector from smem + c2_ = c_vec.as_half2(bi); + q2_ = q_vec.as_half2(bi); + // L2 distance + auto dist = + q2_ - c2_ - + reinterpret_cast(vq_vals)[vq_half2_index]; + dist 
= dist * dist; + norm += static_cast(dist.x + dist.y); + + vq_half2_index += 1; + } } } pq_code >>= 8; From 7c8ecd461e44a29ee7bb734d076f101a622e22e9 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Tue, 11 Nov 2025 16:06:11 +0900 Subject: [PATCH 030/119] Fix a bug --- cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index b0b022c5d3..786a294df7 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -214,7 +214,8 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, smem_val_pack_t buf; #pragma unroll for (uint32_t k = 0; k < num_packed_elements; k++) { - buf.data.x1[k] = static_cast(static_cast(r->pq_code_book_ptr()[k])); + buf.data.x1[k] = + static_cast(static_cast(r->pq_code_book_ptr()[i + k])); } device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_uint_t), buf.as_uint()); } From 7b461156e77e63af45d9d62053d643e863ef4475 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Tue, 11 Nov 2025 18:37:42 +0900 Subject: [PATCH 031/119] Add EnableFP8 flag --- .../cagra/compute_distance_vpq-impl.cuh | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 786a294df7..d017372924 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -15,21 +15,27 @@ namespace cuvs::neighbors::cagra::detail { -template -struct smem_val_type_t { - using smem_val_pack_t = device::fp8xN; - using smem_val_t = typename smem_val_pack_t::unit_t; - using smem_val_pack_uint_t = typename smem_val_pack_t::uint_t; - static constexpr uint32_t 
num_packed_elements = smem_val_pack_t::num_elements; -}; -template <> -struct smem_val_type_t<2> { +template +struct smem_val_type_t; + +template +struct smem_val_type_t> { using smem_val_pack_t = half2; using smem_val_t = half; using smem_val_pack_uint_t = uint32_t; static constexpr uint32_t num_packed_elements = 2; }; +template +struct smem_val_type_t> { + using smem_val_pack_t = device::fp8xN; + using smem_val_t = typename smem_val_pack_t::unit_t; + using smem_val_pack_uint_t = typename smem_val_pack_t::uint_t; + static constexpr uint32_t num_packed_elements = smem_val_pack_t::num_elements; +}; + template + typename DistanceT, + bool EnableFP8 = true> struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t { using base_type = dataset_descriptor_base_t; using CODE_BOOK_T = CodebookT; @@ -57,6 +64,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t, "Only CODE_BOOK_T = `half` is supported now"); @@ -101,8 +109,8 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t::smem_val_pack_uint_t>() / - smem_val_type_t::num_packed_elements; + utils::size_of::smem_val_pack_uint_t>() / + smem_val_type_t::num_packed_elements; _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(setup_workspace_type* setup_workspace_impl, compute_distance_type* compute_distance_impl, @@ -137,8 +145,8 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(dim, DatasetBlockDim) * - utils::size_of::smem_val_pack_uint_t>() / - smem_val_type_t::num_packed_elements; + utils::size_of::smem_val_pack_uint_t>() / + smem_val_type_t::num_packed_elements; } }; @@ -164,7 +172,8 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that, constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; constexpr auto PQ_BITS = DescriptorT::kPqBits; constexpr auto PQ_LEN = DescriptorT::kPqLen; - using smem_val_config = smem_val_type_t; + constexpr auto EnableFP8 = DescriptorT::kEnableFP8; + using smem_val_config = 
smem_val_type_t; using smem_val_t = typename smem_val_config::smem_val_t; using smem_val_pack_uint_t = typename smem_val_config::smem_val_pack_uint_t; using smem_val_pack_t = typename smem_val_config::smem_val_pack_t; @@ -277,9 +286,10 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim; constexpr auto PQ_BITS = DescriptorT::kPqBits; constexpr auto PQ_LEN = DescriptorT::kPqLen; + constexpr auto EnableFP8 = DescriptorT::kEnableFP8; using PQ_CODEBOOK_LOAD_T = uint32_t; - using smem_val_config = smem_val_type_t; + using smem_val_config = smem_val_type_t; using smem_val_t = typename smem_val_config::smem_val_t; using smem_val_pack_uint_t = typename smem_val_config::smem_val_pack_uint_t; using smem_val_pack_t = typename smem_val_config::smem_val_pack_t; From d49959c8a72747f393b3ee7afe4691ab7f573282 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Tue, 11 Nov 2025 19:17:51 +0900 Subject: [PATCH 032/119] Fix a bug --- cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index d017372924..13566627fb 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -353,7 +353,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker( (PQ_LEN / vq_val_pack_num_elements) * v; uint32_t query_val_index; - if constexpr (PQ_LEN == 2) { + if constexpr (num_packed_elements == 2) { query_val_index = vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId; } else if constexpr (PQ_LEN == num_packed_elements) { From ed906f7cac8f8c2ba060b8daf00137e6b5055c5d Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 12 Nov 2025 11:25:38 +0900 Subject: [PATCH 033/119] Fix a bug in 
compute_distance_00_generate.py --- cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py index 7beb86c5a9..da16bac177 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py @@ -8,7 +8,8 @@ * SPDX-FileCopyrightText: Copyright (c) 2024-{datetime.datetime.today().year}, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ - +""" +template += """ /* * NOTE: this file is generated by compute_distance_00_generate.py * From c0e9ddd2273d413bcf5f5d5bd978c1b3b4ef18ac Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 12 Nov 2025 12:00:08 +0900 Subject: [PATCH 034/119] Update VPQ instances --- cpp/CMakeLists.txt | 144 +- .../detail/cagra/compute_distance-ext.cuh | 1344 +++++++++++++++-- .../detail/cagra/compute_distance.cu | 360 ++++- .../cagra/compute_distance_00_generate.py | 19 +- .../cagra/compute_distance_vpq-impl.cuh | 20 +- .../detail/cagra/compute_distance_vpq.hpp | 3 +- ...float_uint32_dim1024_t32_8pq_8subd_half.cu | 41 - ...t32_dim1024_t32_8pq_8subd_half_fp8false.cu | 31 + ...nt32_dim1024_t32_8pq_8subd_half_fp8true.cu | 31 + ...d_float_uint32_dim128_t4_8pq_8subd_half.cu | 41 - ...int32_dim128_t4_8pq_8subd_half_fp8false.cu | 31 + ...uint32_dim128_t4_8pq_8subd_half_fp8true.cu | 31 + ...int32_dim128_t8_8pq_2subd_half_fp8false.cu | 31 + ...uint32_dim128_t8_8pq_2subd_half_fp8true.cu | 31 + ...int32_dim128_t8_8pq_4subd_half_fp8false.cu | 31 + ...uint32_dim128_t8_8pq_4subd_half_fp8true.cu | 31 + ...nt32_dim256_t16_8pq_2subd_half_fp8false.cu | 31 + ...int32_dim256_t16_8pq_2subd_half_fp8true.cu | 31 + ...nt32_dim256_t16_8pq_4subd_half_fp8false.cu | 31 + ...int32_dim256_t16_8pq_4subd_half_fp8true.cu | 31 + ...d_float_uint32_dim256_t8_8pq_8subd_half.cu | 41 - 
...int32_dim256_t8_8pq_8subd_half_fp8false.cu | 31 + ...uint32_dim256_t8_8pq_8subd_half_fp8true.cu | 31 + ..._float_uint32_dim512_t16_8pq_8subd_half.cu | 41 - ...nt32_dim512_t16_8pq_8subd_half_fp8false.cu | 31 + ...int32_dim512_t16_8pq_8subd_half_fp8true.cu | 31 + ...nt32_dim512_t32_8pq_2subd_half_fp8false.cu | 31 + ...int32_dim512_t32_8pq_2subd_half_fp8true.cu | 31 + ...nt32_dim512_t32_8pq_4subd_half_fp8false.cu | 31 + ...int32_dim512_t32_8pq_4subd_half_fp8true.cu | 31 + ...ed_float_uint32_dim64_t4_8pq_2subd_half.cu | 41 - ...uint32_dim64_t4_8pq_2subd_half_fp8false.cu | 31 + ..._uint32_dim64_t4_8pq_2subd_half_fp8true.cu | 31 + ...ed_float_uint32_dim64_t4_8pq_4subd_half.cu | 41 - ...uint32_dim64_t4_8pq_4subd_half_fp8false.cu | 31 + ..._uint32_dim64_t4_8pq_4subd_half_fp8true.cu | 31 + ..._half_uint32_dim1024_t32_8pq_8subd_half.cu | 41 - ...t32_dim1024_t32_8pq_8subd_half_fp8false.cu | 31 + ...nt32_dim1024_t32_8pq_8subd_half_fp8true.cu | 31 + ...ed_half_uint32_dim128_t4_8pq_8subd_half.cu | 41 - ...int32_dim128_t4_8pq_8subd_half_fp8false.cu | 31 + ...int32_dim128_t4_8pq_8subd_half_fp8true.cu} | 11 +- ...nt32_dim128_t8_8pq_2subd_half_fp8false.cu} | 5 +- ...int32_dim128_t8_8pq_2subd_half_fp8true.cu} | 7 +- ...int32_dim128_t8_8pq_4subd_half_fp8false.cu | 31 + ...int32_dim128_t8_8pq_4subd_half_fp8true.cu} | 5 +- ...t32_dim256_t16_8pq_2subd_half_fp8false.cu} | 5 +- ...nt32_dim256_t16_8pq_2subd_half_fp8true.cu} | 7 +- ...t32_dim256_t16_8pq_4subd_half_fp8false.cu} | 5 +- ...nt32_dim256_t16_8pq_4subd_half_fp8true.cu} | 7 +- ...ed_half_uint32_dim256_t8_8pq_8subd_half.cu | 41 - ...int32_dim256_t8_8pq_8subd_half_fp8false.cu | 31 + ...uint32_dim256_t8_8pq_8subd_half_fp8true.cu | 31 + ...d_half_uint32_dim512_t16_8pq_8subd_half.cu | 41 - ...nt32_dim512_t16_8pq_8subd_half_fp8false.cu | 31 + ...int32_dim512_t16_8pq_8subd_half_fp8true.cu | 31 + ...t32_dim512_t32_8pq_2subd_half_fp8false.cu} | 5 +- ...nt32_dim512_t32_8pq_2subd_half_fp8true.cu} | 7 +- 
...t32_dim512_t32_8pq_4subd_half_fp8false.cu} | 5 +- ...nt32_dim512_t32_8pq_4subd_half_fp8true.cu} | 7 +- ...ded_half_uint32_dim64_t4_8pq_2subd_half.cu | 41 - ...uint32_dim64_t4_8pq_2subd_half_fp8false.cu | 31 + ..._uint32_dim64_t4_8pq_2subd_half_fp8true.cu | 31 + ...ded_half_uint32_dim64_t4_8pq_4subd_half.cu | 41 - ...uint32_dim64_t4_8pq_4subd_half_fp8false.cu | 31 + ..._uint32_dim64_t4_8pq_4subd_half_fp8true.cu | 31 + ..._int8_uint32_dim1024_t32_8pq_8subd_half.cu | 41 - ...t32_dim1024_t32_8pq_8subd_half_fp8false.cu | 31 + ...nt32_dim1024_t32_8pq_8subd_half_fp8true.cu | 31 + ...ed_int8_uint32_dim128_t4_8pq_8subd_half.cu | 41 - ...int32_dim128_t4_8pq_8subd_half_fp8false.cu | 31 + ...int32_dim128_t4_8pq_8subd_half_fp8true.cu} | 9 +- ...int32_dim128_t8_8pq_2subd_half_fp8false.cu | 31 + ...int32_dim128_t8_8pq_2subd_half_fp8true.cu} | 5 +- ...int32_dim128_t8_8pq_4subd_half_fp8false.cu | 31 + ...uint32_dim128_t8_8pq_4subd_half_fp8true.cu | 31 + ...nt32_dim256_t16_8pq_2subd_half_fp8false.cu | 31 + ...nt32_dim256_t16_8pq_2subd_half_fp8true.cu} | 5 +- ...nt32_dim256_t16_8pq_4subd_half_fp8false.cu | 31 + ...nt32_dim256_t16_8pq_4subd_half_fp8true.cu} | 5 +- ...ed_int8_uint32_dim256_t8_8pq_8subd_half.cu | 41 - ...int32_dim256_t8_8pq_8subd_half_fp8false.cu | 31 + ...uint32_dim256_t8_8pq_8subd_half_fp8true.cu | 31 + ...d_int8_uint32_dim512_t16_8pq_8subd_half.cu | 41 - ...nt32_dim512_t16_8pq_8subd_half_fp8false.cu | 31 + ...int32_dim512_t16_8pq_8subd_half_fp8true.cu | 31 + ...nt32_dim512_t32_8pq_2subd_half_fp8false.cu | 31 + ...nt32_dim512_t32_8pq_2subd_half_fp8true.cu} | 5 +- ...nt32_dim512_t32_8pq_4subd_half_fp8false.cu | 31 + ...nt32_dim512_t32_8pq_4subd_half_fp8true.cu} | 5 +- ...ded_int8_uint32_dim64_t4_8pq_2subd_half.cu | 41 - ...uint32_dim64_t4_8pq_2subd_half_fp8false.cu | 31 + ..._uint32_dim64_t4_8pq_2subd_half_fp8true.cu | 31 + ...ded_int8_uint32_dim64_t4_8pq_4subd_half.cu | 41 - ...uint32_dim64_t4_8pq_4subd_half_fp8false.cu | 31 + 
..._uint32_dim64_t4_8pq_4subd_half_fp8true.cu | 31 + ...uint8_uint32_dim1024_t32_8pq_8subd_half.cu | 41 - ...t32_dim1024_t32_8pq_8subd_half_fp8false.cu | 31 + ...nt32_dim1024_t32_8pq_8subd_half_fp8true.cu | 31 + ...d_uint8_uint32_dim128_t4_8pq_8subd_half.cu | 41 - ...int32_dim128_t4_8pq_8subd_half_fp8false.cu | 31 + ...int32_dim128_t4_8pq_8subd_half_fp8true.cu} | 9 +- ...int32_dim128_t8_8pq_2subd_half_fp8false.cu | 31 + ...int32_dim128_t8_8pq_2subd_half_fp8true.cu} | 5 +- ...int32_dim128_t8_8pq_4subd_half_fp8false.cu | 31 + ...uint32_dim128_t8_8pq_4subd_half_fp8true.cu | 31 + ...nt32_dim256_t16_8pq_2subd_half_fp8false.cu | 31 + ...nt32_dim256_t16_8pq_2subd_half_fp8true.cu} | 5 +- ...nt32_dim256_t16_8pq_4subd_half_fp8false.cu | 31 + ...nt32_dim256_t16_8pq_4subd_half_fp8true.cu} | 5 +- ...d_uint8_uint32_dim256_t8_8pq_8subd_half.cu | 41 - ...int32_dim256_t8_8pq_8subd_half_fp8false.cu | 31 + ...uint32_dim256_t8_8pq_8subd_half_fp8true.cu | 31 + ..._uint8_uint32_dim512_t16_8pq_8subd_half.cu | 41 - ...nt32_dim512_t16_8pq_8subd_half_fp8false.cu | 31 + ...int32_dim512_t16_8pq_8subd_half_fp8true.cu | 31 + ...nt32_dim512_t32_8pq_2subd_half_fp8false.cu | 31 + ...nt32_dim512_t32_8pq_2subd_half_fp8true.cu} | 5 +- ...nt32_dim512_t32_8pq_4subd_half_fp8false.cu | 31 + ...nt32_dim512_t32_8pq_4subd_half_fp8true.cu} | 5 +- ...ed_uint8_uint32_dim64_t4_8pq_2subd_half.cu | 41 - ...uint32_dim64_t4_8pq_2subd_half_fp8false.cu | 31 + ..._uint32_dim64_t4_8pq_2subd_half_fp8true.cu | 31 + ...ed_uint8_uint32_dim64_t4_8pq_4subd_half.cu | 41 - ...uint32_dim64_t4_8pq_4subd_half_fp8false.cu | 31 + ..._uint32_dim64_t4_8pq_4subd_half_fp8true.cu | 31 + 126 files changed, 3949 insertions(+), 1301 deletions(-) delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu create mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu delete mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu create mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8true.cu} (82%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8false.cu} (82%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8true.cu} (82%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8true.cu} (82%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8false.cu} (82%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8true.cu} (82%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8false.cu} (82%) rename 
cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8true.cu} (82%) delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8false.cu} (82%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8true.cu} (82%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8false.cu} (82%) rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8true.cu} (82%) delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu create mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu} (82%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu} (82%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu create mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu} (82%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu} (82%) delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu} (82%) create mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu} (82%) delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu} (82%) create mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu} (82%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu} (82%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu} (82%) delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu create mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu} (82%) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu rename cpp/src/neighbors/detail/cagra/{compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu} (82%) delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f0bdb78987..3d6fb22558 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -253,54 +253,102 @@ if(NOT BUILD_CPU_ONLY) src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu 
src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu - 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu - 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu + 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu + 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8false.cu + 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu + 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu + 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu 
src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh index 32a57766da..c0fef4f0cf 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh @@ -83,7 +83,28 @@ extern template struct vpq_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; extern template struct vpq_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; extern template struct vpq_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; extern template struct vpq_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; extern template struct vpq_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; extern template struct standard_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; extern template struct vpq_descriptor_spec; + float, + true>; 
+extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; extern template struct vpq_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; extern template struct vpq_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; extern template struct vpq_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; extern template struct standard_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; extern template struct vpq_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; extern template struct vpq_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; extern template struct vpq_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; +extern 
template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; extern template struct vpq_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; extern template struct standard_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; extern template struct vpq_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; extern template struct vpq_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; extern template struct vpq_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; +extern template struct vpq_descriptor_spec; extern template struct vpq_descriptor_spec; + float, + false>; extern template struct vpq_descriptor_spec; + float, + true>; +extern template struct vpq_descriptor_spec; extern template struct standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, + 
vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, @@ -719,18 +1268,30 @@ extern template struct instance_selector< standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, @@ -740,18 +1301,93 @@ extern template struct instance_selector< standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - 
vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, @@ -761,18 +1397,174 @@ extern template struct instance_selector< standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec>; @@ -787,18 +1579,39 @@ using descriptor_instances = instance_selector< standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - 
vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, @@ -808,18 +1621,30 @@ using descriptor_instances = instance_selector< standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, @@ -829,18 +1654,93 @@ using descriptor_instances = instance_selector< 
standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, @@ -850,18 +1750,174 @@ using descriptor_instances = instance_selector< standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, 
standard_descriptor_spec, standard_descriptor_spec>; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu index fa708f2a4f..ec4eec28ac 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu @@ -28,18 +28,39 @@ template struct instance_selector< standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, @@ -49,18 +70,30 @@ template struct instance_selector< standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + 
vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, @@ -70,18 +103,93 @@ template struct instance_selector< standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, @@ -91,18 +199,174 @@ template struct instance_selector< standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + 
vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec, standard_descriptor_spec>; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py index da16bac177..0365145022 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py @@ -89,15 +89,16 @@ for code_book_t in code_book_types: for pq_bit in pq_bits: for metric in ['L2Expanded']: - path = f"compute_distance_vpq_{metric}_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}.cu" - includes = '#include "compute_distance_vpq-impl.cuh"' - params = f"{metric_prefix}{metric}, {team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}" - spec = f"vpq_descriptor_spec<{params}>" - content = f"""template struct {spec};""" - specs.append(spec) - with open(path, "w") as f: - f.write(template.format(includes=includes, content=content)) - cmake_list.append(f" src/neighbors/detail/cagra/{path}") + for enable_fp8 in ['true', 'false']: + path = f"compute_distance_vpq_{metric}_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}_fp8{enable_fp8}.cu" + includes = '#include "compute_distance_vpq-impl.cuh"' + params = f"{metric_prefix}{metric}, {team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}, {enable_fp8}" + spec = f"vpq_descriptor_spec<{params}>" + content = f"""template struct {spec};""" + specs.append(spec) + with open(path, "w") as f: + 
f.write(template.format(includes=includes, content=content)) + cmake_list.append(f" src/neighbors/detail/cagra/{path}") # CAGRA (Binary Hamming distance) for (mxdim, team) in mxdim_team: diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 13566627fb..5de2478702 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -45,7 +45,7 @@ template + bool EnableFP8> struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t { using base_type = dataset_descriptor_base_t; using CODE_BOOK_T = CodebookT; @@ -481,7 +481,8 @@ template + typename DistanceT, + bool EnableFP8> RAFT_KERNEL __launch_bounds__(1, 1) vpq_dataset_descriptor_init_kernel(dataset_descriptor_base_t* out, const std::uint8_t* encoded_dataset_ptr, @@ -499,7 +500,8 @@ RAFT_KERNEL __launch_bounds__(1, 1) CodebookT, DataT, IndexT, - DistanceT>; + DistanceT, + EnableFP8>; using base_type = typename desc_type::base_type; new (out) desc_type( reinterpret_cast(&setup_workspace_vpq), @@ -520,7 +522,8 @@ template + typename DistanceT, + bool EnableFP8> dataset_descriptor_host vpq_descriptor_spec::init_(const cagra::search_params& params, + DistanceT, + EnableFP8>::init_(const cagra::search_params& params, const std::uint8_t* encoded_dataset_ptr, uint32_t encoded_dataset_dim, const CodebookT* vq_code_book_ptr, @@ -546,7 +550,8 @@ vpq_descriptor_spec; + DistanceT, + EnableFP8>; using base_type = typename desc_type::base_type; desc_type dd_host{nullptr, @@ -568,7 +573,8 @@ vpq_descriptor_spec + DistanceT, + EnableFP8> <<<1, 1, 0, stream>>>(dev_ptr, encoded_dataset_ptr, encoded_dataset_dim, diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp index 2b69a1cef4..ece7323907 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp @@ -21,7 +21,8 @@ template + typename DistanceT, + bool EnableFP8> struct vpq_descriptor_spec : public instance_spec { using base_type = instance_spec; using typename base_type::data_type; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu deleted file mode 100644 index 5c458a281a..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu new file mode 100644 index 0000000000..0eeba4602c --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu new file mode 100644 index 0000000000..4a059f133e --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu deleted file mode 100644 index d5579a2be0..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu new file mode 100644 index 0000000000..314f233573 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu new file mode 100644 index 0000000000..369b44f743 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu new file mode 100644 index 0000000000..87927cd478 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu new file mode 100644 index 0000000000..33232e7a64 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu new file mode 100644 index 0000000000..f7290b2b5e --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu new file mode 100644 index 0000000000..d4b0360c01 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu new file mode 100644 index 0000000000..07b9021ad7 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu new file mode 100644 index 0000000000..92aafecd4f --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu new file mode 100644 index 0000000000..75f433ed4a --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu new file mode 100644 index 0000000000..25cdfcf44b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu deleted file mode 100644 index ae33cdc65a..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu new file mode 100644 index 0000000000..12c1166902 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu new file mode 100644 index 0000000000..4fd44ce5a8 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu deleted file mode 100644 index dcd1d6a074..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu new file mode 100644 index 0000000000..c2a3b9f565 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu new file mode 100644 index 0000000000..29a694b72d --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu new file mode 100644 index 0000000000..48782764f2 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu new file mode 100644 index 0000000000..da99ab9173 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu new file mode 100644 index 0000000000..0164636430 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu new file mode 100644 index 0000000000..d6918aab34 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu deleted file mode 100644 index 740ad40f21..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu new file mode 100644 index 0000000000..9ba5ae5005 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu new file mode 100644 index 0000000000..b9a4f4ebdf --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu deleted file mode 100644 index 6a01a5c0d5..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu new file mode 100644 index 0000000000..65a1455dca --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu new file mode 100644 index 0000000000..fc41ff9109 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu deleted file mode 100644 index a9766124f8..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu new file mode 100644 index 0000000000..ed1f9afc26 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu new file mode 100644 index 0000000000..37fd1ad8c5 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu deleted file mode 100644 index c5d6c72a6f..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu new file mode 100644 index 0000000000..0a50234576 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8true.cu index 680f594261..56b4a2f6fd 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -18,13 +18,14 @@ namespace cuvs::neighbors::cagra::detail { using namespace cuvs::distance; template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8false.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8false.cu index bbbc147de7..02c0559dd9 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8false.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -25,6 +25,7 @@ template struct vpq_descriptor_spec; + float, + false>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8true.cu index 9b29bb8ffc..fa6c5305d2 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -23,8 +23,9 @@ template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu new file mode 100644 index 0000000000..4680d19ab9 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8true.cu index b0883184c1..11e75f61b6 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -25,6 +25,7 @@ template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8false.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8false.cu index 6d7850cf8c..42bb7660d9 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8false.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -25,6 +25,7 @@ template struct vpq_descriptor_spec; + float, + false>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8true.cu index 08cd7590bc..520e36c602 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -23,8 +23,9 @@ template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8false.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8false.cu index 040fa4456d..023bed430d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8false.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -25,6 +25,7 @@ template struct vpq_descriptor_spec; + float, + false>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8true.cu index 6610d1d87b..c40e843fa5 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -23,8 +23,9 @@ template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu deleted file mode 100644 index 2114941e3e..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu new file mode 100644 index 0000000000..f2e07f0c5e --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu new file mode 100644 index 0000000000..123c117c32 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu deleted file mode 100644 index 0b78982890..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu new file mode 100644 index 0000000000..4d94ea3c71 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu new file mode 100644 index 0000000000..9a55456931 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8false.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8false.cu index 70ae484456..8fac7c2659 100644 --- 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8false.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -25,6 +25,7 @@ template struct vpq_descriptor_spec; + float, + false>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8true.cu index e251d13331..c83b911d36 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -23,8 +23,9 @@ template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8false.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8false.cu index c0f889af28..2b801907b9 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8false.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -25,6 +25,7 @@ template struct vpq_descriptor_spec; + float, + false>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8true.cu index 0b37928ab7..c07ede51e9 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -23,8 +23,9 @@ template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu deleted file mode 100644 index be699ca98a..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu new file mode 100644 index 0000000000..2652edfc8c --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu new file mode 100644 index 0000000000..95aadfc5d9 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu deleted file mode 100644 index 36592482e1..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu new file mode 100644 index 0000000000..85f46ec0f5 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu new file mode 100644 index 0000000000..db6c599e14 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu deleted file mode 100644 index e2d68ae772..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu new file mode 100644 index 0000000000..b9b38960af --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu new file mode 100644 index 0000000000..1bc6a46138 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu deleted file mode 100644 index 65cdfb0998..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu new file mode 100644 index 0000000000..4b856ff203 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu index d59e7c9078..2e84b879e8 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -18,13 +18,14 @@ namespace cuvs::neighbors::cagra::detail { using namespace cuvs::distance; template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu new file mode 100644 index 0000000000..1a03321b2f --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu index f4bb7d1e31..b46995999d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 
(c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -25,6 +25,7 @@ template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu new file mode 100644 index 0000000000..05d9febaeb --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu new file mode 100644 index 0000000000..4e3a5322d3 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu new file mode 100644 index 0000000000..99a955fcba --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu index ff0672de06..b0eb39d62b 100644 --- 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -25,6 +25,7 @@ template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu new file mode 100644 index 0000000000..e8fe498589 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu index bc160382be..e24dc1ef20 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -25,6 +25,7 @@ template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu deleted file mode 100644 index bf24d343fc..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu new file mode 100644 index 0000000000..8c40f8482e --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu new file mode 100644 index 0000000000..b857508f52 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu deleted file mode 100644 index 6cfa5ede30..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu new file mode 100644 index 0000000000..cc3e33adc1 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu new file mode 100644 index 0000000000..de0860b278 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu new file mode 100644 index 0000000000..c07ce1ff7a --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu index 894a1eae7b..ff7158f5d2 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -25,6 +25,7 @@ template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu new file mode 100644 index 0000000000..ea1a6e975b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu index 4aa48daee0..c21c9c5c10 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. 
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -25,6 +25,7 @@ template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu deleted file mode 100644 index aedfb0ef44..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu new file mode 100644 index 0000000000..b707ad056b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu new file mode 100644 index 0000000000..9c273805d9 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu deleted file mode 100644 index 42a56de4a5..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu new file mode 100644 index 0000000000..3fa2ef8170 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu new file mode 100644 index 0000000000..c1d4456e2d --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu deleted file mode 100644 index 6217a0047c..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu new file mode 100644 index 0000000000..1e09109eb8 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu new file mode 100644 index 0000000000..9ea862c9bc --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu deleted file mode 100644 index 6d06771052..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu new file mode 100644 index 0000000000..bfcc48f462 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu index b0e824d788..238572cf5f 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu 
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -18,13 +18,14 @@ namespace cuvs::neighbors::cagra::detail { using namespace cuvs::distance; template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu new file mode 100644 index 0000000000..58698a9760 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu index fc9f5043ac..8388bae580 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -25,6 +25,7 @@ template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu new file mode 100644 index 0000000000..584a58fcf1 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu new file mode 100644 index 0000000000..2f8b58b9e1 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu new file mode 100644 index 0000000000..b735134e70 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu index b7755c2d17..71d93ebe04 100644 --- 
a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -25,6 +25,7 @@ template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu new file mode 100644 index 0000000000..ba28f84414 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu index 5457ea76e7..70653e69e2 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -25,6 +25,7 @@ template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu deleted file mode 100644 index 4da8b5c0d7..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu new file mode 100644 index 0000000000..81a29015de --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu new file mode 100644 index 0000000000..6254aae41a --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu deleted file mode 100644 index c1f63841b4..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu new file mode 100644 index 0000000000..2223290eff --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu new file mode 100644 index 0000000000..f3f7c0ae07 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu new file mode 100644 index 0000000000..cc487728cd --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu index 4225ea81a3..0da175b065 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -25,6 +25,7 @@ template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu new file mode 100644 index 0000000000..690b8a90f7 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu similarity index 82% rename from cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu index dfcecd31b3..d3c5e032f8 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8true.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. 
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -25,6 +25,7 @@ template struct vpq_descriptor_spec; + float, + true>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu deleted file mode 100644 index 3d458c0c94..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu new file mode 100644 index 0000000000..b5ae8f18d8 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu new file mode 100644 index 0000000000..97f100c53f --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu deleted file mode 100644 index 6f59b47bbe..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu new file mode 100644 index 0000000000..f17eae07db --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu new file mode 100644 index 0000000000..b94a11d287 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance_vpq-impl.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; +template struct vpq_descriptor_spec; + +} // namespace cuvs::neighbors::cagra::detail From 35259e3fe5476abb7ab244b08f310108dadb71b3 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 12 Nov 2025 13:09:01 +0900 Subject: [PATCH 035/119] Add `smem_dtype` option --- cpp/include/cuvs/neighbors/cagra.hpp | 6 ++++++ cpp/src/neighbors/detail/cagra/cagra_search.cuh | 4 ++++ cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp | 6 ++++++ 3 files changed, 16 insertions(+) diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index 6192b263c3..5458bbccc7 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -194,6 +194,8 @@ enum class search_algo { enum class hash_mode { HASH = 0, SMALL = 1, AUTO = 100 }; +enum class internal_dtype { F16 = 0, E5M2 = 1, AUTO = 100 }; + struct search_params : cuvs::neighbors::search_params { /** Maximum number of queries to search at the same time (batch size). Auto select when 0.*/ size_t max_queries = 0; @@ -267,6 +269,10 @@ struct search_params : cuvs::neighbors::search_params { * negative, in which case the filtering rate is automatically calculated. */ float filtering_rate = -1.0; + + /** Data type of the query vector and codebook table on shared memory. Currently, only VPQ + * supports FP8. 
**/ + internal_dtype smem_dtype = internal_dtype::AUTO; }; /** diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index 26e0aafd2d..1213736a21 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -148,6 +148,10 @@ void search_main(raft::resources const& res, // Dispatch search parameters based on the dataset kind. if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset != nullptr) { + if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::AUTO && + params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::F16) { + RAFT_LOG_WARN("In this search mode, only AUTO or F16 are supported as the smem_dtype."); + } // Search using a plain (strided) row-major dataset RAFT_EXPECTS(index.metric() != cuvs::distance::DistanceType::CosineExpanded || index.dataset_norms().has_value(), diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp index ece7323907..0f55b3efb2 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp @@ -64,12 +64,18 @@ struct vpq_descriptor_spec : public instance_spec { const DatasetT& dataset, cuvs::distance::DistanceType metric) -> double { + const auto fp8_natively_supported = raft::getComputeCapability().first >= 9; + const auto use_fp8 = + params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::E5M2 || + (params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::AUTO && fp8_natively_supported); + // If explicit team_size is specified and doesn't match the instance, discard it if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; } if (cuvs::distance::DistanceType::L2Expanded != metric) { return -1.0; } // Match codebook params if (dataset.pq_bits() != PqBits) { return -1.0; } if (dataset.pq_len() != PqLen) { return -1.0; } + if
(use_fp8 != EnableFP8) { return -1.0; } // Otherwise, favor the closest dataset dimensionality. constexpr std::uint32_t preferred_load_elmes_per_thread = 16; /*magic number that is good based on experiments.*/ From 7639d0205f8b20726e381be87735dbcdbcd8a10d Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 12 Nov 2025 13:29:37 +0900 Subject: [PATCH 036/119] Remove unnecessary include --- cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh index 61b9e595fc..7f38342461 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -21,7 +21,6 @@ #include #include -#include namespace cuvs::neighbors::ivf_pq::detail { @@ -100,4 +99,5 @@ struct fp_8bit { return r; } }; + } // namespace cuvs::neighbors::ivf_pq::detail From 710232ae9124bab5195118877d7e92ab203171e9 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 12 Nov 2025 15:30:41 +0900 Subject: [PATCH 037/119] Remove unnecessary files --- ..._L2Expanded_dim128_t8_uint32_t_uint64_t.cu | 41 ------------------- ...L2Expanded_dim256_t16_uint32_t_uint64_t.cu | 41 ------------------- ...L2Expanded_dim512_t32_uint32_t_uint64_t.cu | 41 ------------------- ...q_L2Expanded_dim64_t4_uint32_t_uint64_t.cu | 41 ------------------- 4 files changed, 164 deletions(-) delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu delete mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu deleted file mode 100644 index 10ddfc0163..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim128_t8_uint32_t_uint64_t.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vrabitq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vrabitq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu deleted file mode 100644 index e057457a6a..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim256_t16_uint32_t_uint64_t.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vrabitq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vrabitq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu deleted file mode 100644 index c30bd76785..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim512_t32_uint32_t_uint64_t.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vrabitq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vrabitq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu deleted file mode 100644 index 472dd9821f..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vrabitq_L2Expanded_dim64_t4_uint32_t_uint64_t.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vrabitq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vrabitq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail From d183be67dbe8d07dc6ed8657701e6b1cb4d63ac0 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Thu, 13 Nov 2025 01:16:28 +0900 Subject: [PATCH 038/119] Remove unnecessary file --- cpp/tests/neighbors/vpq_utils.cuh | 77 ------------------------------- 1 file changed, 77 deletions(-) delete mode 100644 cpp/tests/neighbors/vpq_utils.cuh diff --git a/cpp/tests/neighbors/vpq_utils.cuh b/cpp/tests/neighbors/vpq_utils.cuh deleted file mode 100644 index 383e5ef063..0000000000 --- a/cpp/tests/neighbors/vpq_utils.cuh +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include - -namespace cuvs::neighbors { -template -__global__ void decode_vpq_dataset_kernel(data_t* const decoded_dataset_ptr, - const uint32_t ldd, - const math_t* const vq_codebook_ptr, - const uint32_t ldv, - const math_t* const pq_codebook_ptr, - const uint32_t pq_subspace_dim, - const uint32_t pq_table_size, - const uint32_t dataset_dim, - const size_t dataset_size, - const uint8_t* const data_ptr, - const uint32_t ldi) -{ - constexpr uint32_t warp_size = 32; - const size_t batch_id = (blockIdx.x * blockDim.x + threadIdx.x) / warp_size; - if (batch_id >= dataset_size) { return; } - - const auto local_data_ptr = data_ptr + ldi * batch_id; - const auto vq_code = *reinterpret_cast(local_data_ptr); - const auto pq_code_ptr = local_data_ptr + sizeof(uint32_t); - const auto vq_vec_ptr = vq_codebook_ptr + vq_code * ldv; - auto local_dst_ptr = decoded_dataset_ptr + batch_id * ldd; - - const auto lane_id = threadIdx.x % warp_size; - for (uint32_t i = lane_id; i < dataset_dim; i += warp_size) { - const auto pq_code = pq_code_ptr[i / pq_subspace_dim]; - const auto pq_v = pq_codebook_ptr[pq_code * pq_subspace_dim + (i % pq_subspace_dim)]; - - local_dst_ptr[i] = static_cast(vq_vec_ptr[i]) + static_cast(pq_v); - } -} - -template -void decode_vpq_dataset(raft::device_matrix_view decoded_dataset, - const cuvs::neighbors::vpq_dataset& vpq_dataset, - cudaStream_t cuda_stream) -{ - const auto dataset_size = decoded_dataset.extent(0); - RAFT_EXPECTS(vpq_dataset.data.extent(0) == dataset_size, "Dataset sizes mismatch"); - - constexpr uint32_t block_size = 256; - constexpr uint32_t warp_size = 32; - constexpr int64_t vecs_per_cta = block_size / warp_size; - const auto grid_size = raft::div_rounding_up_safe(decoded_dataset.extent(0), vecs_per_cta); - - decode_vpq_dataset_kernel - <<>>(decoded_dataset.data_handle(), - decoded_dataset.stride(0), - vpq_dataset.vq_code_book.data_handle(), - vpq_dataset.vq_code_book.stride(0), - vpq_dataset.pq_code_book.data_handle(), - 
vpq_dataset.pq_len(), - 1u << vpq_dataset.pq_bits(), - vpq_dataset.dim(), - dataset_size, - vpq_dataset.data.data_handle(), - vpq_dataset.data.stride(0)); -} -} // namespace cuvs::neighbors From e7d3d42c5de6a512f48163cffdbf5dfb146b10f7 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Thu, 13 Nov 2025 01:36:12 +0900 Subject: [PATCH 039/119] Revert "Remove unnecessary file" This reverts commit d183be67dbe8d07dc6ed8657701e6b1cb4d63ac0. --- cpp/tests/neighbors/vpq_utils.cuh | 77 +++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 cpp/tests/neighbors/vpq_utils.cuh diff --git a/cpp/tests/neighbors/vpq_utils.cuh b/cpp/tests/neighbors/vpq_utils.cuh new file mode 100644 index 0000000000..383e5ef063 --- /dev/null +++ b/cpp/tests/neighbors/vpq_utils.cuh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include + +namespace cuvs::neighbors { +template +__global__ void decode_vpq_dataset_kernel(data_t* const decoded_dataset_ptr, + const uint32_t ldd, + const math_t* const vq_codebook_ptr, + const uint32_t ldv, + const math_t* const pq_codebook_ptr, + const uint32_t pq_subspace_dim, + const uint32_t pq_table_size, + const uint32_t dataset_dim, + const size_t dataset_size, + const uint8_t* const data_ptr, + const uint32_t ldi) +{ + constexpr uint32_t warp_size = 32; + const size_t batch_id = (blockIdx.x * blockDim.x + threadIdx.x) / warp_size; + if (batch_id >= dataset_size) { return; } + + const auto local_data_ptr = data_ptr + ldi * batch_id; + const auto vq_code = *reinterpret_cast(local_data_ptr); + const auto pq_code_ptr = local_data_ptr + sizeof(uint32_t); + const auto vq_vec_ptr = vq_codebook_ptr + vq_code * ldv; + auto local_dst_ptr = decoded_dataset_ptr + batch_id * ldd; + + const auto lane_id = threadIdx.x % warp_size; + for (uint32_t i = lane_id; i < dataset_dim; i += warp_size) { + const auto pq_code = pq_code_ptr[i / pq_subspace_dim]; + const auto pq_v = pq_codebook_ptr[pq_code * pq_subspace_dim + (i % pq_subspace_dim)]; + + local_dst_ptr[i] = static_cast(vq_vec_ptr[i]) + static_cast(pq_v); + } +} + +template +void decode_vpq_dataset(raft::device_matrix_view decoded_dataset, + const cuvs::neighbors::vpq_dataset& vpq_dataset, + cudaStream_t cuda_stream) +{ + const auto dataset_size = decoded_dataset.extent(0); + RAFT_EXPECTS(vpq_dataset.data.extent(0) == dataset_size, "Dataset sizes mismatch"); + + constexpr uint32_t block_size = 256; + constexpr uint32_t warp_size = 32; + constexpr int64_t vecs_per_cta = block_size / warp_size; + const auto grid_size = raft::div_rounding_up_safe(decoded_dataset.extent(0), vecs_per_cta); + + decode_vpq_dataset_kernel + <<>>(decoded_dataset.data_handle(), + decoded_dataset.stride(0), + vpq_dataset.vq_code_book.data_handle(), + vpq_dataset.vq_code_book.stride(0), + vpq_dataset.pq_code_book.data_handle(), + 
vpq_dataset.pq_len(), + 1u << vpq_dataset.pq_bits(), + vpq_dataset.dim(), + dataset_size, + vpq_dataset.data.data_handle(), + vpq_dataset.data.stride(0)); +} +} // namespace cuvs::neighbors From 24089bc21fc215ca3e413001cd516d90eee903ae Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Thu, 13 Nov 2025 01:37:01 +0900 Subject: [PATCH 040/119] Fix Copyright --- cpp/tests/neighbors/vpq_utils.cuh | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/cpp/tests/neighbors/vpq_utils.cuh b/cpp/tests/neighbors/vpq_utils.cuh index 383e5ef063..8ceb371413 100644 --- a/cpp/tests/neighbors/vpq_utils.cuh +++ b/cpp/tests/neighbors/vpq_utils.cuh @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include From 13725b301aaab5c85b8a0e1b7f9d84e2ea2e70c3 Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Fri, 6 Feb 2026 05:28:57 -0800 Subject: [PATCH 041/119] add a new max_node_id parameter to the CAGRA search API, allowing users to constrain random seed node selection to a subset of the dataset. This is useful when the graph is smaller than the dataset, such as during iterative build with compression. 
--- cpp/include/cuvs/neighbors/cagra.hpp | 7 +++++++ .../neighbors/detail/cagra/device_common.hpp | 6 ++++-- .../cagra/search_multi_cta_kernel-inl.cuh | 9 ++++++--- .../cagra/search_single_cta_kernel-inl.cuh | 20 +++++++++++++------ 4 files changed, 31 insertions(+), 11 deletions(-) diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index 9b9e8eb0e6..ef55507869 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -247,6 +247,13 @@ struct search_params : cuvs::neighbors::search_params { /** Bit mask used for initial random seed node selection. */ uint64_t rand_xor_mask = 0x128394; + /** + * Maximum node ID for random seed selection. + * When > 0, random seeds are constrained to [0, max_node_id) instead of [0, dataset_size). + * This is useful when the graph is smaller than the dataset (e.g., iterative build with compression). + * Default 0 means no constraint (use dataset_size). + */ + uint32_t max_node_id = 0; /** Whether to use the persistent version of the kernel (only SINGLE_CTA is supported a.t.m.) */ bool persistent = false; /** Persistent kernel: time in seconds before the kernel stops if no requests received. 
*/ diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 8a5bb6ba1f..df22b28081 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -102,11 +102,13 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( IndexT* __restrict__ traversed_hash_ptr, const uint32_t traversed_hash_bitlen, const uint32_t block_id = 0, - const uint32_t num_blocks = 1) + const uint32_t num_blocks = 1, + const IndexT max_node_id = 0) { const auto team_size_bits = dataset_desc.team_size_bitshift_from_smem(); const auto max_i = raft::round_up_safe(num_pickup, warp_size >> team_size_bits); const auto compute_distance = dataset_desc.compute_distance_impl; + const IndexT seed_index_limit = max_node_id > 0 ? max_node_id : dataset_desc.size; for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += (blockDim.x >> team_size_bits)) { const bool valid_i = (i < num_pickup); @@ -122,7 +124,7 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( if (seed_ptr && (gid < num_seeds)) { seed_index = seed_ptr[gid]; } else { - seed_index = device::xorshift64(gid ^ rand_xor_mask) % dataset_desc.size; + seed_index = device::xorshift64(gid ^ rand_xor_mask) % seed_index_limit; } } diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh index c8b885dffe..916767c00b 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -193,7 +193,8 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( const uint32_t min_iteration, const uint32_t max_iteration, uint32_t* const num_executed_iterations, /* stats */ - SAMPLE_FILTER_T sample_filter) + SAMPLE_FILTER_T sample_filter, + const typename DATASET_DESCRIPTOR_T::INDEX_T max_node_id = 0) { using DATA_T = typename 
DATASET_DESCRIPTOR_T::DATA_T; using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; @@ -281,7 +282,8 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( local_traversed_hashmap_ptr, traversed_hash_bitlen, block_id, - num_blocks); + num_blocks, + max_node_id); __syncthreads(); _CLK_REC(clk_compute_1st_distance); @@ -627,7 +629,8 @@ void select_and_run(const dataset_descriptor_host& dat ps.min_iterations, ps.max_iterations, num_executed_iterations, - sample_filter); + sample_filter, + ps.max_node_id); } } // namespace multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 404817e582..956f9ac3e4 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -702,7 +702,8 @@ RAFT_DEVICE_INLINE_FUNCTION void search_core( const std::uint32_t small_hash_bitlen, const std::uint32_t small_hash_reset_interval, const std::uint32_t query_id, - SAMPLE_FILTER_T sample_filter) + SAMPLE_FILTER_T sample_filter, + const typename DATASET_DESCRIPTOR_T::INDEX_T max_node_id = 0) { using LOAD_T = device::LOAD_128BIT_T; @@ -791,7 +792,10 @@ RAFT_DEVICE_INLINE_FUNCTION void search_core( local_visited_hashmap_ptr, hash_bitlen, (INDEX_T*)nullptr, - 0); + 0, + 0, + 1, + max_node_id); __syncthreads(); _CLK_REC(clk_compute_1st_distance); @@ -1124,7 +1128,8 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( const std::uint32_t hash_bitlen, const std::uint32_t small_hash_bitlen, const std::uint32_t small_hash_reset_interval, - SAMPLE_FILTER_T sample_filter) + SAMPLE_FILTER_T sample_filter, + const typename DATASET_DESCRIPTOR_T::INDEX_T max_node_id = 0) { const auto query_id = blockIdx.y; search_core Date: Fri, 6 Feb 2026 06:51:31 -0800 Subject: [PATCH 042/119] Changed the max node id parameter name to graph_size for clarity; removed max_node_id from the search parameters structure --- 
cpp/include/cuvs/neighbors/cagra.hpp | 7 ------- cpp/src/neighbors/detail/cagra/device_common.hpp | 4 ++-- .../detail/cagra/search_multi_cta_kernel-inl.cuh | 6 +++--- .../detail/cagra/search_single_cta_kernel-inl.cuh | 13 ++++++------- 4 files changed, 11 insertions(+), 19 deletions(-) diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index ef55507869..9b9e8eb0e6 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -247,13 +247,6 @@ struct search_params : cuvs::neighbors::search_params { /** Bit mask used for initial random seed node selection. */ uint64_t rand_xor_mask = 0x128394; - /** - * Maximum node ID for random seed selection. - * When > 0, random seeds are constrained to [0, max_node_id) instead of [0, dataset_size). - * This is useful when the graph is smaller than the dataset (e.g., iterative build with compression). - * Default 0 means no constraint (use dataset_size). - */ - uint32_t max_node_id = 0; /** Whether to use the persistent version of the kernel (only SINGLE_CTA is supported a.t.m.) */ bool persistent = false; /** Persistent kernel: time in seconds before the kernel stops if no requests received. 
*/ diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index df22b28081..8cbbe1d366 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -103,12 +103,12 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( const uint32_t traversed_hash_bitlen, const uint32_t block_id = 0, const uint32_t num_blocks = 1, - const IndexT max_node_id = 0) + const IndexT graph_size = 0) { const auto team_size_bits = dataset_desc.team_size_bitshift_from_smem(); const auto max_i = raft::round_up_safe(num_pickup, warp_size >> team_size_bits); const auto compute_distance = dataset_desc.compute_distance_impl; - const IndexT seed_index_limit = max_node_id > 0 ? max_node_id : dataset_desc.size; + const IndexT seed_index_limit = graph_size > 0 ? graph_size : dataset_desc.size; for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += (blockDim.x >> team_size_bits)) { const bool valid_i = (i < num_pickup); diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh index 916767c00b..a2e9a43ff0 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -194,7 +194,7 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( const uint32_t max_iteration, uint32_t* const num_executed_iterations, /* stats */ SAMPLE_FILTER_T sample_filter, - const typename DATASET_DESCRIPTOR_T::INDEX_T max_node_id = 0) + const typename DATASET_DESCRIPTOR_T::INDEX_T graph_size = 0) { using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; @@ -283,7 +283,7 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( traversed_hash_bitlen, block_id, num_blocks, - max_node_id); + graph_size); __syncthreads(); _CLK_REC(clk_compute_1st_distance); @@ -630,7 
+630,7 @@ void select_and_run(const dataset_descriptor_host& dat ps.max_iterations, num_executed_iterations, sample_filter, - ps.max_node_id); + static_cast(graph.extent(0))); } } // namespace multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 956f9ac3e4..5d465c25b5 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -703,7 +703,7 @@ RAFT_DEVICE_INLINE_FUNCTION void search_core( const std::uint32_t small_hash_reset_interval, const std::uint32_t query_id, SAMPLE_FILTER_T sample_filter, - const typename DATASET_DESCRIPTOR_T::INDEX_T max_node_id = 0) + const typename DATASET_DESCRIPTOR_T::INDEX_T graph_size = 0) { using LOAD_T = device::LOAD_128BIT_T; @@ -795,7 +795,7 @@ RAFT_DEVICE_INLINE_FUNCTION void search_core( 0, 0, 1, - max_node_id); + graph_size); __syncthreads(); _CLK_REC(clk_compute_1st_distance); @@ -1129,7 +1129,7 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( const std::uint32_t small_hash_bitlen, const std::uint32_t small_hash_reset_interval, SAMPLE_FILTER_T sample_filter, - const typename DATASET_DESCRIPTOR_T::INDEX_T max_node_id = 0) + const typename DATASET_DESCRIPTOR_T::INDEX_T graph_size = 0) { const auto query_id = blockIdx.y; search_core(graph.extent(0))); RAFT_CUDA_TRY(cudaPeekAtLastError()); } } From 70a69d922ec757a7093cecde07eb5a87f3c44de2 Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Fri, 6 Feb 2026 08:55:07 -0800 Subject: [PATCH 043/119] wrote test --- .../detail/cagra/search_multi_kernel.cuh | 16 +- cpp/tests/CMakeLists.txt | 1 + .../bug_graph_smaller_than_dataset.cu | 159 ++++++++++++++++++ 3 files changed, 171 insertions(+), 5 deletions(-) create mode 100644 cpp/tests/neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh 
b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index f7d353d864..0ee7439ac6 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -104,7 +104,8 @@ RAFT_KERNEL random_pickup_kernel( typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr, // [num_queries, ldr] const std::uint32_t ldr, // (*) ldr >= num_pickup typename DATASET_DESCRIPTOR_T::INDEX_T* const visited_hashmap_ptr, // [num_queries, 1 << bitlen] - const std::uint32_t hash_bitlen) + const std::uint32_t hash_bitlen, + const typename DATASET_DESCRIPTOR_T::INDEX_T graph_size = 0) { using DATA_T = typename DATASET_DESCRIPTOR_T::DATA_T; using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; @@ -119,6 +120,8 @@ RAFT_KERNEL random_pickup_kernel( dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id); __syncthreads(); + const INDEX_T seed_index_limit = graph_size > 0 ? graph_size : dataset_desc->size; + INDEX_T best_index_team_local; DISTANCE_T best_norm2_team_local = utils::get_max_value(); for (unsigned i = 0; i < num_distilation; i++) { @@ -128,7 +131,7 @@ RAFT_KERNEL random_pickup_kernel( } else { // Chose a seed node randomly seed_index = - device::xorshift64((global_team_index ^ rand_xor_mask) * (i + 1)) % dataset_desc->size; + device::xorshift64((global_team_index ^ rand_xor_mask) * (i + 1)) % seed_index_limit; } DISTANCE_T norm2 = dataset_desc->compute_distance(seed_index, true); @@ -166,7 +169,8 @@ void random_pickup(const dataset_descriptor_host& data std::size_t ldr, // (*) ldr >= num_pickup IndexT* visited_hashmap_ptr, // [num_queries, 1 << bitlen] std::uint32_t hash_bitlen, - cudaStream_t cuda_stream) + cudaStream_t cuda_stream, + IndexT graph_size = 0) { const auto block_size = 256u; const auto num_teams_per_threadblock = block_size / dataset_desc.team_size; @@ -185,7 +189,8 @@ void random_pickup(const dataset_descriptor_host& data result_distances_ptr, ldr, visited_hashmap_ptr, - 
hash_bitlen); + hash_bitlen, + graph_size); } template @@ -826,7 +831,8 @@ struct search result_buffer_allocation_size, hashmap.data(), hash_bitlen, - stream); + stream, + static_cast(this->dataset_size)); unsigned iter = 0; while (1) { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 9fc620b4cb..3643b2e12d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -156,6 +156,7 @@ ConfigureTest( ConfigureTest( NAME NEIGHBORS_ANN_CAGRA_TEST_BUGS PATH neighbors/ann_cagra/bug_extreme_inputs_oob.cu neighbors/ann_cagra/bug_multi_cta_crash.cu + neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu GPUS 1 PERCENT 100 ) diff --git a/cpp/tests/neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu b/cpp/tests/neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu new file mode 100644 index 0000000000..8a29779bbd --- /dev/null +++ b/cpp/tests/neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu @@ -0,0 +1,159 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include + +#include +#include +#include +#include +#include + +#include + +namespace cuvs::neighbors::cagra { + +/** + * @brief Test verifying graph.extent(0) is used for random seed selection + * + * This test ensures that CAGRA search kernels correctly use graph.extent(0) + * (graph size) rather than dataset.size for random seed node selection. + * + * The bug: random seed selection previously used dataset_desc.size, which + * could cause OOB access if the graph size differed from dataset size + * (e.g., in CAGRA-Q iterative builds with compression). + * + * The fix: kernels now receive graph.extent(0) as graph_size parameter, + * ensuring seeds are always within valid graph node range [0, graph_size). 
+ */ +class cagra_graph_smaller_than_dataset_test : public ::testing::Test { + public: + using data_type = float; + using index_type = uint32_t; + + protected: + void run() + { + // Create a dataset with 10000 points + constexpr int64_t n_dataset = 10000; + constexpr int64_t n_dim = 128; + constexpr int64_t n_queries = 100; + constexpr int64_t k = 10; + + // Build index normally + auto dataset = raft::make_device_matrix(res, n_dataset, n_dim); + raft::random::RngState r(1234ULL); + raft::random::uniform(res, r, dataset.data_handle(), n_dataset * n_dim, data_type(-1), data_type(1)); + + cagra::index_params index_params; + index_params.graph_degree = 32; + index_params.intermediate_graph_degree = 64; + + auto index = cagra::build(res, index_params, raft::make_const_mdspan(dataset.view())); + raft::resource::sync_stream(res); + + // Get the graph from the index + auto original_graph = index.graph(); + ASSERT_EQ(original_graph.extent(0), n_dataset); + + // Recreate the bug scenario: LARGE dataset, SMALL graph + // (like iterative_build_graph does in intermediate iterations) + constexpr int64_t n_graph = n_dataset / 2; // Only 5000 nodes in graph + + // Step 1: Build index on SMALL subset (5000 points) + auto small_dataset_view = raft::make_device_matrix_view( + dataset.data_handle(), n_graph, n_dim); + + cagra::index_params small_index_params; + small_index_params.graph_degree = 32; + auto small_index = cagra::build(res, small_index_params, small_dataset_view); + raft::resource::sync_stream(res); + + // Step 2: Update to FULL dataset (10000 points) but keep small graph (5000 nodes) + // This creates the exact bug scenario: dataset.size=10000, graph.extent(0)=5000 + small_index.update_dataset(res, raft::make_const_mdspan(dataset.view())); + + // Verify the mismatch - THIS IS THE BUG SCENARIO! 
+ ASSERT_EQ(small_index.graph().extent(0), n_graph); // Graph has 5000 nodes + ASSERT_EQ(small_index.size(), n_dataset); // Dataset has 10000 points + ASSERT_NE(small_index.graph().extent(0), small_index.size()); // Mismatch! + + // Create queries + auto queries = raft::make_device_matrix(res, n_queries, n_dim); + raft::random::uniform(res, r, queries.data_handle(), n_queries * n_dim, data_type(-1), data_type(1)); + + // Allocate output + auto neighbors = raft::make_device_matrix(res, n_queries, k); + auto distances = raft::make_device_matrix(res, n_queries, k); + + // Setup search params + cagra::search_params search_params; + search_params.itopk_size = 64; + search_params.search_width = 1; + search_params.max_iterations = 10; + search_params.algo = cagra::search_algo::SINGLE_CTA; + + // THIS SHOULD NOT CRASH OR CAUSE OOB ACCESS + // Before fix: random seeds use dataset.size (10000) -> tries to access graph[7000] -> CRASH! + // After fix: random seeds use graph.extent(0) (5000) -> only accesses graph[0-4999] -> SAFE! 
+ cagra::search(res, + search_params, + small_index, + raft::make_const_mdspan(queries.view()), + neighbors.view(), + distances.view()); + + raft::resource::sync_stream(res); + + // Verify results are valid (neighbors should be < graph size) + auto neighbors_host = raft::make_host_matrix(n_queries, k); + raft::copy(neighbors_host.data_handle(), + neighbors.data_handle(), + n_queries * k, + raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + // All neighbor indices should be valid (< n_graph) + for (int64_t i = 0; i < n_queries * k; i++) { + ASSERT_LT(neighbors_host.data_handle()[i], n_graph) + << "Neighbor index " << neighbors_host.data_handle()[i] + << " is >= graph size " << n_graph; + } + + // Test with MULTI_CTA algorithm as well (also had the same bug) + search_params.algo = cagra::search_algo::MULTI_CTA; + + cagra::search(res, + search_params, + small_index, + raft::make_const_mdspan(queries.view()), + neighbors.view(), + distances.view()); + + raft::resource::sync_stream(res); + + // Verify again + raft::copy(neighbors_host.data_handle(), + neighbors.data_handle(), + n_queries * k, + raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + for (int64_t i = 0; i < n_queries * k; i++) { + ASSERT_LT(neighbors_host.data_handle()[i], n_graph) + << "Neighbor index " << neighbors_host.data_handle()[i] + << " is >= graph size " << n_graph << " (MULTI_CTA)"; + } + } + + private: + raft::resources res; +}; + +TEST_F(cagra_graph_smaller_than_dataset_test, search_with_smaller_graph) { this->run(); } + +} // namespace cuvs::neighbors::cagra From f428e54bf64ad1ba1f67ae01be6ce68742ac85ed Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Fri, 6 Feb 2026 08:55:34 -0800 Subject: [PATCH 044/119] minor pre-commit changes --- .../detail/cagra/search_multi_kernel.cuh | 2 +- .../bug_graph_smaller_than_dataset.cu | 45 ++++++++++--------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git 
a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index 0ee7439ac6..045c63fe59 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once diff --git a/cpp/tests/neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu b/cpp/tests/neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu index 8a29779bbd..b06c1cba92 100644 --- a/cpp/tests/neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu +++ b/cpp/tests/neighbors/ann_cagra/bug_graph_smaller_than_dataset.cu @@ -19,14 +19,14 @@ namespace cuvs::neighbors::cagra { /** * @brief Test verifying graph.extent(0) is used for random seed selection - * + * * This test ensures that CAGRA search kernels correctly use graph.extent(0) * (graph size) rather than dataset.size for random seed node selection. - * + * * The bug: random seed selection previously used dataset_desc.size, which * could cause OOB access if the graph size differed from dataset size * (e.g., in CAGRA-Q iterative builds with compression). - * + * * The fix: kernels now receive graph.extent(0) as graph_size parameter, * ensuring seeds are always within valid graph node range [0, graph_size). 
*/ @@ -47,12 +47,13 @@ class cagra_graph_smaller_than_dataset_test : public ::testing::Test { // Build index normally auto dataset = raft::make_device_matrix(res, n_dataset, n_dim); raft::random::RngState r(1234ULL); - raft::random::uniform(res, r, dataset.data_handle(), n_dataset * n_dim, data_type(-1), data_type(1)); + raft::random::uniform( + res, r, dataset.data_handle(), n_dataset * n_dim, data_type(-1), data_type(1)); cagra::index_params index_params; index_params.graph_degree = 32; index_params.intermediate_graph_degree = 64; - + auto index = cagra::build(res, index_params, raft::make_const_mdspan(dataset.view())); raft::resource::sync_stream(res); @@ -63,28 +64,29 @@ class cagra_graph_smaller_than_dataset_test : public ::testing::Test { // Recreate the bug scenario: LARGE dataset, SMALL graph // (like iterative_build_graph does in intermediate iterations) constexpr int64_t n_graph = n_dataset / 2; // Only 5000 nodes in graph - + // Step 1: Build index on SMALL subset (5000 points) auto small_dataset_view = raft::make_device_matrix_view( dataset.data_handle(), n_graph, n_dim); - + cagra::index_params small_index_params; small_index_params.graph_degree = 32; - auto small_index = cagra::build(res, small_index_params, small_dataset_view); + auto small_index = cagra::build(res, small_index_params, small_dataset_view); raft::resource::sync_stream(res); - + // Step 2: Update to FULL dataset (10000 points) but keep small graph (5000 nodes) // This creates the exact bug scenario: dataset.size=10000, graph.extent(0)=5000 small_index.update_dataset(res, raft::make_const_mdspan(dataset.view())); - + // Verify the mismatch - THIS IS THE BUG SCENARIO! - ASSERT_EQ(small_index.graph().extent(0), n_graph); // Graph has 5000 nodes - ASSERT_EQ(small_index.size(), n_dataset); // Dataset has 10000 points - ASSERT_NE(small_index.graph().extent(0), small_index.size()); // Mismatch! 
+ ASSERT_EQ(small_index.graph().extent(0), n_graph); // Graph has 5000 nodes + ASSERT_EQ(small_index.size(), n_dataset); // Dataset has 10000 points + ASSERT_NE(small_index.graph().extent(0), small_index.size()); // Mismatch! // Create queries auto queries = raft::make_device_matrix(res, n_queries, n_dim); - raft::random::uniform(res, r, queries.data_handle(), n_queries * n_dim, data_type(-1), data_type(1)); + raft::random::uniform( + res, r, queries.data_handle(), n_queries * n_dim, data_type(-1), data_type(1)); // Allocate output auto neighbors = raft::make_device_matrix(res, n_queries, k); @@ -92,10 +94,10 @@ class cagra_graph_smaller_than_dataset_test : public ::testing::Test { // Setup search params cagra::search_params search_params; - search_params.itopk_size = 64; - search_params.search_width = 1; + search_params.itopk_size = 64; + search_params.search_width = 1; search_params.max_iterations = 10; - search_params.algo = cagra::search_algo::SINGLE_CTA; + search_params.algo = cagra::search_algo::SINGLE_CTA; // THIS SHOULD NOT CRASH OR CAUSE OOB ACCESS // Before fix: random seeds use dataset.size (10000) -> tries to access graph[7000] -> CRASH! 
@@ -120,13 +122,12 @@ class cagra_graph_smaller_than_dataset_test : public ::testing::Test { // All neighbor indices should be valid (< n_graph) for (int64_t i = 0; i < n_queries * k; i++) { ASSERT_LT(neighbors_host.data_handle()[i], n_graph) - << "Neighbor index " << neighbors_host.data_handle()[i] - << " is >= graph size " << n_graph; + << "Neighbor index " << neighbors_host.data_handle()[i] << " is >= graph size " << n_graph; } // Test with MULTI_CTA algorithm as well (also had the same bug) search_params.algo = cagra::search_algo::MULTI_CTA; - + cagra::search(res, search_params, small_index, @@ -145,8 +146,8 @@ class cagra_graph_smaller_than_dataset_test : public ::testing::Test { for (int64_t i = 0; i < n_queries * k; i++) { ASSERT_LT(neighbors_host.data_handle()[i], n_graph) - << "Neighbor index " << neighbors_host.data_handle()[i] - << " is >= graph size " << n_graph << " (MULTI_CTA)"; + << "Neighbor index " << neighbors_host.data_handle()[i] << " is >= graph size " << n_graph + << " (MULTI_CTA)"; } } From 9b1dff803df5831b11edf6c8ff6fd8773439cde7 Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Tue, 13 Jan 2026 09:12:51 -0800 Subject: [PATCH 045/119] started updating the index building for itrative cagra q build --- cpp/.clangd | 65 ---- cpp/.clangd_headers/cuda_runtime.h | 52 +++ cpp/bench/ann/src/common/benchmark.hpp | 1 + .../neighbors/detail/cagra/cagra_build.cuh | 139 +++++--- .../neighbors/detail/cagra/cagra_search.cuh | 2 + .../detail/cagra/compute_distance.hpp | 2 + .../detail/cagra/search_multi_kernel.cuh | 2 + cpp/tests/neighbors/ann_cagra.cuh | 311 +++++++++--------- python/cuvs_bench/cuvs_bench/run/__main__.py | 1 + python/cuvs_bench/cuvs_bench/run/run.py | 2 + 10 files changed, 315 insertions(+), 262 deletions(-) delete mode 100644 cpp/.clangd create mode 100644 cpp/.clangd_headers/cuda_runtime.h diff --git a/cpp/.clangd b/cpp/.clangd deleted file mode 100644 index 7c4fe036dd..0000000000 --- a/cpp/.clangd +++ /dev/null @@ -1,65 +0,0 @@ -# 
https://clangd.llvm.org/config - -# Apply a config conditionally to all C files -If: - PathMatch: .*\.(c|h)$ - ---- - -# Apply a config conditionally to all C++ files -If: - PathMatch: .*\.(c|h)pp - ---- - -# Apply a config conditionally to all CUDA files -If: - PathMatch: .*\.cuh? -CompileFlags: - Add: - - "-x" - - "cuda" - # No error on unknown CUDA versions - - "-Wno-unknown-cuda-version" - # Allow variadic CUDA functions - - "-Xclang=-fcuda-allow-variadic-functions" -Diagnostics: - Suppress: - - "variadic_device_fn" - - "attributes_not_allowed" - ---- - -# Tweak the clangd parse settings for all files -CompileFlags: - Add: - # report all errors - - "-ferror-limit=0" - - "-fmacro-backtrace-limit=0" - - "-ftemplate-backtrace-limit=0" - # Skip the CUDA version check - - "--no-cuda-version-check" - Remove: - # remove gcc's -fcoroutines - - -fcoroutines - # remove nvc++ flags unknown to clang - - "-gpu=*" - - "-stdpar*" - # remove nvcc flags unknown to clang - - "-arch*" - - "-gencode*" - - "--generate-code*" - - "-ccbin*" - - "-t=*" - - "--threads*" - - "-Xptxas*" - - "-Xcudafe*" - - "-Xfatbin*" - - "-Xcompiler*" - - "--diag-suppress*" - - "--diag_suppress*" - - "--compiler-options*" - - "--expt-extended-lambda" - - "--expt-relaxed-constexpr" - - "-forward-unknown-to-host-compiler" - - "-Werror=cross-execution-space-call" diff --git a/cpp/.clangd_headers/cuda_runtime.h b/cpp/.clangd_headers/cuda_runtime.h new file mode 100644 index 0000000000..b3bdecf5a3 --- /dev/null +++ b/cpp/.clangd_headers/cuda_runtime.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include + +// Basic CUDA types needed for RAFT/cuVS analysis +enum cudaError_t { cudaSuccess = 0, cudaErrorMemoryAllocation = 2 }; +enum cudaMemcpyKind { cudaMemcpyHostToHost = 0, cudaMemcpyHostToDevice = 1, cudaMemcpyDeviceToHost = 2, cudaMemcpyDeviceToDevice = 3 }; + +// Global memory type enum (often used outside the struct) +// Moved out of struct to avoid type mismatch errors +enum cudaMemoryType { + 
cudaMemoryTypeHost = 1, + cudaMemoryTypeDevice = 2, + cudaMemoryTypeManaged = 3, + cudaMemoryTypeUnregistered = 4 +}; + +struct cudaPointerAttributes { + // Remove internal enum definition to avoid "different enumeration types" error + // Use the global enum type + enum cudaMemoryType type; + int device; + void* devicePointer; + void* hostPointer; + int isManaged; +}; + +typedef struct CUstream_st* cudaStream_t; +typedef struct cudaDeviceProp* cudaDeviceProp_t; // Incomplete type is usually enough for pointers + +// Stub functions (declarations only) +inline cudaError_t cudaPointerGetAttributes(cudaPointerAttributes* attributes, const void* ptr) { return cudaSuccess; } +inline cudaError_t cudaMalloc(void** devPtr, size_t size) { return cudaSuccess; } +inline cudaError_t cudaFree(void* devPtr) { return cudaSuccess; } +inline cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) { return cudaSuccess; } +inline cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0) { return cudaSuccess; } +inline cudaError_t cudaStreamSynchronize(cudaStream_t stream) { return cudaSuccess; } +inline cudaError_t cudaDeviceSynchronize() { return cudaSuccess; } + +// Error handling stubs +inline cudaError_t cudaGetLastError() { return cudaSuccess; } +inline cudaError_t cudaPeekAtLastError() { return cudaSuccess; } +inline const char* cudaGetErrorName(cudaError_t error) { return "cudaSuccess"; } +inline const char* cudaGetErrorString(cudaError_t error) { return "no error"; } + +// Defines that might be checked +#define __CUDACC__ 1 +#define __host__ +#define __device__ +#define __global__ +#define __forceinline__ inline diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 22859e9ab8..1d5239ec9b 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -647,6 +647,7 @@ inline auto run_main(int argc, char** argv) 
-> int char* conf_path = argv[--argc]; std::ifstream conf_stream(conf_path); + for (int i = 1; i < argc; i++) { if (parse_bool_flag(argv[i], "--force", force_overwrite) || parse_bool_flag(argv[i], "--build", build_mode) || diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 97d7bb1bac..77846f202b 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1972,6 +1972,62 @@ struct mmap_owner { size_t size_; }; +template +void search_and_optimize(raft::resources const& res, + const cuvs::neighbors::cagra::search_params& search_params, + const index& idx, + raft::device_matrix_view dev_query_view, + raft::device_matrix_view dev_neighbors, + raft::device_matrix_view dev_distances, + raft::host_matrix_view neighbors_view, + raft::host_matrix& cagra_graph, + size_t curr_query_size, + size_t next_graph_degree, + size_t curr_topk, + uint64_t max_chunk_size, + bool flag_last, + const index_params& params) +{ + // Search. + // Since there are many queries, divide them into batches and search them. 
+ cuvs::spatial::knn::detail::utils::batch_load_iterator query_batch( + dev_query_view.data_handle(), + curr_query_size, + dev_query_view.extent(1), + max_chunk_size, + raft::resource::get_cuda_stream(res), + raft::resource::get_workspace_resource(res)); + for (const auto& batch : query_batch) { + auto batch_dev_query_view = raft::make_device_matrix_view( + batch.data(), batch.size(), dev_query_view.extent(1)); + auto batch_dev_neighbors_view = raft::make_device_matrix_view( + dev_neighbors.data_handle(), batch.size(), curr_topk); + auto batch_dev_distances_view = raft::make_device_matrix_view( + dev_distances.data_handle(), batch.size(), curr_topk); + + cuvs::neighbors::cagra::search(res, + search_params, + idx, + batch_dev_query_view, + batch_dev_neighbors_view, + batch_dev_distances_view); + + auto batch_neighbors_view = raft::make_host_matrix_view( + neighbors_view.data_handle() + batch.offset() * curr_topk, batch.size(), curr_topk); + raft::copy(batch_neighbors_view.data_handle(), + batch_dev_neighbors_view.data_handle(), + batch_neighbors_view.size(), + raft::resource::get_cuda_stream(res)); + } + + // Optimize graph + auto next_graph_size = curr_query_size; + cagra_graph = raft::make_host_matrix(0, 0); // delete existing grahp + cagra_graph = raft::make_host_matrix(next_graph_size, next_graph_degree); + optimize( + res, neighbors_view, cagra_graph.view(), flag_last ? 
params.guarantee_connectivity : 0); +} + template > idx_opt; + if (params.compression.has_value()) { + auto start = std::chrono::high_resolution_clock::now(); + RAFT_EXPECTS(params.metric == cuvs::distance::DistanceType::L2Expanded, + "VPQ compression is only supported with L2Expanded distance mertric"); + idx_opt.emplace(res, params.metric); + //idx_opt->update_graph(res, raft::make_const_mdspan(cagra_graph.view())); + idx_opt->update_dataset( + res, + // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later + cuvs::neighbors::vpq_build( + res, *params.compression, dev_dataset)); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_ms = std::chrono::duration_cast(end - start).count(); + RAFT_LOG_INFO("# VPQ compression time: %.3lf sec", (double)elapsed_ms / 1000); + } while (true) { auto start = std::chrono::high_resolution_clock::now(); auto curr_query_size = std::min(2 * curr_graph_size, final_graph_size); @@ -2097,9 +2171,13 @@ auto iterative_build_graph( // search results (neighbors). auto dev_dataset_view = raft::make_device_matrix_view( dev_dataset.data_handle(), (int64_t)curr_graph_size, dev_dataset.extent(1)); - - auto idx = index( - res, params.metric, dev_dataset_view, raft::make_const_mdspan(cagra_graph.view())); + // No compression, create mdspan index + if (!params.compression.has_value()) { + idx_opt.emplace(res, params.metric, dev_dataset_view, raft::make_const_mdspan(cagra_graph.view())); + } else { + idx_opt->update_graph(res, raft::make_const_mdspan(cagra_graph.view())); + } + const auto& idx = *idx_opt; auto dev_query_view = raft::make_device_matrix_view( dev_dataset.data_handle(), (int64_t)curr_query_size, dev_dataset.extent(1)); @@ -2107,44 +2185,22 @@ auto iterative_build_graph( auto neighbors_view = raft::make_host_matrix_view(neighbors_ptr, curr_query_size, curr_topk); - // Search. - // Since there are many queries, divide them into batches and search them. 
- cuvs::spatial::knn::detail::utils::batch_load_iterator query_batch( - dev_query_view.data_handle(), - curr_query_size, - dev_query_view.extent(1), - max_chunk_size, - raft::resource::get_cuda_stream(res), - raft::resource::get_workspace_resource(res)); - for (const auto& batch : query_batch) { - auto batch_dev_query_view = raft::make_device_matrix_view( - batch.data(), batch.size(), dev_query_view.extent(1)); - auto batch_dev_neighbors_view = raft::make_device_matrix_view( - dev_neighbors.data_handle(), batch.size(), curr_topk); - auto batch_dev_distances_view = raft::make_device_matrix_view( - dev_distances.data_handle(), batch.size(), curr_topk); - - cuvs::neighbors::cagra::search(res, - search_params, - idx, - batch_dev_query_view, - batch_dev_neighbors_view, - batch_dev_distances_view); - - auto batch_neighbors_view = raft::make_host_matrix_view( - neighbors_view.data_handle() + batch.offset() * curr_topk, batch.size(), curr_topk); - raft::copy(batch_neighbors_view.data_handle(), - batch_dev_neighbors_view.data_handle(), - batch_neighbors_view.size(), - raft::resource::get_cuda_stream(res)); - } - - // Optimize graph - auto next_graph_size = curr_query_size; - cagra_graph = raft::make_host_matrix(0, 0); // delete existing grahp - cagra_graph = raft::make_host_matrix(next_graph_size, next_graph_degree); - optimize( - res, neighbors_view, cagra_graph.view(), flag_last ? 
params.guarantee_connectivity : 0); + + + search_and_optimize(res, + search_params, + idx, + dev_query_view, + dev_neighbors.view(), + dev_distances.view(), + neighbors_view, + cagra_graph, + curr_query_size, + next_graph_degree, + curr_topk, + max_chunk_size, + flag_last, + params); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_ms = std::chrono::duration_cast(end - start).count(); @@ -2152,6 +2208,7 @@ auto iterative_build_graph( if (flag_last) { break; } flag_last = (curr_graph_size == final_graph_size); + auto next_graph_size = curr_query_size; curr_graph_size = next_graph_size; } diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index 2d383a2429..a80afdca95 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -153,6 +153,7 @@ void search_main(raft::resources const& res, if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset != nullptr) { // Search using a plain (strided) row-major dataset + RAFT_LOG_INFO("Searching with strided dataset"); RAFT_EXPECTS(index.metric() != cuvs::distance::DistanceType::CosineExpanded || index.dataset_norms().has_value(), "Dataset norms must be provided for CosineExpanded metric"); @@ -179,6 +180,7 @@ void search_main(raft::resources const& res, RAFT_FAIL("FP32 VPQ dataset support is coming soon"); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); vpq_dset != nullptr) { + RAFT_LOG_INFO("Searching with VPQ dataset"); auto desc = dataset_descriptor_init_with_cache( res, params, *vpq_dset, index.metric(), nullptr); search_main_core( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index f9974fa3df..c314d4d0a3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -239,11 +239,13 @@ struct dataset_descriptor_host { template 
state(InitF init, size_t size) : ready{false}, value{std::make_tuple(init, size)} { + // RAFT_LOG_INFO("trying to create a descriptor state %p", reinterpret_cast(this)); } ~state() noexcept { if (std::holds_alternative(value)) { + // RAFT_LOG_INFO("trying to free descriptor state %p", reinterpret_cast(this)); auto& [ptr, stream] = std::get(value); RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(ptr, stream)); } diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index 045c63fe59..e3ef7bacc1 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -118,6 +118,8 @@ RAFT_KERNEL random_pickup_kernel( if (global_team_index >= num_pickup) { return; } extern __shared__ uint8_t smem[]; dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id); + // Set the resulting random index limit to the modulo wrap value if it is set + INDEX_T seed_index_limit = mod_wrap > 0 ? mod_wrap : dataset_desc->size; __syncthreads(); const INDEX_T seed_index_limit = graph_size > 0 ? graph_size : dataset_desc->size; diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh index beb379e44d..77dbcb683c 100644 --- a/cpp/tests/neighbors/ann_cagra.cuh +++ b/cpp/tests/neighbors/ann_cagra.cuh @@ -1455,75 +1455,74 @@ inline std::vector generate_inputs() inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); // Additional distances tested with a single search algo. 
- inputs2 = raft::util::itertools::product( - {1, 100}, - {1000}, - {8}, - {1, 16}, // k - {graph_build_algo::NN_DESCENT}, - {search_algo::SINGLE_CTA}, - {0}, // query size - {0}, - {256}, - {1}, - {cuvs::distance::DistanceType::InnerProduct, - cuvs::distance::DistanceType::BitwiseHamming, - cuvs::distance::DistanceType::CosineExpanded}, - {false}, - {true}, - {false}, - {0.995}, - {std::optional{std::nullopt}}, - {std::optional{std::nullopt}}, - {std::optional{std::nullopt}}, - {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL}); - inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); - - // Corner cases for small datasets - inputs2 = raft::util::itertools::product( - {2}, - {3, 6, 31, 32, 64, 101}, - {1, 10}, - {2}, // k - {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT}, - {search_algo::SINGLE_CTA, search_algo::MULTI_CTA, search_algo::MULTI_KERNEL}, - {0}, // query size - {0}, - {256}, - {1}, - {cuvs::distance::DistanceType::L2Expanded}, - {false}, - {true}, - {true}, - {0.995}, - {std::optional{std::nullopt}}, - {std::optional{std::nullopt}}, - {std::optional{std::nullopt}}, - {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL, - cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL}); - inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); + // inputs2 = raft::util::itertools::product( + // {1, 100}, + // {1000}, + // {8}, + // {1, 16}, // k + // {graph_build_algo::NN_DESCENT}, + // {search_algo::SINGLE_CTA}, + // {0}, // query size + // {0}, + // {256}, + // {1}, + // {cuvs::distance::DistanceType::InnerProduct, + // cuvs::distance::DistanceType::BitwiseHamming, + // cuvs::distance::DistanceType::CosineExpanded}, + // {false}, + // {true}, + // {false}, + // {0.995}, + // {std::optional{std::nullopt}}, + // {std::optional{std::nullopt}}, + // {std::optional{std::nullopt}}, + // {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL}); + // inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); + + // // Corner cases 
for small datasets + // inputs2 = raft::util::itertools::product( + // {2}, + // {3, 6, 31, 32, 64, 101}, + // {1, 10}, + // {2}, // k + // {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT}, + // {search_algo::SINGLE_CTA, search_algo::MULTI_CTA, search_algo::MULTI_KERNEL}, + // {0}, // query size + // {0}, + // {256}, + // {1}, + // {cuvs::distance::DistanceType::L2Expanded}, + // {false}, + // {true}, + // {true}, + // {0.995}, + // {std::optional{std::nullopt}}, + // {std::optional{std::nullopt}}, + // {std::optional{std::nullopt}}, + // {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL, + // cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL}); + // inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); // Varying dim and build algo. inputs2 = raft::util::itertools::product( {100}, - {1000}, - {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 1024}, // dim + {1000000}, + {768}, // dim {16}, // k - {graph_build_algo::IVF_PQ, - graph_build_algo::NN_DESCENT, + { + //graph_build_algo::IVF_PQ, + //graph_build_algo::NN_DESCENT, graph_build_algo::ITERATIVE_CAGRA_SEARCH}, {search_algo::AUTO}, {10}, {0}, {64}, {1}, - {cuvs::distance::DistanceType::L2Expanded, - cuvs::distance::DistanceType::InnerProduct, - cuvs::distance::DistanceType::BitwiseHamming}, + {cuvs::distance::DistanceType::L2Expanded}, {false}, {true}, {false}, - {0.995}, + {0.01}, {std::optional{std::nullopt}}, {std::optional{std::nullopt}}, {std::optional{std::nullopt}}, @@ -1532,107 +1531,107 @@ inline std::vector generate_inputs() inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); // Varying team_size, graph_build_algo - inputs2 = raft::util::itertools::product( - {100}, - {1000}, - {64}, - {16}, - {graph_build_algo::IVF_PQ, - graph_build_algo::NN_DESCENT, - graph_build_algo::ITERATIVE_CAGRA_SEARCH}, - {search_algo::AUTO}, - {10}, - {0}, // team_size - {64}, - {1}, - {cuvs::distance::DistanceType::L2Expanded, - cuvs::distance::DistanceType::InnerProduct, - 
cuvs::distance::DistanceType::BitwiseHamming, - cuvs::distance::DistanceType::CosineExpanded}, - {false}, - {false}, - {false}, - {0.995}, - {std::optional{std::nullopt}}, - {std::optional{std::nullopt}}, - {std::optional{std::nullopt}}, - {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL, - cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL}); - inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); - - // Vary team size only. - inputs2 = raft::util::itertools::product( - {100}, - {1000}, - {64}, - {16}, - {graph_build_algo::NN_DESCENT}, - {search_algo::AUTO}, - {10}, - {8, 16, 32}, // team_size - {64}, - {1}, - {cuvs::distance::DistanceType::L2Expanded, - cuvs::distance::DistanceType::InnerProduct, - cuvs::distance::DistanceType::BitwiseHamming, - cuvs::distance::DistanceType::CosineExpanded}, - {false}, - {false}, - {false}, - {0.995}, - {std::optional{std::nullopt}}, - {std::optional{std::nullopt}}, - {std::optional{std::nullopt}}, - {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL}); - inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); - - // Varying n_rows, host_dataset - inputs2 = raft::util::itertools::product( - {100}, - {10000}, - {32}, - {10}, - {graph_build_algo::AUTO}, - {search_algo::AUTO}, - {10}, - {0}, // team_size - {64}, - {1}, - {cuvs::distance::DistanceType::L2Expanded, cuvs::distance::DistanceType::InnerProduct}, - {false, true}, - {false}, - {true}, - {0.985}, - {std::optional{std::nullopt}}, - {std::optional{std::nullopt}}, - {std::optional{std::nullopt}}, - {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL, - cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL}); - inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); - - // A few PQ configurations. 
- // Varying dim, vq_n_centers - inputs2 = raft::util::itertools::product( - {100}, - {10000}, - {64, 128, 192, 256, 512, 1024}, // dim - {16}, // k - {graph_build_algo::IVF_PQ}, - {search_algo::AUTO}, - {10}, - {0}, - {64}, - {1}, - {cuvs::distance::DistanceType::L2Expanded}, - {false}, - {true}, - {false}, - {0.6}, - {std::optional{std::nullopt}}, - {std::optional{std::nullopt}}, - {std::optional{std::nullopt}}, - {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL, - cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL}); // don't demand high recall + // inputs2 = raft::util::itertools::product( + // {100}, + // {1000}, + // {64}, + // {16}, + // {graph_build_algo::IVF_PQ, + // graph_build_algo::NN_DESCENT, + // graph_build_algo::ITERATIVE_CAGRA_SEARCH}, + // {search_algo::AUTO}, + // {10}, + // {0}, // team_size + // {64}, + // {1}, + // {cuvs::distance::DistanceType::L2Expanded, + // cuvs::distance::DistanceType::InnerProduct, + // cuvs::distance::DistanceType::BitwiseHamming, + // cuvs::distance::DistanceType::CosineExpanded}, + // {false}, + // {false}, + // {false}, + // {0.995}, + // {std::optional{std::nullopt}}, + // {std::optional{std::nullopt}}, + // {std::optional{std::nullopt}}, + // {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL, + // cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL}); + // inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); + + // // Vary team size only. 
+ // inputs2 = raft::util::itertools::product( + // {100}, + // {1000}, + // {64}, + // {16}, + // {graph_build_algo::NN_DESCENT}, + // {search_algo::AUTO}, + // {10}, + // {8, 16, 32}, // team_size + // {64}, + // {1}, + // {cuvs::distance::DistanceType::L2Expanded, + // cuvs::distance::DistanceType::InnerProduct, + // cuvs::distance::DistanceType::BitwiseHamming, + // cuvs::distance::DistanceType::CosineExpanded}, + // {false}, + // {false}, + // {false}, + // {0.995}, + // {std::optional{std::nullopt}}, + // {std::optional{std::nullopt}}, + // {std::optional{std::nullopt}}, + // {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL}); + // inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); + + // // Varying n_rows, host_dataset + // inputs2 = raft::util::itertools::product( + // {100}, + // {10000}, + // {32}, + // {10}, + // {graph_build_algo::AUTO}, + // {search_algo::AUTO}, + // {10}, + // {0}, // team_size + // {64}, + // {1}, + // {cuvs::distance::DistanceType::L2Expanded, cuvs::distance::DistanceType::InnerProduct}, + // {false, true}, + // {false}, + // {true}, + // {0.985}, + // {std::optional{std::nullopt}}, + // {std::optional{std::nullopt}}, + // {std::optional{std::nullopt}}, + // {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL, + // cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL}); + // inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); + + // // A few PQ configurations. 
+ // // Varying dim, vq_n_centers + // inputs2 = raft::util::itertools::product( + // {100}, + // {10000}, + // {64, 128, 192, 256, 512, 1024}, // dim + // {16}, // k + // {graph_build_algo::IVF_PQ}, + // {search_algo::AUTO}, + // {10}, + // {0}, + // {64}, + // {1}, + // {cuvs::distance::DistanceType::L2Expanded}, + // {false}, + // {true}, + // {false}, + // {0.6}, + // {std::optional{std::nullopt}}, + // {std::optional{std::nullopt}}, + // {std::optional{std::nullopt}}, + // {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL, + // cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL}); // don't demand high recall // without refinement for (uint32_t pq_len : {2}) { // for now, only pq_len = 2 is supported, more options coming soon for (uint32_t vq_n_centers : {100, 1000}) { diff --git a/python/cuvs_bench/cuvs_bench/run/__main__.py b/python/cuvs_bench/cuvs_bench/run/__main__.py index ef1bf06d4e..5abd19d7c7 100644 --- a/python/cuvs_bench/cuvs_bench/run/__main__.py +++ b/python/cuvs_bench/cuvs_bench/run/__main__.py @@ -199,6 +199,7 @@ def main( """ + print("config ?") if not data_export: run_benchmark(**locals()) diff --git a/python/cuvs_bench/cuvs_bench/run/run.py b/python/cuvs_bench/cuvs_bench/run/run.py index 8830c89622..9597ff5245 100644 --- a/python/cuvs_bench/cuvs_bench/run/run.py +++ b/python/cuvs_bench/cuvs_bench/run/run.py @@ -637,6 +637,8 @@ def run_benchmark( conf_file = prepare_conf_file(dataset_conf, subset_size, count, batch_size) algos_conf_fs = gather_algorithm_configs(scripts_path, configuration) + + allowed_algos = algorithms.split(",") if algorithms else None allowed_groups = groups.split(",") if groups else None allowed_algo_groups = ( From 71a4a0901cde840baa14b4b72eb15e2606e1e003 Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Tue, 13 Jan 2026 09:15:59 -0800 Subject: [PATCH 046/119] removed temp files --- cpp/.clangd_headers/cuda_runtime.h | 52 ------------------------------ 1 file changed, 52 deletions(-) delete mode 100644 
cpp/.clangd_headers/cuda_runtime.h diff --git a/cpp/.clangd_headers/cuda_runtime.h b/cpp/.clangd_headers/cuda_runtime.h deleted file mode 100644 index b3bdecf5a3..0000000000 --- a/cpp/.clangd_headers/cuda_runtime.h +++ /dev/null @@ -1,52 +0,0 @@ -#pragma once - -#include -#include - -// Basic CUDA types needed for RAFT/cuVS analysis -enum cudaError_t { cudaSuccess = 0, cudaErrorMemoryAllocation = 2 }; -enum cudaMemcpyKind { cudaMemcpyHostToHost = 0, cudaMemcpyHostToDevice = 1, cudaMemcpyDeviceToHost = 2, cudaMemcpyDeviceToDevice = 3 }; - -// Global memory type enum (often used outside the struct) -// Moved out of struct to avoid type mismatch errors -enum cudaMemoryType { - cudaMemoryTypeHost = 1, - cudaMemoryTypeDevice = 2, - cudaMemoryTypeManaged = 3, - cudaMemoryTypeUnregistered = 4 -}; - -struct cudaPointerAttributes { - // Remove internal enum definition to avoid "different enumeration types" error - // Use the global enum type - enum cudaMemoryType type; - int device; - void* devicePointer; - void* hostPointer; - int isManaged; -}; - -typedef struct CUstream_st* cudaStream_t; -typedef struct cudaDeviceProp* cudaDeviceProp_t; // Incomplete type is usually enough for pointers - -// Stub functions (declarations only) -inline cudaError_t cudaPointerGetAttributes(cudaPointerAttributes* attributes, const void* ptr) { return cudaSuccess; } -inline cudaError_t cudaMalloc(void** devPtr, size_t size) { return cudaSuccess; } -inline cudaError_t cudaFree(void* devPtr) { return cudaSuccess; } -inline cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) { return cudaSuccess; } -inline cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0) { return cudaSuccess; } -inline cudaError_t cudaStreamSynchronize(cudaStream_t stream) { return cudaSuccess; } -inline cudaError_t cudaDeviceSynchronize() { return cudaSuccess; } - -// Error handling stubs -inline cudaError_t cudaGetLastError() { 
return cudaSuccess; } -inline cudaError_t cudaPeekAtLastError() { return cudaSuccess; } -inline const char* cudaGetErrorName(cudaError_t error) { return "cudaSuccess"; } -inline const char* cudaGetErrorString(cudaError_t error) { return "no error"; } - -// Defines that might be checked -#define __CUDACC__ 1 -#define __host__ -#define __device__ -#define __global__ -#define __forceinline__ inline From ff8174d0e5647eff349a48f28f22ffe53ff72333 Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Wed, 14 Jan 2026 00:49:37 -0800 Subject: [PATCH 047/119] started implementing cagra q index computation outside of the loop --- cpp/.clangd_headers/cuda_runtime.h | 52 +++++++++++++++++++ .../detail/cagra/search_multi_kernel.cuh | 2 +- 2 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 cpp/.clangd_headers/cuda_runtime.h diff --git a/cpp/.clangd_headers/cuda_runtime.h b/cpp/.clangd_headers/cuda_runtime.h new file mode 100644 index 0000000000..b3bdecf5a3 --- /dev/null +++ b/cpp/.clangd_headers/cuda_runtime.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include + +// Basic CUDA types needed for RAFT/cuVS analysis +enum cudaError_t { cudaSuccess = 0, cudaErrorMemoryAllocation = 2 }; +enum cudaMemcpyKind { cudaMemcpyHostToHost = 0, cudaMemcpyHostToDevice = 1, cudaMemcpyDeviceToHost = 2, cudaMemcpyDeviceToDevice = 3 }; + +// Global memory type enum (often used outside the struct) +// Moved out of struct to avoid type mismatch errors +enum cudaMemoryType { + cudaMemoryTypeHost = 1, + cudaMemoryTypeDevice = 2, + cudaMemoryTypeManaged = 3, + cudaMemoryTypeUnregistered = 4 +}; + +struct cudaPointerAttributes { + // Remove internal enum definition to avoid "different enumeration types" error + // Use the global enum type + enum cudaMemoryType type; + int device; + void* devicePointer; + void* hostPointer; + int isManaged; +}; + +typedef struct CUstream_st* cudaStream_t; +typedef struct cudaDeviceProp* cudaDeviceProp_t; // Incomplete type is usually enough for pointers + 
+// Stub functions (declarations only) +inline cudaError_t cudaPointerGetAttributes(cudaPointerAttributes* attributes, const void* ptr) { return cudaSuccess; } +inline cudaError_t cudaMalloc(void** devPtr, size_t size) { return cudaSuccess; } +inline cudaError_t cudaFree(void* devPtr) { return cudaSuccess; } +inline cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) { return cudaSuccess; } +inline cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0) { return cudaSuccess; } +inline cudaError_t cudaStreamSynchronize(cudaStream_t stream) { return cudaSuccess; } +inline cudaError_t cudaDeviceSynchronize() { return cudaSuccess; } + +// Error handling stubs +inline cudaError_t cudaGetLastError() { return cudaSuccess; } +inline cudaError_t cudaPeekAtLastError() { return cudaSuccess; } +inline const char* cudaGetErrorName(cudaError_t error) { return "cudaSuccess"; } +inline const char* cudaGetErrorString(cudaError_t error) { return "no error"; } + +// Defines that might be checked +#define __CUDACC__ 1 +#define __host__ +#define __device__ +#define __global__ +#define __forceinline__ inline diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index e3ef7bacc1..bd5383d04a 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -119,7 +119,7 @@ RAFT_KERNEL random_pickup_kernel( extern __shared__ uint8_t smem[]; dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id); // Set the resulting random index limit to the modulo wrap value if it is set - INDEX_T seed_index_limit = mod_wrap > 0 ? mod_wrap : dataset_desc->size; + INDEX_T seed_index_limit = graph_size > 0 ? graph_size : dataset_desc->size; __syncthreads(); const INDEX_T seed_index_limit = graph_size > 0 ? 
graph_size : dataset_desc->size; From 74f5894a7eac8c0c2d56e0c3cdff6add2824585e Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Wed, 14 Jan 2026 00:53:51 -0800 Subject: [PATCH 048/119] removed stub file --- cpp/.clangd_headers/cuda_runtime.h | 52 ------------------------------ 1 file changed, 52 deletions(-) delete mode 100644 cpp/.clangd_headers/cuda_runtime.h diff --git a/cpp/.clangd_headers/cuda_runtime.h b/cpp/.clangd_headers/cuda_runtime.h deleted file mode 100644 index b3bdecf5a3..0000000000 --- a/cpp/.clangd_headers/cuda_runtime.h +++ /dev/null @@ -1,52 +0,0 @@ -#pragma once - -#include -#include - -// Basic CUDA types needed for RAFT/cuVS analysis -enum cudaError_t { cudaSuccess = 0, cudaErrorMemoryAllocation = 2 }; -enum cudaMemcpyKind { cudaMemcpyHostToHost = 0, cudaMemcpyHostToDevice = 1, cudaMemcpyDeviceToHost = 2, cudaMemcpyDeviceToDevice = 3 }; - -// Global memory type enum (often used outside the struct) -// Moved out of struct to avoid type mismatch errors -enum cudaMemoryType { - cudaMemoryTypeHost = 1, - cudaMemoryTypeDevice = 2, - cudaMemoryTypeManaged = 3, - cudaMemoryTypeUnregistered = 4 -}; - -struct cudaPointerAttributes { - // Remove internal enum definition to avoid "different enumeration types" error - // Use the global enum type - enum cudaMemoryType type; - int device; - void* devicePointer; - void* hostPointer; - int isManaged; -}; - -typedef struct CUstream_st* cudaStream_t; -typedef struct cudaDeviceProp* cudaDeviceProp_t; // Incomplete type is usually enough for pointers - -// Stub functions (declarations only) -inline cudaError_t cudaPointerGetAttributes(cudaPointerAttributes* attributes, const void* ptr) { return cudaSuccess; } -inline cudaError_t cudaMalloc(void** devPtr, size_t size) { return cudaSuccess; } -inline cudaError_t cudaFree(void* devPtr) { return cudaSuccess; } -inline cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) { return cudaSuccess; } -inline cudaError_t 
cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0) { return cudaSuccess; } -inline cudaError_t cudaStreamSynchronize(cudaStream_t stream) { return cudaSuccess; } -inline cudaError_t cudaDeviceSynchronize() { return cudaSuccess; } - -// Error handling stubs -inline cudaError_t cudaGetLastError() { return cudaSuccess; } -inline cudaError_t cudaPeekAtLastError() { return cudaSuccess; } -inline const char* cudaGetErrorName(cudaError_t error) { return "cudaSuccess"; } -inline const char* cudaGetErrorString(cudaError_t error) { return "no error"; } - -// Defines that might be checked -#define __CUDACC__ 1 -#define __host__ -#define __device__ -#define __global__ -#define __forceinline__ inline From 5d7c4f46a666f9dfdedbfa66b3bf3d06c76f48fb Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Wed, 14 Jan 2026 01:11:04 -0800 Subject: [PATCH 049/119] updated gitignore --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3627558ff5..317c28d997 100644 --- a/.gitignore +++ b/.gitignore @@ -72,7 +72,9 @@ docs/source/_static/rust # clang tooling compile_commands.json -.clangd/ + + + # serialized ann indexes brute_force_index From c6dd661e0c45db723e1717cdd6ec5c8f938dceb2 Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Wed, 14 Jan 2026 08:27:32 -0800 Subject: [PATCH 050/119] cagra q index now is only calculated once in the iterative build --- cpp/include/cuvs/neighbors/cagra.hpp | 7 +++++++ cpp/src/neighbors/detail/cagra/cagra_build.cuh | 6 +++++- cpp/src/neighbors/detail/cagra/cagra_search.cuh | 8 ++++++-- cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh | 4 ++-- .../detail/cagra/search_single_cta_kernel-inl.cuh | 3 ++- 5 files changed, 22 insertions(+), 6 deletions(-) diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index 9b9e8eb0e6..ef55507869 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ 
b/cpp/include/cuvs/neighbors/cagra.hpp @@ -247,6 +247,13 @@ struct search_params : cuvs::neighbors::search_params { /** Bit mask used for initial random seed node selection. */ uint64_t rand_xor_mask = 0x128394; + /** + * Maximum node ID for random seed selection. + * When > 0, random seeds are constrained to [0, max_node_id) instead of [0, dataset_size). + * This is useful when the graph is smaller than the dataset (e.g., iterative build with compression). + * Default 0 means no constraint (use dataset_size). + */ + uint32_t max_node_id = 0; /** Whether to use the persistent version of the kernel (only SINGLE_CTA is supported a.t.m.) */ bool persistent = false; /** Persistent kernel: time in seconds before the kernel stops if no requests received. */ diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 77846f202b..70a29fe379 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1990,6 +1990,7 @@ void search_and_optimize(raft::resources const& res, { // Search. // Since there are many queries, divide them into batches and search them. 
+ RAFT_LOG_DEBUG("search_and_optimize: search_params.max_node_id=%u", search_params.max_node_id); cuvs::spatial::knn::detail::utils::batch_load_iterator query_batch( dev_query_view.data_handle(), curr_query_size, @@ -2185,7 +2186,10 @@ auto iterative_build_graph( auto neighbors_view = raft::make_host_matrix_view(neighbors_ptr, curr_query_size, curr_topk); - + // Set max_node_id to constrain random seed selection to valid graph nodes + search_params.max_node_id = static_cast(curr_graph_size); + RAFT_LOG_DEBUG("iterative_build: Setting search_params.max_node_id=%u (curr_graph_size=%lu)", + search_params.max_node_id, curr_graph_size); search_and_optimize(res, search_params, diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index a80afdca95..0efb7cfd45 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -51,6 +51,8 @@ void search_main_core( raft::device_matrix_view distances, CagraSampleFilterT sample_filter = CagraSampleFilterT()) { + RAFT_LOG_DEBUG("search_main_core: max_node_id=%u, graph.extent(0)=%lu", + params.max_node_id, graph.extent(0)); static_assert(std::is_same_v, "Only uint32_t is supported as the graph element type (internal index type)"); RAFT_LOG_DEBUG("# dataset size = %lu, dim = %lu\n", @@ -72,11 +74,13 @@ void search_main_core( topk, queries.extent(1)); + RAFT_LOG_DEBUG("search_main_core: creating plan with max_node_id=%u", params.max_node_id); using CagraSampleFilterT_s = typename CagraSampleFilterT_Selector::type; std::unique_ptr< search_plan_impl> plan = factory::create( res, params, dataset_desc, queries.extent(1), graph.extent(0), graph.extent(1), topk); + RAFT_LOG_DEBUG("search_main_core: plan created, plan->max_node_id=%u", plan->max_node_id); plan->check(topk); @@ -153,7 +157,7 @@ void search_main(raft::resources const& res, if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset != nullptr) { // Search 
using a plain (strided) row-major dataset - RAFT_LOG_INFO("Searching with strided dataset"); + RAFT_LOG_DEBUG("Searching with strided dataset"); RAFT_EXPECTS(index.metric() != cuvs::distance::DistanceType::CosineExpanded || index.dataset_norms().has_value(), "Dataset norms must be provided for CosineExpanded metric"); @@ -180,7 +184,7 @@ void search_main(raft::resources const& res, RAFT_FAIL("FP32 VPQ dataset support is coming soon"); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); vpq_dset != nullptr) { - RAFT_LOG_INFO("Searching with VPQ dataset"); + RAFT_LOG_DEBUG("Searching with VPQ dataset"); auto desc = dataset_descriptor_init_with_cache( res, params, *vpq_dset, index.metric(), nullptr); search_main_core( diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index bd5383d04a..c8d94edb82 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -178,6 +178,7 @@ void random_pickup(const dataset_descriptor_host& data const auto num_teams_per_threadblock = block_size / dataset_desc.team_size; const dim3 grid_size((num_pickup + num_teams_per_threadblock - 1) / num_teams_per_threadblock, num_queries); + RAFT_LOG_DEBUG("max_node_id: %d", mod_wrap); random_pickup_kernel<<>>( dataset_desc.dev_ptr(cuda_stream), @@ -833,8 +834,7 @@ struct search result_buffer_allocation_size, hashmap.data(), hash_bitlen, - stream, - static_cast(this->dataset_size)); + stream); unsigned iter = 0; while (1) { diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 5d465c25b5..b0ea9113d7 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1323,7 +1323,8 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel_p( small_hash_bitlen, 
small_hash_reset_interval, query_id, - sample_filter); + sample_filter, + 0); // TODO: persistent kernel doesn't support max_node_id yet // make sure all writes are visible even for the host // (e.g. when result buffers are in pinned memory) From 83580cd7bc57226fed054fb6ab58790a41fdcd3d Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Mon, 9 Feb 2026 05:35:00 -0800 Subject: [PATCH 051/119] fixed rebasing artefact --- cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index c8d94edb82..c8bd9db324 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -118,12 +118,10 @@ RAFT_KERNEL random_pickup_kernel( if (global_team_index >= num_pickup) { return; } extern __shared__ uint8_t smem[]; dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id); - // Set the resulting random index limit to the modulo wrap value if it is set - INDEX_T seed_index_limit = graph_size > 0 ? graph_size : dataset_desc->size; __syncthreads(); + // Set the resulting random index limit to the modulo wrap value if it is set const INDEX_T seed_index_limit = graph_size > 0 ? 
graph_size : dataset_desc->size; - INDEX_T best_index_team_local; DISTANCE_T best_norm2_team_local = utils::get_max_value(); for (unsigned i = 0; i < num_distilation; i++) { From 13b1f77b8000053d0c8a2170b390f9707fc6fbc1 Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Mon, 16 Feb 2026 05:54:19 -0800 Subject: [PATCH 052/119] addressed comments regarding type cast --- cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh | 2 +- cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh index a2e9a43ff0..92331b0eb6 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -630,7 +630,7 @@ void select_and_run(const dataset_descriptor_host& dat ps.max_iterations, num_executed_iterations, sample_filter, - static_cast(graph.extent(0))); + static_cast(graph.extent(0))); } } // namespace multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 5d465c25b5..32bdcf07ad 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -2348,7 +2348,7 @@ control is returned in this thread (in persistent_runner_t constructor), so we'r small_hash_bitlen, small_hash_reset_interval, sample_filter, - static_cast(graph.extent(0))); + static_cast(graph.extent(0))); RAFT_CUDA_TRY(cudaPeekAtLastError()); } } From 609b0f3db6643c0a62dd89d59202ec4a43b339c5 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 16 Feb 2026 18:52:40 +0000 Subject: [PATCH 053/119] prune kernel smem --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) 
diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 41efa1686f..270e76838a 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -166,14 +166,20 @@ __global__ void kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, g uint64_t* const stats) { __shared__ uint32_t smem_num_detour[MAX_DEGREE]; + extern __shared__ unsigned char smem_buf[]; + IdxT* const smem_knn_iA_neighbors = reinterpret_cast(smem_buf); + uint64_t* const num_retain = stats; uint64_t* const num_full = stats + 1; const uint64_t iA = blockIdx.x + (batch_size * batch_id); if (iA >= graph_size) { return; } + + // Load this node's neighbor row into shared memory to reduce global reads for (uint32_t k = threadIdx.x; k < graph_degree; k += blockDim.x) { - smem_num_detour[k] = 0; - if (knn_graph[k + ((uint64_t)graph_degree * iA)] == iA) { + smem_num_detour[k] = 0; + smem_knn_iA_neighbors[k] = knn_graph[k + ((uint64_t)graph_degree * iA)]; + if (smem_knn_iA_neighbors[k] == iA) { // Lower the priority of self-edge smem_num_detour[k] = graph_degree; } @@ -182,14 +188,14 @@ __global__ void kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, g // count number of detours (A->D->B) for (uint32_t kAD = 0; kAD < graph_degree - 1; kAD++) { - const uint64_t iD = knn_graph[kAD + (graph_degree * iA)]; + const uint64_t iD = smem_knn_iA_neighbors[kAD]; if (iD >= graph_size) { continue; } for (uint32_t kDB = threadIdx.x; kDB < graph_degree; kDB += blockDim.x) { const uint64_t iB_candidate = knn_graph[kDB + ((uint64_t)graph_degree * iD)]; for (uint32_t kAB = kAD + 1; kAB < graph_degree; kAB++) { // if ( kDB < kAB ) { - const uint64_t iB = knn_graph[kAB + (graph_degree * iA)]; + const uint64_t iB = smem_knn_iA_neighbors[kAB]; if (iB == iB_candidate) { atomicAdd(smem_num_detour + kAB, 1); break; @@ -1298,9 +1304,10 @@ void optimize( RAFT_CUDA_TRY(cudaMemsetAsync( dev_stats.data_handle(), 0, 
sizeof(uint64_t) * 2, raft::resource::get_cuda_stream(res))); + const size_t prune_smem_size = knn_graph_degree * sizeof(IdxT); for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { kern_prune - <<>>( + <<>>( d_input_graph.data_handle(), graph_size, knn_graph_degree, From a320e0e90453527e156da007bb96dc00de3898c0 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 18 Feb 2026 16:26:54 +0000 Subject: [PATCH 054/119] reduce copies within reverse graph compute --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 61 ++++++++++++------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 270e76838a..f3b0f0778e 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -244,6 +244,29 @@ __global__ void kern_make_rev_graph(const IdxT* const dest_nodes, // [graph_ } } +// Build reverse graph from column k of output_graph (avoids per-column host fill and H2D copy). 
+template +__global__ void kern_make_rev_graph_column(const IdxT* const output_graph, // [graph_size, degree] + IdxT* const rev_graph, + uint32_t* const rev_graph_count, + const uint32_t graph_size, + const uint32_t degree, + const uint32_t k) +{ + const uint64_t tid = threadIdx.x + (blockDim.x * blockIdx.x); + const uint64_t tnum = blockDim.x * gridDim.x; + + for (uint64_t src_id = tid; src_id < graph_size; src_id += tnum) { + const IdxT dest_id = output_graph[k + (static_cast(degree) * src_id)]; + if (dest_id >= graph_size) continue; + + const uint32_t pos = atomicAdd(rev_graph_count + dest_id, 1); + if (pos < degree) { + rev_graph[(static_cast(degree) * dest_id) + pos] = static_cast(src_id); + } + } +} + template __device__ __host__ LabelT get_root_label(IdxT i, const LabelT* label) { @@ -1444,32 +1467,26 @@ void optimize( graph_size * sizeof(uint32_t), raft::resource::get_cuda_stream(res))); - auto dest_nodes = raft::make_host_vector(graph_size); - auto d_dest_nodes = - raft::make_device_mdarray(res, large_tmp_mr, raft::make_extents(graph_size)); - - for (uint64_t k = 0; k < output_graph_degree; k++) { -#pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - // dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)]; - dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)]; - } - raft::resource::sync_stream(res); - - raft::copy(d_dest_nodes.data_handle(), - dest_nodes.data_handle(), - graph_size, - raft::resource::get_cuda_stream(res)); + // Copy full output graph to device once; kernel indexes by column k (no per-column H2D copy). + // TODO: depending on available device memory, this may need to be split into multiple copies. 
+ auto d_output_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); + raft::copy(d_output_graph.data_handle(), + output_graph_ptr, + static_cast(graph_size) * output_graph_degree, + raft::resource::get_cuda_stream(res)); - dim3 threads(256, 1, 1); - dim3 blocks(1024, 1, 1); - kern_make_rev_graph<<>>( - d_dest_nodes.data_handle(), + dim3 threads(256, 1, 1); + dim3 blocks(1024, 1, 1); + for (uint32_t k = 0; k < output_graph_degree; k++) { + kern_make_rev_graph_column<<>>( + d_output_graph.data_handle(), d_rev_graph.data_handle(), d_rev_graph_count.data_handle(), graph_size, - output_graph_degree); - RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u \r", k, output_graph_degree); + output_graph_degree, + k); + RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %u / %u \r", k, output_graph_degree); } raft::resource::sync_stream(res); From 6d1a6187f2cc138c4941608d4d9c746a11e0d774 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 19 Feb 2026 23:07:23 +0000 Subject: [PATCH 055/119] optimize() draft move more compute to GPU --- .../neighbors/detail/cagra/cagra_build.cuh | 2 + cpp/src/neighbors/detail/cagra/graph_core.cuh | 864 ++++++++++++------ 2 files changed, 590 insertions(+), 276 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 97d7bb1bac..152b603286 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -822,6 +822,8 @@ inline std::pair optimize_workspace_size(size_t n_rows, size_t index_size, bool mst_optimize = false) { + // TODO: MODIFY!! 
+ // MST optimization memory (host only) size_t mst_host = n_rows * index_size; // mst_graph_num_edges if (mst_optimize) { diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index f3b0f0778e..f2cd79ecb6 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -161,8 +161,8 @@ __global__ void kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, g const uint32_t degree, const uint32_t batch_size, const uint32_t batch_id, - uint8_t* const detour_count, // [graph_chunk_size, graph_degree] - uint32_t* const num_no_detour_edges, // [graph_size] + uint8_t* const detour_count, // [batch_size, graph_degree] + uint32_t* const num_no_detour_edges, // [batch_size] uint64_t* const stats) { __shared__ uint32_t smem_num_detour[MAX_DEGREE]; @@ -172,7 +172,9 @@ __global__ void kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, g uint64_t* const num_retain = stats; uint64_t* const num_full = stats + 1; - const uint64_t iA = blockIdx.x + (batch_size * batch_id); + const uint64_t iA = blockIdx.x + (batch_size * batch_id); + const uint64_t iA_batch = iA % static_cast(batch_size); + if (iA >= graph_size) { return; } // Load this node's neighbor row into shared memory to reduce global reads @@ -208,7 +210,7 @@ __global__ void kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, g uint32_t num_edges_no_detour = 0; for (uint32_t k = threadIdx.x; k < graph_degree; k += blockDim.x) { - detour_count[k + (graph_degree * iA)] = min(smem_num_detour[k], (uint32_t)255); + detour_count[k + (graph_degree * iA_batch)] = min(smem_num_detour[k], (uint32_t)255); if (smem_num_detour[k] == 0) { num_edges_no_detour++; } } num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 1); @@ -219,7 +221,7 @@ __global__ void kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, g num_edges_no_detour = min(num_edges_no_detour, degree); if (threadIdx.x 
== 0) { - num_no_detour_edges[iA] = num_edges_no_detour; + num_no_detour_edges[iA_batch] = num_edges_no_detour; atomicAdd((unsigned long long int*)num_retain, (unsigned long long int)num_edges_no_detour); if (num_edges_no_detour >= degree) { atomicAdd((unsigned long long int*)num_full, 1); } } @@ -244,26 +246,179 @@ __global__ void kern_make_rev_graph(const IdxT* const dest_nodes, // [graph_ } } -// Build reverse graph from column k of output_graph (avoids per-column host fill and H2D copy). +// Select output_graph_degree neighbors with smallest detour count per node (writes to device). template -__global__ void kern_make_rev_graph_column(const IdxT* const output_graph, // [graph_size, degree] - IdxT* const rev_graph, - uint32_t* const rev_graph_count, - const uint32_t graph_size, - const uint32_t degree, - const uint32_t k) +__global__ void kern_select_smallest_detour_neighbors( + const IdxT* const knn_graph, + uint64_t graph_size, + uint64_t knn_graph_degree, + uint64_t output_graph_degree, + const uint8_t* const d_detour_count, // [batch_size, graph_degree] + IdxT* output_graph_ptr, // [batch_size, output_graph_degree] + const uint32_t batch_size, + const uint32_t batch_id) { - const uint64_t tid = threadIdx.x + (blockDim.x * blockIdx.x); - const uint64_t tnum = blockDim.x * gridDim.x; + // FIXME: this does not really work for num_warps > 1 + constexpr unsigned warp_mask = 0xffffffff; + const uint32_t num_warps = blockDim.x / raft::WarpSize; + extern __shared__ unsigned char smem_buf[]; + uint32_t* smem_indices = reinterpret_cast(smem_buf); + uint16_t* smem_detour_count = + reinterpret_cast(&smem_indices[knn_graph_degree * num_warps]); - for (uint64_t src_id = tid; src_id < graph_size; src_id += tnum) { - const IdxT dest_id = output_graph[k + (static_cast(degree) * src_id)]; - if (dest_id >= graph_size) continue; + const uint32_t wid = threadIdx.x / raft::WarpSize; + const uint32_t lane_id = threadIdx.x % raft::WarpSize; + const uint64_t nid = 
static_cast(blockIdx.x) * num_warps + + (static_cast(batch_size) * batch_id * num_warps) + wid; - const uint32_t pos = atomicAdd(rev_graph_count + dest_id, 1); - if (pos < degree) { - rev_graph[(static_cast(degree) * dest_id) + pos] = static_cast(src_id); + const uint64_t nid_batch = nid % static_cast(batch_size); + + if (nid >= graph_size) return; + + for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) { + smem_detour_count[(knn_graph_degree * wid) + k] = + d_detour_count[nid_batch * knn_graph_degree + k]; + smem_indices[(knn_graph_degree * wid) + k] = k; + } + __syncwarp(warp_mask); + + for (uint32_t i = 0; i < output_graph_degree; i++) { + uint32_t local_min = 256; + uint32_t local_idx = 0xffffffff; + for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) { + uint32_t c = smem_detour_count[(knn_graph_degree * wid) + k]; + if (c < local_min) { + local_min = c; + local_idx = smem_indices[(knn_graph_degree * wid) + k]; + } + } + uint32_t local_min_with_tag = (local_min << 16) | local_idx; + for (int offset = raft::WarpSize / 2; offset > 0; offset /= 2) { + uint32_t other = __shfl_down_sync(warp_mask, local_min_with_tag, offset); + local_min_with_tag = (local_min_with_tag <= other) ? 
local_min_with_tag : other; + } + uint32_t warp_min_tag = __shfl_sync(warp_mask, local_min_with_tag, 0); + uint32_t warp_local_idx = warp_min_tag & 0xffff; + + if (local_idx == warp_local_idx) { + output_graph_ptr[nid_batch * output_graph_degree + i] = + knn_graph[knn_graph_degree * nid + warp_local_idx]; + smem_detour_count[knn_graph_degree * wid + warp_local_idx] = 255; + } + __syncwarp(warp_mask); + } +} + +// Helper functions for merging the graph +template +__device__ unsigned int warp_pos_in_array(T val, const T* array, uint64_t num) +{ + unsigned int ret = num; + const uint32_t lane_id = threadIdx.x % 32; + for (uint64_t i = lane_id; i < num; i += 32) { + if (val == array[i]) { + ret = i; + break; + } + } + ret = __reduce_min_sync(0xffffffff, ret); + return ret; +} + +template +__device__ void thread_shift_array(T* array, uint64_t num) +{ + for (uint64_t i = num; i > 0; i--) { + array[i] = array[i - 1]; + } +} + +template +__global__ void kern_merge_graph(IdxT* output_graph, + const IdxT* const rev_graph, + uint32_t* const rev_graph_count, // [graph_size] + const uint32_t graph_size, + const uint32_t output_graph_degree, + const IdxT* const mst_graph, + const uint32_t mst_graph_degree, + const uint32_t* const mst_graph_num_edges_ptr, + const uint32_t batch_size, + const uint32_t batch_id, + bool guarantee_connectivity, + bool* check_num_protected_edges) +{ + extern __shared__ unsigned char smem_buf[]; + IdxT* smem_sorted_output_graph = reinterpret_cast(smem_buf); + + const uint32_t wid = threadIdx.x / 32; + const uint32_t lane_id = threadIdx.x % 32; + const uint32_t num_warps = blockDim.x / 32; + const uint64_t nid = blockIdx.x * num_warps + (batch_size * batch_id * num_warps) + wid; + if (nid >= graph_size) { return; } + + if (lane_id == 0) check_num_protected_edges[0] = true; + + const auto mst_graph_num_edges = mst_graph_num_edges_ptr[nid]; + // If guarantee_connectivity == true, use a temporal list to merge the + // neighbor lists of the graphs. 
+ if (guarantee_connectivity) { + for (uint32_t i = lane_id; i < mst_graph_degree; i += 32) { + smem_sorted_output_graph[i] = mst_graph[nid * mst_graph_degree + i]; } + __syncwarp(); + for (uint32_t pruned_j = 0, output_j = mst_graph_num_edges; + (pruned_j < output_graph_degree) && (output_j < output_graph_degree); + pruned_j++) { + const auto v = output_graph[output_graph_degree * nid + pruned_j]; + unsigned int dup = 0; + for (uint32_t m = lane_id; m < output_j; m += 32) { + if (v == smem_sorted_output_graph[m]) { + dup = 1; + break; + } + } + + unsigned int warp_dup = __ballot_sync(0xffffffff, dup); + if (warp_dup == 0) { + if (lane_id == 0) smem_sorted_output_graph[output_j] = v; + output_j++; + } + __syncwarp(); + } + } + + else { + for (uint32_t i = lane_id; i < output_graph_degree; i += 32) { + smem_sorted_output_graph[i] = output_graph[output_graph_degree * nid + i]; + } + __syncwarp(); + } + + const auto num_protected_edges = max(mst_graph_num_edges, output_graph_degree / 2); + + if (num_protected_edges > output_graph_degree) { check_num_protected_edges[0] = false; } + if (num_protected_edges == output_graph_degree) { return; } + + auto kr = min(rev_graph_count[nid], output_graph_degree); + + while (kr) { + kr -= 1; + if (rev_graph[kr + (output_graph_degree * nid)] < graph_size) { + uint64_t pos = warp_pos_in_array( + rev_graph[kr + (output_graph_degree * nid)], smem_sorted_output_graph, output_graph_degree); + if (pos < num_protected_edges) { continue; } + uint64_t num_shift = pos - num_protected_edges; + if (pos >= output_graph_degree) { num_shift = output_graph_degree - num_protected_edges - 1; } + if (lane_id == 0) { + thread_shift_array(smem_sorted_output_graph + num_protected_edges, num_shift); + smem_sorted_output_graph[num_protected_edges] = rev_graph[kr + (output_graph_degree * nid)]; + } + __syncwarp(); + } + } + + for (uint32_t i = lane_id; i < output_graph_degree; i += 32) { + output_graph[(output_graph_degree * nid) + i] = 
smem_sorted_output_graph[i]; } } @@ -737,11 +892,11 @@ void mst_opt_update_graph(IdxT* mst_graph_ptr, // an approximate MST. // * If the input kNN graph is disconnected, random connection is added to the largest cluster. // -template +template void mst_optimization(raft::resources const& res, - raft::host_matrix_view input_graph, - raft::host_matrix_view output_graph, - raft::host_vector_view mst_graph_num_edges, + InputMatrixView input_graph, + OutputMatrixView output_graph, + VectorView mst_graph_num_edges, bool use_gpu = true) { if (use_gpu) { @@ -1185,6 +1340,7 @@ void count_2hop_detours(raft::host_matrix_view k } } +// TODO allow pinned input for both knn_graph and new_graph template , raft::memory_type::host>> @@ -1213,9 +1369,10 @@ void optimize( "cagra::graph::optimize(%zu, %zu, %u)", graph_size, knn_graph_degree, output_graph_degree); // MST optimization - auto mst_graph = raft::make_host_matrix(0, 0); - auto mst_graph_num_edges = raft::make_host_vector(graph_size); + auto mst_graph = raft::make_pinned_matrix(res, 0, 0); + auto mst_graph_num_edges = raft::make_pinned_vector(res, graph_size); auto mst_graph_num_edges_ptr = mst_graph_num_edges.data_handle(); + #pragma omp parallel for for (uint64_t i = 0; i < graph_size; i++) { mst_graph_num_edges_ptr[i] = 0; @@ -1223,10 +1380,10 @@ void optimize( if (guarantee_connectivity) { raft::common::nvtx::range block_scope( "cagra::graph::optimize/check_connectivity"); - mst_graph = - raft::make_host_matrix(graph_size, output_graph_degree); + mst_graph = raft::make_pinned_matrix( + res, graph_size, output_graph_degree); RAFT_LOG_INFO("MST optimization is used to guarantee graph connectivity."); - mst_optimization(res, knn_graph, mst_graph.view(), mst_graph_num_edges.view(), use_gpu); + mst_optimization(res, knn_graph, mst_graph.view(), mst_graph_num_edges.view(), use_gpu); for (uint64_t i = 0; i < graph_size; i++) { if (i < 8 || i >= graph_size - 8) { @@ -1235,6 +1392,37 @@ void optimize( } } + uint32_t batch_size = 
+ std::min(static_cast(graph_size), static_cast(256 * 1024)); + const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size; + + // + // If the available device memory is insufficient, do not use the GPU to count + // the number of 2-hop detours, but use the CPU. + // + // TODO: we should decide on a global strategy for this in a single place + // it comes down to input memory type and available memory which data should be copied to GPU + bool _use_gpu_prune = use_gpu; + if (_use_gpu_prune) { + try { + auto d_detour_count = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(batch_size, knn_graph_degree)); + auto d_num_no_detour_edges = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(batch_size)); + auto d_output_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(batch_size, output_graph_degree)); + // TODO we also want to consider pinned memory in case we are short on memory + auto d_input_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(graph_size, knn_graph_degree)); + } catch (std::bad_alloc& e) { + RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU"); + _use_gpu_prune = false; + } catch (raft::logic_error& e) { + RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU (logic error)"); + _use_gpu_prune = false; + } + } + { raft::common::nvtx::range block_scope( "cagra::graph::optimize/prune"); @@ -1253,63 +1441,10 @@ void optimize( // specified number of edges are picked up for each node, starting with the // edge with the lowest number of 2-hop detours. // - auto detour_count = raft::make_host_matrix(graph_size, knn_graph_degree); - - // - // If the available device memory is insufficient, do not use the GPU to count - // the number of 2-hop detours, but use the CPU. 
- // - bool _use_gpu = use_gpu; - if (_use_gpu) { - try { - auto d_detour_count = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size, knn_graph_degree)); - auto d_num_no_detour_edges = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size)); - auto d_input_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size, knn_graph_degree)); - } catch (std::bad_alloc& e) { - RAFT_LOG_DEBUG("Insufficient memory for 2-hop node counting on GPU"); - _use_gpu = false; - } catch (raft::logic_error& e) { - RAFT_LOG_DEBUG("Insufficient memory for 2-hop node counting on GPU (logic error)"); - _use_gpu = false; - } - } - if (_use_gpu) { - // Count 2-hop detours on GPU - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/prune/2-hop-counting-by-GPU"); - const double time_2hop_count_start = cur_time(); - - uint64_t num_keep __attribute__((unused)) = 0; - uint64_t num_full __attribute__((unused)) = 0; - auto d_detour_count = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size, knn_graph_degree)); - - RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(), - 0xff, - graph_size * knn_graph_degree * sizeof(uint8_t), - raft::resource::get_cuda_stream(res))); - - auto d_num_no_detour_edges = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size)); - RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(), - 0x00, - graph_size * sizeof(uint32_t), - raft::resource::get_cuda_stream(res))); - - auto dev_stats = raft::make_device_vector(res, 2); - auto host_stats = raft::make_host_vector(2); - + if (_use_gpu_prune) { + // Pruning on GPU RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r"); - // Copy knn_graph over to device if necessary - device_matrix_view_from_host d_input_graph( - res, - raft::make_host_matrix_view( - knn_graph.data_handle(), graph_size, knn_graph_degree)); - constexpr int MAX_DEGREE = 1024; if (knn_graph_degree > MAX_DEGREE) 
{ RAFT_FAIL( @@ -1318,17 +1453,47 @@ void optimize( knn_graph_degree, MAX_DEGREE); } - const uint32_t batch_size = - std::min(static_cast(graph_size), static_cast(256 * 1024)); - const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size; - const dim3 threads_prune(32, 1, 1); - const dim3 blocks_prune(batch_size, 1, 1); + const double prune_start = cur_time(); + + uint64_t num_keep __attribute__((unused)) = 0; + uint64_t num_full __attribute__((unused)) = 0; + auto dev_stats = raft::make_device_vector(res, 2); + auto host_stats = raft::make_host_vector(2); RAFT_CUDA_TRY(cudaMemsetAsync( dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, raft::resource::get_cuda_stream(res))); - const size_t prune_smem_size = knn_graph_degree * sizeof(IdxT); + // Copy knn_graph over to device if necessary + // TODO: should we use pinned memory if we have issues fitting on GPU? + device_matrix_view_from_host d_input_graph( + res, + raft::make_host_matrix_view( + knn_graph.data_handle(), graph_size, knn_graph_degree)); + + // data structures per batch + auto d_detour_count = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(batch_size, knn_graph_degree)); + auto d_num_no_detour_edges = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(batch_size)); + auto d_output_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(batch_size, output_graph_degree)); + for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { + // initialize the detour_count and num_no_detour_edges for the current batch + RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(), + 0xff, + batch_size * knn_graph_degree * sizeof(uint8_t), + raft::resource::get_cuda_stream(res))); + + RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(), + 0x00, + batch_size * sizeof(uint32_t), + raft::resource::get_cuda_stream(res))); + + // count 2-hop detours for the current batch + const dim3 threads_prune(32, 1, 1); + const dim3 
blocks_prune(batch_size, 1, 1); + const size_t prune_smem_size = knn_graph_degree * sizeof(IdxT); kern_prune <<>>( d_input_graph.data_handle(), @@ -1340,6 +1505,30 @@ void optimize( d_detour_count.data_handle(), d_num_no_detour_edges.data_handle(), dev_stats.data_handle()); + + // select smallest-detour neighbors for the current batch + const size_t select_smem_size = + (knn_graph_degree * knn_graph_degree) * (sizeof(uint16_t) + sizeof(uint32_t)); + const dim3 threads_select(32, 1, 1); + const dim3 blocks_select(batch_size, 1, 1); + kern_select_smallest_detour_neighbors + <<>>(d_input_graph.data_handle(), + graph_size, + knn_graph_degree, + output_graph_degree, + d_detour_count.data_handle(), + d_output_graph.data_handle(), + batch_size, + i_batch); + + raft::copy(output_graph_ptr, + d_output_graph.data_handle() + i_batch * batch_size * output_graph_degree, + static_cast(batch_size) * output_graph_degree, + raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); RAFT_LOG_DEBUG( "# Pruning kNN Graph on GPUs (%.1lf %%)\r", @@ -1348,96 +1537,93 @@ void optimize( raft::resource::sync_stream(res); RAFT_LOG_DEBUG("\n"); - raft::copy(detour_count.data_handle(), - d_detour_count.data_handle(), - detour_count.size(), - raft::resource::get_cuda_stream(res)); - raft::copy( host_stats.data_handle(), dev_stats.data_handle(), 2, raft::resource::get_cuda_stream(res)); num_keep = host_stats.data_handle()[0]; num_full = host_stats.data_handle()[1]; - const double time_2hop_count_end = cur_time(); + const double prune_end = cur_time(); RAFT_LOG_DEBUG( - "# Time for 2-hop detour counting on GPU: %.1lf sec, " + "# Time for pruning on GPU: %.1lf sec, " "avg_no_detour_edges_per_node: %.2lf/%u, " "nodes_with_no_detour_at_all_edges: %.1lf%%", - time_2hop_count_end - time_2hop_count_start, + prune_end - prune_start, (double)num_keep / graph_size, output_graph_degree, (double)num_full / graph_size * 100); } else { - // Count 2-hop detours on CPU - 
raft::common::nvtx::range block_scope( - "cagra::graph::optimize/prune/2-hop-counting-by-CPU"); - const double time_2hop_count_start = cur_time(); + // Pruning on CPU + auto detour_count = raft::make_host_matrix(graph_size, knn_graph_degree); - count_2hop_detours(knn_graph, detour_count.view()); + { + raft::common::nvtx::range block_scope( + "cagra::graph::optimize/prune/2-hop-counting-by-CPU"); + const double time_2hop_count_start = cur_time(); - const double time_2hop_count_end = cur_time(); - RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec", - time_2hop_count_end - time_2hop_count_start); - } + count_2hop_detours(knn_graph, detour_count.view()); - // Create pruned kNN graph - bool invalid_neighbor_list = false; + const double time_2hop_count_end = cur_time(); + RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec", + time_2hop_count_end - time_2hop_count_start); + } + bool invalid_neighbor_list = false; #pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable - // count of the neighbors while increasing the target detourable count from zero. - uint64_t pk = 0; - uint32_t num_detour = 0; - for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) { - uint32_t next_num_detour = std::numeric_limits::max(); - for (uint64_t k = 0; k < knn_graph_degree; k++) { - const auto num_detour_k = detour_count(i, k); - // Find the detourable count to check in the next iteration - if (num_detour_k > num_detour) { - next_num_detour = std::min(static_cast(num_detour_k), next_num_detour); - } - - // Store the neighbor index if its detourable count is equal to `num_detour`. 
- if (num_detour_k != num_detour) { continue; } + for (uint64_t i = 0; i < graph_size; i++) { + // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable + // count of the neighbors while increasing the target detourable count from zero. + uint64_t pk = 0; + uint32_t num_detour = 0; + for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) { + uint32_t next_num_detour = std::numeric_limits::max(); + for (uint64_t k = 0; k < knn_graph_degree; k++) { + const auto num_detour_k = detour_count(i, k); + // Find the detourable count to check in the next iteration + if (num_detour_k > num_detour) { + next_num_detour = std::min(static_cast(num_detour_k), next_num_detour); + } - // Check duplication and append - const auto candidate_node = knn_graph(i, k); - bool dup = false; - for (uint32_t dk = 0; dk < pk; dk++) { - if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) { - dup = true; - break; + // Store the neighbor index if its detourable count is equal to `num_detour`. + if (num_detour_k != num_detour) { continue; } + + // Check duplication and append + const auto candidate_node = knn_graph(i, k); + bool dup = false; + for (uint32_t dk = 0; dk < pk; dk++) { + if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) { + dup = true; + break; + } } - } - if (!dup && candidate_node < graph_size) { - output_graph_ptr[i * output_graph_degree + pk] = candidate_node; - pk += 1; + if (!dup && candidate_node < graph_size) { + output_graph_ptr[i * output_graph_degree + pk] = candidate_node; + pk += 1; + } + if (pk >= output_graph_degree) break; } if (pk >= output_graph_degree) break; - } - if (pk >= output_graph_degree) break; - if (next_num_detour == std::numeric_limits::max()) { - // There are no valid edges enough in the initial kNN graph. Break the loop here and catch - // the error at the next validation (pk != output_graph_degree). 
- break; + if (next_num_detour == std::numeric_limits::max()) { + // There are no valid edges enough in the initial kNN graph. Break the loop here and + // catch the error at the next validation (pk != output_graph_degree). + break; + } + num_detour = next_num_detour; + } + if (pk != output_graph_degree) { + RAFT_LOG_DEBUG( + "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for " + "node %lu in the rank-based node reranking process", + output_graph_degree, + i); + invalid_neighbor_list = true; } - num_detour = next_num_detour; - } - if (pk != output_graph_degree) { - RAFT_LOG_DEBUG( - "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for " - "node %lu in the rank-based node reranking process", - output_graph_degree, - i); - invalid_neighbor_list = true; } + RAFT_EXPECTS( + !invalid_neighbor_list, + "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too " + "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many " + "overflows occur during the norm computation between the dataset vectors."); } - RAFT_EXPECTS( - !invalid_neighbor_list, - "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too " - "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many " - "overflows occur during the norm computation between the dataset vectors."); const double time_prune_end = cur_time(); RAFT_LOG_DEBUG("# Pruning time: %.1lf ms", (time_prune_end - time_prune_start) * 1000.0); @@ -1446,155 +1632,281 @@ void optimize( auto rev_graph = raft::make_host_matrix(graph_size, output_graph_degree); auto rev_graph_count = raft::make_host_vector(graph_size); - { - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/reverse"); + bool _use_gpu_rev_graph = use_gpu; + // TODO: should we use pinned memory if we have issues fitting on GPU? 
+ if (_use_gpu_rev_graph) { + try { + auto d_rev_graph_count = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(graph_size)); + auto d_dest_nodes = + raft::make_device_mdarray(res, large_tmp_mr, raft::make_extents(graph_size)); + auto d_rev_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); + } catch (std::bad_alloc& e) { + RAFT_LOG_DEBUG("Insufficient memory for reverse graph on GPU"); + _use_gpu_rev_graph = false; + } catch (raft::logic_error& e) { + RAFT_LOG_DEBUG("Insufficient memory for reverse graph on GPU (logic error)"); + _use_gpu_rev_graph = false; + } + } + + const double time_make_start = cur_time(); + if (_use_gpu_rev_graph) { // - // Make reverse graph + // Make reverse graph on GPU // - const double time_make_start = cur_time(); + auto d_rev_graph_count = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(graph_size)); device_matrix_view_from_host d_rev_graph(res, rev_graph.view()); - RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph.data_handle(), - 0xff, - graph_size * output_graph_degree * sizeof(IdxT), - raft::resource::get_cuda_stream(res))); + device_matrix_view_from_host d_output_graph( + res, + raft::make_host_matrix_view( + output_graph_ptr, graph_size, output_graph_degree)); - auto d_rev_graph_count = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size)); - RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph_count.data_handle(), - 0x00, - graph_size * sizeof(uint32_t), - raft::resource::get_cuda_stream(res))); - - // Copy full output graph to device once; kernel indexes by column k (no per-column H2D copy). - // TODO: depending on available device memory, this may need to be split into multiple copies. 
- auto d_output_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); - raft::copy(d_output_graph.data_handle(), - output_graph_ptr, - static_cast(graph_size) * output_graph_degree, - raft::resource::get_cuda_stream(res)); + { + raft::common::nvtx::range block_scope( + "cagra::graph::optimize/reverse"); + auto dest_nodes = raft::make_host_vector(graph_size); + auto d_dest_nodes = + raft::make_device_mdarray(res, large_tmp_mr, raft::make_extents(graph_size)); - dim3 threads(256, 1, 1); - dim3 blocks(1024, 1, 1); - for (uint32_t k = 0; k < output_graph_degree; k++) { - kern_make_rev_graph_column<<>>( - d_output_graph.data_handle(), - d_rev_graph.data_handle(), - d_rev_graph_count.data_handle(), - graph_size, - output_graph_degree, - k); - RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %u / %u \r", k, output_graph_degree); - } + RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph.data_handle(), + 0xff, + graph_size * output_graph_degree * sizeof(IdxT), + raft::resource::get_cuda_stream(res))); - raft::resource::sync_stream(res); - RAFT_LOG_DEBUG("\n"); + RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph_count.data_handle(), + 0x00, + graph_size * sizeof(uint32_t), + raft::resource::get_cuda_stream(res))); - if (d_rev_graph.allocated_memory()) { - raft::copy(rev_graph.data_handle(), - d_rev_graph.data_handle(), - graph_size * output_graph_degree, + for (uint64_t k = 0; k < output_graph_degree; k++) { +#pragma omp parallel for + for (uint64_t i = 0; i < graph_size; i++) { + // dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)]; + dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)]; + } + raft::resource::sync_stream(res); + + raft::copy(d_dest_nodes.data_handle(), + dest_nodes.data_handle(), + graph_size, + raft::resource::get_cuda_stream(res)); + + dim3 threads(256, 1, 1); + dim3 blocks(1024, 1, 1); + kern_make_rev_graph<<>>( + d_dest_nodes.data_handle(), + d_rev_graph.data_handle(), + 
d_rev_graph_count.data_handle(), + graph_size, + output_graph_degree); + RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u \r", k, output_graph_degree); + } + + raft::resource::sync_stream(res); + RAFT_LOG_DEBUG("\n"); + + if (d_rev_graph.allocated_memory()) { + raft::copy(rev_graph.data_handle(), + d_rev_graph.data_handle(), + graph_size * output_graph_degree, + raft::resource::get_cuda_stream(res)); + } + raft::copy(rev_graph_count.data_handle(), + d_rev_graph_count.data_handle(), + graph_size, raft::resource::get_cuda_stream(res)); + + const double time_make_end = cur_time(); + RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms", + (time_make_end - time_make_start) * 1000.0); } - raft::copy(rev_graph_count.data_handle(), - d_rev_graph_count.data_handle(), - graph_size, - raft::resource::get_cuda_stream(res)); - const double time_make_end = cur_time(); - RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms", - (time_make_end - time_make_start) * 1000.0); - } + { + raft::common::nvtx::range block_scope( + "cagra::graph::optimize/combine"); - { - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/combine"); - // - // Create search graphs from MST and pruned and reverse graphs - // - const double time_replace_start = cur_time(); + // Merging the prunned graph and the reverse graph + const double merge_graph_start = cur_time(); + + // Create a boolean variable on the GPU using RAFT device allocator + auto d_check_num_protected_edges = raft::make_device_scalar(res, true); + + const dim3 threads_merge(32, 1, 1); + const dim3 blocks_merge(batch_size, 1, 1); + const size_t merge_smem_size = (output_graph_degree + output_graph_degree) * sizeof(IdxT); + for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { + kern_merge_graph + <<>>( + d_output_graph.data_handle(), + d_rev_graph.data_handle(), + d_rev_graph_count.data_handle(), + graph_size, + output_graph_degree, + mst_graph.data_handle(), + output_graph_degree, + mst_graph_num_edges_ptr, + 
batch_size, + i_batch, + guarantee_connectivity, + d_check_num_protected_edges.data_handle()); + } + + bool check_num_protected_edges = true; + raft::copy(&check_num_protected_edges, + d_check_num_protected_edges.data_handle(), + 1, + raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + // TODO: is this required? + if (d_output_graph.allocated_memory()) { + raft::copy(output_graph_ptr, + d_output_graph.data_handle(), + graph_size * output_graph_degree, + raft::resource::get_cuda_stream(res)); + } + + const auto merge_graph_end = cur_time(); + RAFT_EXPECTS(check_num_protected_edges, + "Failed to merge the MST, pruned, and reverse edge graphs. " + "Some nodes have too " + "many MST optimization edges."); + + RAFT_LOG_DEBUG("# Time for merging graphs: %.1lf ms", + (merge_graph_end - merge_graph_start) * 1000.0); + } + } else { + { + // Make reverse graph on CPU + raft::common::nvtx::range block_scope( + "cagra::graph::optimize/reverse"); + + auto rev_graph_ptr = rev_graph.data_handle(); + auto rev_graph_count_ptr = rev_graph_count.data_handle(); - bool check_num_protected_edges = true; #pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - auto my_rev_graph = rev_graph.data_handle() + (output_graph_degree * i); - auto my_out_graph = output_graph_ptr + (output_graph_degree * i); + for (uint64_t i = 0; i < graph_size; i++) { + rev_graph_count_ptr[i] = 0; + } - // If guarantee_connectivity == true, use a temporal list to merge the neighbor lists of the - // graphs. 
- std::vector temp_output_neighbor_list; - if (guarantee_connectivity) { - temp_output_neighbor_list.resize(output_graph_degree); - my_out_graph = temp_output_neighbor_list.data(); - const auto mst_graph_num_edges = mst_graph_num_edges_ptr[i]; - - // Set MST graph edges - for (uint32_t j = 0; j < mst_graph_num_edges; j++) { - my_out_graph[j] = mst_graph(i, j); + for (uint32_t k = 0; k < output_graph_degree; k++) { +#pragma omp parallel for + for (uint64_t src_id = 0; src_id < graph_size; src_id++) { + const IdxT dest_id = + output_graph_ptr[k + (static_cast(output_graph_degree) * src_id)]; + if (dest_id >= graph_size) continue; + uint32_t pos; +#pragma omp atomic capture + pos = rev_graph_count_ptr[dest_id]++; + if (pos < output_graph_degree) { + rev_graph_ptr[(static_cast(output_graph_degree) * dest_id) + pos] = + static_cast(src_id); + } } + } - // Set pruned graph edges - for (uint32_t pruned_j = 0, output_j = mst_graph_num_edges; - (pruned_j < output_graph_degree) && (output_j < output_graph_degree); - pruned_j++) { - const auto v = output_graph_ptr[output_graph_degree * i + pruned_j]; - - // duplication check - bool dup = false; - for (uint32_t m = 0; m < output_j; m++) { - if (v == my_out_graph[m]) { - dup = true; - break; - } + const double time_make_end = cur_time(); + RAFT_LOG_DEBUG("# Making reverse graph time (CPU): %.1lf ms", + (time_make_end - time_make_start) * 1000.0); + } + + { + raft::common::nvtx::range block_scope( + "cagra::graph::optimize/combine"); + // + // Create search graphs from MST and pruned and reverse graphs + // + const double time_replace_start = cur_time(); + + bool check_num_protected_edges = true; +#pragma omp parallel for + for (uint64_t i = 0; i < graph_size; i++) { + auto my_rev_graph = rev_graph.data_handle() + (output_graph_degree * i); + auto my_out_graph = output_graph_ptr + (output_graph_degree * i); + + // If guarantee_connectivity == true, use a temporal list to merge the neighbor lists of the + // graphs. 
+ std::vector temp_output_neighbor_list; + if (guarantee_connectivity) { + temp_output_neighbor_list.resize(output_graph_degree); + my_out_graph = temp_output_neighbor_list.data(); + const auto mst_graph_num_edges = mst_graph_num_edges_ptr[i]; + + // Set MST graph edges + for (uint32_t j = 0; j < mst_graph_num_edges; j++) { + my_out_graph[j] = mst_graph(i, j); } - if (!dup) { - my_out_graph[output_j] = v; - output_j++; + // Set pruned graph edges + for (uint32_t pruned_j = 0, output_j = mst_graph_num_edges; + (pruned_j < output_graph_degree) && (output_j < output_graph_degree); + pruned_j++) { + const auto v = output_graph_ptr[output_graph_degree * i + pruned_j]; + + // duplication check + bool dup = false; + for (uint32_t m = 0; m < output_j; m++) { + if (v == my_out_graph[m]) { + dup = true; + break; + } + } + + if (!dup) { + my_out_graph[output_j] = v; + output_j++; + } } } - } - const auto num_protected_edges = - std::max(mst_graph_num_edges_ptr[i], output_graph_degree / 2); - if (num_protected_edges > output_graph_degree) { check_num_protected_edges = false; } - if (num_protected_edges == output_graph_degree) continue; - - // Replace some edges of the output graph with edges of the reverse graph. - auto kr = std::min(rev_graph_count.data_handle()[i], output_graph_degree); - while (kr) { - kr -= 1; - if (my_rev_graph[kr] < graph_size) { - uint64_t pos = pos_in_array(my_rev_graph[kr], my_out_graph, output_graph_degree); - if (pos < num_protected_edges) { continue; } - uint64_t num_shift = pos - num_protected_edges; - if (pos >= output_graph_degree) { - num_shift = output_graph_degree - num_protected_edges - 1; + const auto num_protected_edges = + std::max(mst_graph_num_edges_ptr[i], output_graph_degree / 2); + if (num_protected_edges > output_graph_degree) { check_num_protected_edges = false; } + if (num_protected_edges == output_graph_degree) continue; + + // Replace some edges of the output graph with edges of the reverse graph. 
+ auto kr = std::min(rev_graph_count.data_handle()[i], output_graph_degree); + while (kr) { + kr -= 1; + if (my_rev_graph[kr] < graph_size) { + uint64_t pos = pos_in_array(my_rev_graph[kr], my_out_graph, output_graph_degree); + if (pos < num_protected_edges) { continue; } + uint64_t num_shift = pos - num_protected_edges; + if (pos >= output_graph_degree) { + num_shift = output_graph_degree - num_protected_edges - 1; + } + shift_array(my_out_graph + num_protected_edges, num_shift); + my_out_graph[num_protected_edges] = my_rev_graph[kr]; } - shift_array(my_out_graph + num_protected_edges, num_shift); - my_out_graph[num_protected_edges] = my_rev_graph[kr]; } - } - // If guarantee_connectivity == true, move the output neighbor list from the temporal list to - // the output list. If false, the copy is not needed because my_out_graph is a pointer to the - // output buffer. - if (guarantee_connectivity) { - for (uint32_t j = 0; j < output_graph_degree; j++) { - output_graph_ptr[(output_graph_degree * i) + j] = my_out_graph[j]; + // If guarantee_connectivity == true, move the output neighbor list from the temporal list + // to the output list. If false, the copy is not needed because my_out_graph is a pointer to + // the output buffer. + if (guarantee_connectivity) { + for (uint32_t j = 0; j < output_graph_degree; j++) { + output_graph_ptr[(output_graph_degree * i) + j] = my_out_graph[j]; + } } } - } - RAFT_EXPECTS(check_num_protected_edges, - "Failed to merge the MST, pruned, and reverse edge graphs. Some nodes have too " - "many MST optimization edges."); + RAFT_EXPECTS(check_num_protected_edges, + "Failed to merge the MST, pruned, and reverse edge graphs. 
Some nodes have too " + "many MST optimization edges."); - const double time_replace_end = cur_time(); - RAFT_LOG_DEBUG("# Replacing edges time: %.1lf ms", - (time_replace_end - time_replace_start) * 1000.0); + const double time_replace_end = cur_time(); + RAFT_LOG_DEBUG("# Replacing edges time: %.1lf ms", + (time_replace_end - time_replace_start) * 1000.0); + } + } + // Check stats + { + raft::common::nvtx::range block_scope( + "cagra::graph::optimize/stats"); /* stats */ uint64_t num_replaced_edges = 0; #pragma omp parallel for reduction(+ : num_replaced_edges) From 3e9767c930dbdbce7279353eca5ae2e0fcc8586c Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Fri, 20 Feb 2026 01:27:41 -0800 Subject: [PATCH 056/119] Removed max_node_id --- cpp/include/cuvs/neighbors/cagra.hpp | 7 --- .../neighbors/detail/cagra/cagra_build.cuh | 58 +++++++++---------- .../neighbors/detail/cagra/cagra_search.cuh | 2 - 3 files changed, 28 insertions(+), 39 deletions(-) diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index d105fa816d..6fd734064c 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -247,13 +247,6 @@ struct search_params : cuvs::neighbors::search_params { /** Bit mask used for initial random seed node selection. */ uint64_t rand_xor_mask = 0x128394; - /** - * Maximum node ID for random seed selection. - * When > 0, random seeds are constrained to [0, max_node_id) instead of [0, dataset_size). - * This is useful when the graph is smaller than the dataset (e.g., iterative build with compression). - * Default 0 means no constraint (use dataset_size). - */ - uint32_t max_node_id = 0; /** Whether to use the persistent version of the kernel (only SINGLE_CTA is supported a.t.m.) */ bool persistent = false; /** Persistent kernel: time in seconds before the kernel stops if no requests received. 
*/ diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 70a29fe379..613b05cb67 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1990,7 +1990,6 @@ void search_and_optimize(raft::resources const& res, { // Search. // Since there are many queries, divide them into batches and search them. - RAFT_LOG_DEBUG("search_and_optimize: search_params.max_node_id=%u", search_params.max_node_id); cuvs::spatial::knn::detail::utils::batch_load_iterator query_batch( dev_query_view.data_handle(), curr_query_size, @@ -2024,7 +2023,7 @@ void search_and_optimize(raft::resources const& res, // Optimize graph auto next_graph_size = curr_query_size; cagra_graph = raft::make_host_matrix(0, 0); // delete existing grahp - cagra_graph = raft::make_host_matrix(next_graph_size, next_graph_degree); + cagra_graph = raft::make_host_matrix(next_graph_size, next_graph_degree); optimize( res, neighbors_view, cagra_graph.view(), flag_last ? 
params.guarantee_connectivity : 0); } @@ -2125,15 +2124,15 @@ auto iterative_build_graph( if (params.compression.has_value()) { auto start = std::chrono::high_resolution_clock::now(); RAFT_EXPECTS(params.metric == cuvs::distance::DistanceType::L2Expanded, - "VPQ compression is only supported with L2Expanded distance mertric"); + "VPQ compression is only supported with L2Expanded distance mertric"); idx_opt.emplace(res, params.metric); - //idx_opt->update_graph(res, raft::make_const_mdspan(cagra_graph.view())); + // idx_opt->update_graph(res, raft::make_const_mdspan(cagra_graph.view())); idx_opt->update_dataset( - res, - // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later - cuvs::neighbors::vpq_build( - res, *params.compression, dev_dataset)); - auto end = std::chrono::high_resolution_clock::now(); + res, + // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later + cuvs::neighbors::vpq_build( + res, *params.compression, dev_dataset)); + auto end = std::chrono::high_resolution_clock::now(); auto elapsed_ms = std::chrono::duration_cast(end - start).count(); RAFT_LOG_INFO("# VPQ compression time: %.3lf sec", (double)elapsed_ms / 1000); } @@ -2148,25 +2147,28 @@ auto iterative_build_graph( // pruning is not used except in the last iteration. // (*) The appropriate setting for itopk_size requires careful consideration. 
auto curr_topk = next_graph_degree + 1; - auto curr_itopk_size = next_graph_degree + 32; + auto curr_itopk_size = std::max(next_graph_degree + 32, (uint64_t)128); if (flag_last) { curr_topk = topk; curr_itopk_size = curr_topk + 32; } - RAFT_LOG_INFO( - "# graph_size = %lu (%.3lf), graph_degree = %lu, query_size = %lu, itopk = %lu, topk = %lu", - (uint64_t)cagra_graph.extent(0), - (double)cagra_graph.extent(0) / final_graph_size, - (uint64_t)cagra_graph.extent(1), - (uint64_t)curr_query_size, - (uint64_t)curr_itopk_size, - (uint64_t)curr_topk); + // RAFT_LOG_INFO( + // "# graph_size = %lu (%.3lf), graph_degree = %lu, query_size = %lu, itopk = %lu, topk = + // %lu", (uint64_t)cagra_graph.extent(0), (double)cagra_graph.extent(0) / final_graph_size, + // (uint64_t)cagra_graph.extent(1), + // (uint64_t)curr_query_size, + // (uint64_t)curr_itopk_size, + // (uint64_t)curr_topk); cuvs::neighbors::cagra::search_params search_params; - search_params.algo = cuvs::neighbors::cagra::search_algo::AUTO; - search_params.max_queries = max_chunk_size; - search_params.itopk_size = curr_itopk_size; + search_params.algo = cuvs::neighbors::cagra::search_algo::AUTO; + search_params.max_queries = max_chunk_size; + search_params.itopk_size = curr_itopk_size; + search_params.max_iterations = 8; + search_params.search_width = 1; + // This fails. Why? + // search_params.persistent = true; // Create an index (idx), a query view (dev_query_view), and a mdarray for // search results (neighbors). 
@@ -2174,7 +2176,8 @@ auto iterative_build_graph( dev_dataset.data_handle(), (int64_t)curr_graph_size, dev_dataset.extent(1)); // No compression, create mdspan index if (!params.compression.has_value()) { - idx_opt.emplace(res, params.metric, dev_dataset_view, raft::make_const_mdspan(cagra_graph.view())); + idx_opt.emplace( + res, params.metric, dev_dataset_view, raft::make_const_mdspan(cagra_graph.view())); } else { idx_opt->update_graph(res, raft::make_const_mdspan(cagra_graph.view())); } @@ -2186,11 +2189,6 @@ auto iterative_build_graph( auto neighbors_view = raft::make_host_matrix_view(neighbors_ptr, curr_query_size, curr_topk); - // Set max_node_id to constrain random seed selection to valid graph nodes - search_params.max_node_id = static_cast(curr_graph_size); - RAFT_LOG_DEBUG("iterative_build: Setting search_params.max_node_id=%u (curr_graph_size=%lu)", - search_params.max_node_id, curr_graph_size); - search_and_optimize(res, search_params, idx, @@ -2208,12 +2206,12 @@ auto iterative_build_graph( auto end = std::chrono::high_resolution_clock::now(); auto elapsed_ms = std::chrono::duration_cast(end - start).count(); - RAFT_LOG_INFO("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000); + RAFT_LOG_DEBUG("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000); if (flag_last) { break; } - flag_last = (curr_graph_size == final_graph_size); + flag_last = (curr_graph_size == final_graph_size); auto next_graph_size = curr_query_size; - curr_graph_size = next_graph_size; + curr_graph_size = next_graph_size; } return cagra_graph; diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index 0efb7cfd45..efec4b3f93 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -51,8 +51,6 @@ void search_main_core( raft::device_matrix_view distances, CagraSampleFilterT sample_filter = CagraSampleFilterT()) { - RAFT_LOG_DEBUG("search_main_core: max_node_id=%u, 
graph.extent(0)=%lu", - params.max_node_id, graph.extent(0)); static_assert(std::is_same_v, "Only uint32_t is supported as the graph element type (internal index type)"); RAFT_LOG_DEBUG("# dataset size = %lu, dim = %lu\n", From 822faea739f9b77f13642c8201090273e27d32bc Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 20 Feb 2026 12:14:35 +0000 Subject: [PATCH 057/119] some fixes, cleanup --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 137 ++++++++++-------- 1 file changed, 77 insertions(+), 60 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index f2cd79ecb6..1b7e46e535 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -173,7 +173,7 @@ __global__ void kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, g uint64_t* const num_full = stats + 1; const uint64_t iA = blockIdx.x + (batch_size * batch_id); - const uint64_t iA_batch = iA % static_cast(batch_size); + const uint64_t iA_batch = blockIdx.x; if (iA >= graph_size) { return; } @@ -246,66 +246,69 @@ __global__ void kern_make_rev_graph(const IdxT* const dest_nodes, // [graph_ } } -// Select output_graph_degree neighbors with smallest detour count per node (writes to device). 
-template +// Based on the detour count, select the smallest detour count and its index +// (Pruning Update Kernel) +template __global__ void kern_select_smallest_detour_neighbors( - const IdxT* const knn_graph, + const IdxT* const knn_graph, // [graph_chunk_size, graph_degree] uint64_t graph_size, uint64_t knn_graph_degree, uint64_t output_graph_degree, - const uint8_t* const d_detour_count, // [batch_size, graph_degree] - IdxT* output_graph_ptr, // [batch_size, output_graph_degree] - const uint32_t batch_size, - const uint32_t batch_id) + uint8_t* const d_detour_count, // [batch_size, graph_degree] + IdxT* output_graph_ptr, + const uint32_t batch_size, // [batch_size, output_graph_degree] + const uint32_t batch_id, + uint32_t* const d_invalid_neighbor_list) { - // FIXME: this does not really work for num_warps > 1 - constexpr unsigned warp_mask = 0xffffffff; - const uint32_t num_warps = blockDim.x / raft::WarpSize; - extern __shared__ unsigned char smem_buf[]; - uint32_t* smem_indices = reinterpret_cast(smem_buf); - uint16_t* smem_detour_count = - reinterpret_cast(&smem_indices[knn_graph_degree * num_warps]); + assert(blockDim.x == 32); - const uint32_t wid = threadIdx.x / raft::WarpSize; - const uint32_t lane_id = threadIdx.x % raft::WarpSize; - const uint64_t nid = static_cast(blockIdx.x) * num_warps + - (static_cast(batch_size) * batch_id * num_warps) + wid; + // Allocate shared memory for detour counts and their indices + extern __shared__ IdxT smem_indices[]; + uint16_t* smem_detour_count = (uint16_t*)&smem_indices[knn_graph_degree]; - const uint64_t nid_batch = nid % static_cast(batch_size); + const uint64_t nid = blockIdx.x + (batch_size * batch_id); + const uint64_t nid_batch = blockIdx.x; - if (nid >= graph_size) return; + if (nid >= graph_size) { return; } - for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) { - smem_detour_count[(knn_graph_degree * wid) + k] = - d_detour_count[nid_batch * knn_graph_degree + k]; - 
smem_indices[(knn_graph_degree * wid) + k] = k; + // Each uint64_t loads detour_count for its assigned k + for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) { + smem_detour_count[k] = d_detour_count[nid_batch * knn_graph_degree + k]; + smem_indices[k] = knn_graph[knn_graph_degree * nid + k]; } - __syncwarp(warp_mask); + __syncwarp(); + + const unsigned warp_mask = 0xffffffff; for (uint32_t i = 0; i < output_graph_degree; i++) { - uint32_t local_min = 256; + uint32_t local_min = 255; uint32_t local_idx = 0xffffffff; - for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) { - uint32_t c = smem_detour_count[(knn_graph_degree * wid) + k]; - if (c < local_min) { - local_min = c; - local_idx = smem_indices[(knn_graph_degree * wid) + k]; + for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) { + if (smem_detour_count[k] < local_min) { + local_min = smem_detour_count[k]; + local_idx = k; } } - uint32_t local_min_with_tag = (local_min << 16) | local_idx; - for (int offset = raft::WarpSize / 2; offset > 0; offset /= 2) { - uint32_t other = __shfl_down_sync(warp_mask, local_min_with_tag, offset); - local_min_with_tag = (local_min_with_tag <= other) ? 
local_min_with_tag : other; + + uint32_t local_min_with_tag = (local_min << 16) | ((uint32_t)local_idx); + uint32_t warp_min_with_tag = __reduce_min_sync(warp_mask, local_min_with_tag); + uint32_t warp_min_count = warp_min_with_tag >> 16; + uint32_t warp_local_idx = warp_min_with_tag & 0xffff; + + if (warp_min_count == 255) { + // No valid position left; set error flag and fill remaining slots with sentinel + if (threadIdx.x == 0) { atomicExch(d_invalid_neighbor_list, 1u); } + break; } - uint32_t warp_min_tag = __shfl_sync(warp_mask, local_min_with_tag, 0); - uint32_t warp_local_idx = warp_min_tag & 0xffff; - if (local_idx == warp_local_idx) { - output_graph_ptr[nid_batch * output_graph_degree + i] = - knn_graph[knn_graph_degree * nid + warp_local_idx]; - smem_detour_count[knn_graph_degree * wid + warp_local_idx] = 255; + IdxT selected_node = smem_indices[warp_local_idx]; + + for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) { + if (smem_indices[k] == selected_node) { smem_detour_count[k] = 255; } } __syncwarp(warp_mask); + + if (threadIdx.x == 0) { output_graph_ptr[nid_batch * output_graph_degree + i] = selected_node; } } } @@ -350,19 +353,18 @@ __global__ void kern_merge_graph(IdxT* output_graph, extern __shared__ unsigned char smem_buf[]; IdxT* smem_sorted_output_graph = reinterpret_cast(smem_buf); - const uint32_t wid = threadIdx.x / 32; - const uint32_t lane_id = threadIdx.x % 32; - const uint32_t num_warps = blockDim.x / 32; - const uint64_t nid = blockIdx.x * num_warps + (batch_size * batch_id * num_warps) + wid; + assert(blockDim.x == 32); + + const uint64_t nid = blockIdx.x + (batch_size * batch_id); if (nid >= graph_size) { return; } - if (lane_id == 0) check_num_protected_edges[0] = true; + if (threadIdx.x == 0) check_num_protected_edges[0] = true; const auto mst_graph_num_edges = mst_graph_num_edges_ptr[nid]; // If guarantee_connectivity == true, use a temporal list to merge the // neighbor lists of the graphs. 
if (guarantee_connectivity) { - for (uint32_t i = lane_id; i < mst_graph_degree; i += 32) { + for (uint32_t i = threadIdx.x; i < mst_graph_degree; i += 32) { smem_sorted_output_graph[i] = mst_graph[nid * mst_graph_degree + i]; } __syncwarp(); @@ -371,7 +373,7 @@ __global__ void kern_merge_graph(IdxT* output_graph, pruned_j++) { const auto v = output_graph[output_graph_degree * nid + pruned_j]; unsigned int dup = 0; - for (uint32_t m = lane_id; m < output_j; m += 32) { + for (uint32_t m = threadIdx.x; m < output_j; m += 32) { if (v == smem_sorted_output_graph[m]) { dup = 1; break; @@ -380,7 +382,7 @@ __global__ void kern_merge_graph(IdxT* output_graph, unsigned int warp_dup = __ballot_sync(0xffffffff, dup); if (warp_dup == 0) { - if (lane_id == 0) smem_sorted_output_graph[output_j] = v; + if (threadIdx.x == 0) smem_sorted_output_graph[output_j] = v; output_j++; } __syncwarp(); @@ -388,7 +390,7 @@ __global__ void kern_merge_graph(IdxT* output_graph, } else { - for (uint32_t i = lane_id; i < output_graph_degree; i += 32) { + for (uint32_t i = threadIdx.x; i < output_graph_degree; i += 32) { smem_sorted_output_graph[i] = output_graph[output_graph_degree * nid + i]; } __syncwarp(); @@ -409,7 +411,7 @@ __global__ void kern_merge_graph(IdxT* output_graph, if (pos < num_protected_edges) { continue; } uint64_t num_shift = pos - num_protected_edges; if (pos >= output_graph_degree) { num_shift = output_graph_degree - num_protected_edges - 1; } - if (lane_id == 0) { + if (threadIdx.x == 0) { thread_shift_array(smem_sorted_output_graph + num_protected_edges, num_shift); smem_sorted_output_graph[num_protected_edges] = rev_graph[kr + (output_graph_degree * nid)]; } @@ -417,7 +419,7 @@ __global__ void kern_merge_graph(IdxT* output_graph, } } - for (uint32_t i = lane_id; i < output_graph_degree; i += 32) { + for (uint32_t i = threadIdx.x; i < output_graph_degree; i += 32) { output_graph[(output_graph_degree * nid) + i] = smem_sorted_output_graph[i]; } } @@ -1477,6 +1479,7 @@ void 
optimize( res, large_tmp_mr, raft::make_extents(batch_size)); auto d_output_graph = raft::make_device_mdarray( res, large_tmp_mr, raft::make_extents(batch_size, output_graph_degree)); + auto d_invalid_neighbor_list = raft::make_device_scalar(res, 0u); for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { // initialize the detour_count and num_no_detour_edges for the current batch @@ -1507,8 +1510,7 @@ void optimize( dev_stats.data_handle()); // select smallest-detour neighbors for the current batch - const size_t select_smem_size = - (knn_graph_degree * knn_graph_degree) * (sizeof(uint16_t) + sizeof(uint32_t)); + const size_t select_smem_size = (knn_graph_degree) * (sizeof(uint16_t) + sizeof(IdxT)); const dim3 threads_select(32, 1, 1); const dim3 blocks_select(batch_size, 1, 1); kern_select_smallest_detour_neighbors @@ -1522,10 +1524,11 @@ void optimize( d_detour_count.data_handle(), d_output_graph.data_handle(), batch_size, - i_batch); + i_batch, + d_invalid_neighbor_list.data_handle()); - raft::copy(output_graph_ptr, - d_output_graph.data_handle() + i_batch * batch_size * output_graph_degree, + raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree, + d_output_graph.data_handle(), static_cast(batch_size) * output_graph_degree, raft::resource::get_cuda_stream(res)); @@ -1537,6 +1540,18 @@ void optimize( raft::resource::sync_stream(res); RAFT_LOG_DEBUG("\n"); + uint32_t invalid_neighbor_list = 0; + raft::copy(&invalid_neighbor_list, + d_invalid_neighbor_list.data_handle(), + 1, + raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + RAFT_EXPECTS( + invalid_neighbor_list == 0, + "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too " + "many invalid or duplicated neighbor nodes. 
This error can occur, for example, if too many " + "overflows occur during the norm computation between the dataset vectors."); + raft::copy( host_stats.data_handle(), dev_stats.data_handle(), 2, raft::resource::get_cuda_stream(res)); num_keep = host_stats.data_handle()[0]; @@ -1642,6 +1657,8 @@ void optimize( raft::make_device_mdarray(res, large_tmp_mr, raft::make_extents(graph_size)); auto d_rev_graph = raft::make_device_mdarray( res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); + auto d_output_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); } catch (std::bad_alloc& e) { RAFT_LOG_DEBUG("Insufficient memory for reverse graph on GPU"); _use_gpu_rev_graph = false; @@ -1760,9 +1777,7 @@ void optimize( d_check_num_protected_edges.data_handle(), 1, raft::resource::get_cuda_stream(res)); - raft::resource::sync_stream(res); - // TODO: is this required? if (d_output_graph.allocated_memory()) { raft::copy(output_graph_ptr, d_output_graph.data_handle(), @@ -1770,6 +1785,8 @@ void optimize( raft::resource::get_cuda_stream(res)); } + raft::resource::sync_stream(res); + const auto merge_graph_end = cur_time(); RAFT_EXPECTS(check_num_protected_edges, "Failed to merge the MST, pruned, and reverse edge graphs. 
" From 129ee4ff4b4762b9f15c2de6e63041aabb34d8db Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Wed, 25 Feb 2026 07:20:12 -0800 Subject: [PATCH 058/119] added the fix that also checks whether the kernel function pointer has changed --- cpp/src/neighbors/detail/smem_utils.cuh | 58 +++++++++++++++---------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/cpp/src/neighbors/detail/smem_utils.cuh b/cpp/src/neighbors/detail/smem_utils.cuh index 41c95c0ccd..978f751787 100644 --- a/cpp/src/neighbors/detail/smem_utils.cuh +++ b/cpp/src/neighbors/detail/smem_utils.cuh @@ -6,7 +6,6 @@ #include -#include #include #include @@ -14,14 +13,18 @@ namespace cuvs::neighbors::detail { /** * @brief (Thread-)Safely invoke a kernel with a maximum dynamic shared memory size. - * This is required because the sequence `cudaFuncSetAttribute` + kernel launch is not executed - * atomically. * - * Used this way, the cudaFuncAttributeMaxDynamicSharedMemorySize can only grow and thus - * guarantees that the kernel is safe to launch. + * Maintains a monotonically growing high-water mark for `cudaFuncAttributeMaxDynamicSharedMemorySize`. + * When the kernel function pointer changes, the new kernel is brought up to the current high-water + * mark; when smem_size exceeds the high-water mark, it is grown for the current kernel. + * This guarantees every kernel's attribute is always >= smem_size at the time of launch. + * + * NB: cudaFuncSetAttribute is per kernel function pointer value, not per type. Multiple kernel + * template instantiations may share the same KernelT type (e.g. function pointers with the same + * signature), so we track the kernel identity alongside the smem high-water mark. * * @tparam KernelT The type of the kernel. - * @tparam InvocationT The type of the invocation function. + * @tparam KernelLauncherT The type of the launch function/lambda. * @param kernel The kernel function address (for whom the smem-size is specified). 
* @param smem_size The size of the dynamic shared memory to be set. * @param launch The kernel launch function/lambda. @@ -31,23 +34,33 @@ void safely_launch_kernel_with_smem_size(KernelT const& kernel, uint32_t smem_size, KernelLauncherT const& launch) { - // the last smem size is parameterized by the kernel thanks to the template parameter. - static std::atomic current_smem_size{0}; - auto last_smem_size = current_smem_size.load(std::memory_order_relaxed); - if (smem_size > last_smem_size) { - // We still need a mutex for the critical section: actualize last_smem_size and set the - // attribute. - static auto mutex = std::mutex{}; - auto guard = std::lock_guard{mutex}; - if (!current_smem_size.compare_exchange_strong( - last_smem_size, smem_size, std::memory_order_relaxed, std::memory_order_relaxed)) { - // The value has been updated by another thread between the load and the mutex acquisition. - if (smem_size > last_smem_size) { - current_smem_size.store(smem_size, std::memory_order_relaxed); - } + // current_smem_size is a monotonically growing high-water mark across all kernel pointers. + // current_kernel tracks which kernel pointer was last used. + static uint32_t current_smem_size{0}; + static KernelT current_kernel{KernelT{}}; + static std::mutex mutex; + + { + std::lock_guard guard(mutex); + + auto last_kernel = current_kernel; + auto last_smem_size = current_smem_size; + + // When the kernel function pointer changes, bring the new kernel up to the global high-water + // mark. This is necessary because cudaFuncSetAttribute applies to a specific function pointer, + // not to the pointer type — different template instantiations may share the same KernelT. 
+ if (kernel != last_kernel) { + current_kernel = kernel; + auto launch_status = + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, last_smem_size); + RAFT_EXPECTS(launch_status == cudaSuccess, + "Failed to set max dynamic shared memory size to %u bytes", + last_smem_size); } - // Only update if the last seen value is smaller than the new one. + // When smem_size exceeds the high-water mark, grow it for the current kernel. + // If the kernel also changed above, this handles the case where smem_size > last_smem_size. if (smem_size > last_smem_size) { + current_smem_size = smem_size; auto launch_status = cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); RAFT_EXPECTS(launch_status == cudaSuccess, @@ -55,7 +68,8 @@ void safely_launch_kernel_with_smem_size(KernelT const& kernel, smem_size); } } - // We don't need to guard the kernel launch because the smem_size can only grow. + // The kernel launch is outside the lock: any concurrent cudaFuncSetAttribute can only increase + // the limit, so the launch is always safe. return launch(kernel); } From 5ec30278eb16a570889d9c08b37d75925274bcce Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Wed, 25 Feb 2026 07:36:35 -0800 Subject: [PATCH 059/119] merged main in --- cpp/src/neighbors/detail/smem_utils.cuh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/neighbors/detail/smem_utils.cuh b/cpp/src/neighbors/detail/smem_utils.cuh index 978f751787..74838f9be9 100644 --- a/cpp/src/neighbors/detail/smem_utils.cuh +++ b/cpp/src/neighbors/detail/smem_utils.cuh @@ -14,10 +14,11 @@ namespace cuvs::neighbors::detail { /** * @brief (Thread-)Safely invoke a kernel with a maximum dynamic shared memory size. * - * Maintains a monotonically growing high-water mark for `cudaFuncAttributeMaxDynamicSharedMemorySize`. 
- * When the kernel function pointer changes, the new kernel is brought up to the current high-water - * mark; when smem_size exceeds the high-water mark, it is grown for the current kernel. - * This guarantees every kernel's attribute is always >= smem_size at the time of launch. + * Maintains a monotonically growing high-water mark for + * `cudaFuncAttributeMaxDynamicSharedMemorySize`. When the kernel function pointer changes, the new + * kernel is brought up to the current high-water mark; when smem_size exceeds the high-water mark, + * it is grown for the current kernel. This guarantees every kernel's attribute is always >= + * smem_size at the time of launch. * * NB: cudaFuncSetAttribute is per kernel function pointer value, not per type. Multiple kernel * template instantiations may share the same KernelT type (e.g. function pointers with the same @@ -50,7 +51,7 @@ void safely_launch_kernel_with_smem_size(KernelT const& kernel, // mark. This is necessary because cudaFuncSetAttribute applies to a specific function pointer, // not to the pointer type — different template instantiations may share the same KernelT. if (kernel != last_kernel) { - current_kernel = kernel; + current_kernel = kernel; auto launch_status = cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, last_smem_size); RAFT_EXPECTS(launch_status == cudaSuccess, @@ -60,7 +61,7 @@ void safely_launch_kernel_with_smem_size(KernelT const& kernel, // When smem_size exceeds the high-water mark, grow it for the current kernel. // If the kernel also changed above, this handles the case where smem_size > last_smem_size. 
if (smem_size > last_smem_size) { - current_smem_size = smem_size; + current_smem_size = smem_size; auto launch_status = cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); RAFT_EXPECTS(launch_status == cudaSuccess, From 9b1f741ca39eb4624a08b51d54a2429fa6b08eff Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 25 Feb 2026 15:41:10 +0000 Subject: [PATCH 060/119] some fixes --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 1b7e46e535..8705f555b6 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -268,21 +268,23 @@ __global__ void kern_select_smallest_detour_neighbors( const uint64_t nid = blockIdx.x + (batch_size * batch_id); const uint64_t nid_batch = blockIdx.x; + const uint32_t maxval16 = 0x0000ffff; if (nid >= graph_size) { return; } - // Each uint64_t loads detour_count for its assigned k + // Load indices and detour counts for each neighbor; invalidate out-of-bounds entries for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) { - smem_detour_count[k] = d_detour_count[nid_batch * knn_graph_degree + k]; smem_indices[k] = knn_graph[knn_graph_degree * nid + k]; + smem_detour_count[k] = (smem_indices[k] >= graph_size) + ? 
maxval16 + : (uint16_t)d_detour_count[nid_batch * knn_graph_degree + k]; } __syncwarp(); const unsigned warp_mask = 0xffffffff; - for (uint32_t i = 0; i < output_graph_degree; i++) { - uint32_t local_min = 255; - uint32_t local_idx = 0xffffffff; + uint32_t local_min = maxval16; + uint32_t local_idx = maxval16; for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) { if (smem_detour_count[k] < local_min) { local_min = smem_detour_count[k]; @@ -295,8 +297,7 @@ __global__ void kern_select_smallest_detour_neighbors( uint32_t warp_min_count = warp_min_with_tag >> 16; uint32_t warp_local_idx = warp_min_with_tag & 0xffff; - if (warp_min_count == 255) { - // No valid position left; set error flag and fill remaining slots with sentinel + if (warp_min_count == maxval16 || warp_local_idx == maxval16) { if (threadIdx.x == 0) { atomicExch(d_invalid_neighbor_list, 1u); } break; } @@ -304,7 +305,7 @@ __global__ void kern_select_smallest_detour_neighbors( IdxT selected_node = smem_indices[warp_local_idx]; for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) { - if (smem_indices[k] == selected_node) { smem_detour_count[k] = 255; } + if (smem_indices[k] == selected_node) { smem_detour_count[k] = maxval16; } } __syncwarp(warp_mask); @@ -1355,7 +1356,11 @@ void optimize( { RAFT_LOG_DEBUG( "# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1)); + + // large temporary memory for large arrays, e.g. everything >= O(graph_size) auto large_tmp_mr = raft::resource::get_large_workspace_resource(res); + // temporary memory for small arrays, e.g. 
everything <= O(batchsize * graph_degree) + // auto tmp_mr = raft::resource::get_tmp_workspace_resource(res); RAFT_EXPECTS(knn_graph.extent(0) == new_graph.extent(0), "Each input array is expected to have the same number of rows"); @@ -1527,9 +1532,12 @@ void optimize( i_batch, d_invalid_neighbor_list.data_handle()); + size_t copy_size = + std::min(static_cast(batch_size), graph_size - i_batch * batch_size) * + output_graph_degree; raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree, d_output_graph.data_handle(), - static_cast(batch_size) * output_graph_degree, + copy_size, raft::resource::get_cuda_stream(res)); raft::resource::sync_stream(res); From 18647117ca3e9fa4242f258d4c65f59302451041 Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Wed, 25 Feb 2026 07:48:49 -0800 Subject: [PATCH 061/119] style fixes| --- cpp/bench/ann/src/common/benchmark.hpp | 1 - cpp/src/neighbors/detail/cagra/compute_distance.hpp | 8 +++++--- cpp/src/neighbors/detail/cagra/device_common.hpp | 4 ++-- cpp/tests/neighbors/ann_cagra.cuh | 9 ++++----- python/cuvs_bench/cuvs_bench/run/run.py | 2 -- 5 files changed, 11 insertions(+), 13 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 1d5239ec9b..22859e9ab8 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -647,7 +647,6 @@ inline auto run_main(int argc, char** argv) -> int char* conf_path = argv[--argc]; std::ifstream conf_stream(conf_path); - for (int i = 1; i < argc; i++) { if (parse_bool_flag(argv[i], "--force", force_overwrite) || parse_bool_flag(argv[i], "--build", build_mode) || diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index c314d4d0a3..19b6c1db71 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 
2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -239,13 +239,15 @@ struct dataset_descriptor_host { template state(InitF init, size_t size) : ready{false}, value{std::make_tuple(init, size)} { - // RAFT_LOG_INFO("trying to create a descriptor state %p", reinterpret_cast(this)); + // RAFT_LOG_INFO("trying to create a descriptor state %p", + // reinterpret_cast(this)); } ~state() noexcept { if (std::holds_alternative(value)) { - // RAFT_LOG_INFO("trying to free descriptor state %p", reinterpret_cast(this)); + // RAFT_LOG_INFO("trying to free descriptor state %p", + // reinterpret_cast(this)); auto& [ptr, stream] = std::get(value); RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(ptr, stream)); } diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index 3d3a25c8eb..0b75de6bab 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -103,11 +103,11 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( const uint32_t traversed_hash_bitlen, const uint32_t block_id = 0, const uint32_t num_blocks = 1, - const IndexT graph_size = 0) + const IndexT graph_size = 0) { const auto team_size_bits = dataset_desc.team_size_bitshift_from_smem(); const auto max_i = raft::round_up_safe(num_pickup, warp_size >> team_size_bits); - const auto compute_distance = dataset_desc.compute_distance_impl; + const auto compute_distance = dataset_desc.compute_distance_impl; const IndexT seed_index_limit = graph_size > 0 ? 
graph_size : dataset_desc.size; for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += (blockDim.x >> team_size_bits)) { diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh index 77dbcb683c..80d033c3b4 100644 --- a/cpp/tests/neighbors/ann_cagra.cuh +++ b/cpp/tests/neighbors/ann_cagra.cuh @@ -1508,10 +1508,9 @@ inline std::vector generate_inputs() {100}, {1000000}, {768}, // dim - {16}, // k - { - //graph_build_algo::IVF_PQ, - //graph_build_algo::NN_DESCENT, + {16}, // k + { // graph_build_algo::IVF_PQ, + // graph_build_algo::NN_DESCENT, graph_build_algo::ITERATIVE_CAGRA_SEARCH}, {search_algo::AUTO}, {10}, @@ -1632,7 +1631,7 @@ inline std::vector generate_inputs() // {std::optional{std::nullopt}}, // {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL, // cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL}); // don't demand high recall - // without refinement + // without refinement for (uint32_t pq_len : {2}) { // for now, only pq_len = 2 is supported, more options coming soon for (uint32_t vq_n_centers : {100, 1000}) { for (auto input : inputs2) { diff --git a/python/cuvs_bench/cuvs_bench/run/run.py b/python/cuvs_bench/cuvs_bench/run/run.py index bc3fd15028..6ec7c04847 100644 --- a/python/cuvs_bench/cuvs_bench/run/run.py +++ b/python/cuvs_bench/cuvs_bench/run/run.py @@ -644,8 +644,6 @@ def run_benchmark( conf_file = prepare_conf_file(dataset_conf, subset_size, count, batch_size) algos_conf_fs = gather_algorithm_configs(scripts_path, configuration) - - allowed_algos = algorithms.split(",") if algorithms else None allowed_groups = groups.split(",") if groups else None allowed_algo_groups = ( From a92aa64caefbb649b7404efc17bd9188eafa9e6d Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Thu, 26 Feb 2026 01:16:31 -0800 Subject: [PATCH 062/119] optimisation attempt: skip optimisation except last step --- cpp/include/cuvs/neighbors/cagra.hpp | 8 ++ .../neighbors/detail/cagra/cagra_build.cuh | 135 ++++++++++++++---- 2 
files changed, 113 insertions(+), 30 deletions(-) diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index 6fd734064c..247d29649d 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -119,6 +119,14 @@ struct index_params : cuvs::neighbors::index_params { */ bool guarantee_connectivity = false; + /** + * Whether to skip graph optimization (pruning, reverse edges, MST) during non-final iterations + * of iterative graph building. When true, search results are copied directly into the device + * graph without host round-trips. Only applies to iterative_search_params graph builds; the + * final iteration always runs full optimization. + */ + bool skip_graph_optimization = false; + /** * Whether to add the dataset content to the index, i.e.: * diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 0ba98a4447..9b41f70f64 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1971,6 +1971,53 @@ struct mmap_owner { size_t size_; }; +template +void search_to_device_graph(raft::resources const& res, + const cuvs::neighbors::cagra::search_params& search_params, + const index& idx, + raft::device_matrix_view dev_query_view, + raft::device_matrix_view dev_neighbors, + raft::device_matrix_view dev_distances, + raft::device_matrix_view dev_graph_output, + size_t curr_query_size, + size_t next_graph_degree, + size_t curr_topk, + uint64_t max_chunk_size) +{ + cuvs::spatial::knn::detail::utils::batch_load_iterator query_batch( + dev_query_view.data_handle(), + curr_query_size, + dev_query_view.extent(1), + max_chunk_size, + raft::resource::get_cuda_stream(res), + raft::resource::get_workspace_resource(res)); + + for (const auto& batch : query_batch) { + auto batch_dev_query_view = raft::make_device_matrix_view( + batch.data(), batch.size(), dev_query_view.extent(1)); + auto 
batch_dev_neighbors_view = raft::make_device_matrix_view( + dev_neighbors.data_handle(), batch.size(), curr_topk); + auto batch_dev_distances_view = raft::make_device_matrix_view( + dev_distances.data_handle(), batch.size(), curr_topk); + + cuvs::neighbors::cagra::search( + res, search_params, idx, batch_dev_query_view, batch_dev_neighbors_view, + batch_dev_distances_view); + + RAFT_CUDA_TRY(cudaMemcpy2DAsync( + dev_graph_output.data_handle() + batch.offset() * next_graph_degree, + next_graph_degree * sizeof(IdxT), + dev_neighbors.data_handle(), + curr_topk * sizeof(IdxT), + next_graph_degree * sizeof(IdxT), + batch.size(), + cudaMemcpyDeviceToDevice, + raft::resource::get_cuda_stream(res))); + } + + raft::resource::sync_stream(res); +} + template void search_and_optimize(raft::resources const& res, const cuvs::neighbors::cagra::search_params& search_params, @@ -2120,6 +2167,10 @@ auto iterative_build_graph( bool flag_last = false; auto curr_graph_size = initial_graph_size; + // Device graph for skip_graph_optimization: keeps the graph on device between iterations. 
+ auto dev_graph = raft::make_device_matrix(res, 0, 0); + bool use_device_graph = false; + // Generate the compressed index once if compression is enabled std::optional> idx_opt; if (params.compression.has_value()) { @@ -2154,13 +2205,7 @@ auto iterative_build_graph( curr_itopk_size = curr_topk + 32; } - // RAFT_LOG_INFO( - // "# graph_size = %lu (%.3lf), graph_degree = %lu, query_size = %lu, itopk = %lu, topk = - // %lu", (uint64_t)cagra_graph.extent(0), (double)cagra_graph.extent(0) / final_graph_size, - // (uint64_t)cagra_graph.extent(1), - // (uint64_t)curr_query_size, - // (uint64_t)curr_itopk_size, - // (uint64_t)curr_topk); + bool do_skip = false;//params.skip_graph_optimization && !flag_last; cuvs::neighbors::cagra::search_params search_params; search_params.algo = cuvs::neighbors::cagra::search_algo::AUTO; @@ -2168,42 +2213,72 @@ auto iterative_build_graph( search_params.itopk_size = curr_itopk_size; search_params.max_iterations = 8; search_params.search_width = 1; - // This fails. Why? - // search_params.persistent = true; // Create an index (idx), a query view (dev_query_view), and a mdarray for // search results (neighbors). 
auto dev_dataset_view = raft::make_device_matrix_view( dev_dataset.data_handle(), (int64_t)curr_graph_size, dev_dataset.extent(1)); - // No compression, create mdspan index + if (!params.compression.has_value()) { - idx_opt.emplace( - res, params.metric, dev_dataset_view, raft::make_const_mdspan(cagra_graph.view())); + if (use_device_graph) { + idx_opt.emplace( + res, params.metric, dev_dataset_view, raft::make_const_mdspan(dev_graph.view())); + } else { + idx_opt.emplace( + res, params.metric, dev_dataset_view, raft::make_const_mdspan(cagra_graph.view())); + } } else { - idx_opt->update_graph(res, raft::make_const_mdspan(cagra_graph.view())); + if (use_device_graph) { + idx_opt->update_graph(res, raft::make_const_mdspan(dev_graph.view())); + } else { + idx_opt->update_graph(res, raft::make_const_mdspan(cagra_graph.view())); + } } const auto& idx = *idx_opt; auto dev_query_view = raft::make_device_matrix_view( dev_dataset.data_handle(), (int64_t)curr_query_size, dev_dataset.extent(1)); - auto neighbors_view = - raft::make_host_matrix_view(neighbors_ptr, curr_query_size, curr_topk); - - search_and_optimize(res, - search_params, - idx, - dev_query_view, - dev_neighbors.view(), - dev_distances.view(), - neighbors_view, - cagra_graph, - curr_query_size, - next_graph_degree, - curr_topk, - max_chunk_size, - flag_last, - params); + if (do_skip) { + auto dev_graph_next = + raft::make_device_matrix(res, curr_query_size, next_graph_degree); + + search_to_device_graph(res, + search_params, + idx, + dev_query_view, + dev_neighbors.view(), + dev_distances.view(), + dev_graph_next.view(), + curr_query_size, + next_graph_degree, + curr_topk, + max_chunk_size); + + dev_graph = std::move(dev_graph_next); + use_device_graph = true; + } else { + auto neighbors_view = + raft::make_host_matrix_view(neighbors_ptr, curr_query_size, curr_topk); + + search_and_optimize(res, + search_params, + idx, + dev_query_view, + dev_neighbors.view(), + dev_distances.view(), + neighbors_view, + 
cagra_graph, + curr_query_size, + next_graph_degree, + curr_topk, + max_chunk_size, + flag_last, + params); + + dev_graph = raft::make_device_matrix(res, 0, 0); + use_device_graph = false; + } auto end = std::chrono::high_resolution_clock::now(); auto elapsed_ms = std::chrono::duration_cast(end - start).count(); From 6e42fb12418cd7f71777f9b642544b45fd796f5d Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Thu, 26 Feb 2026 02:13:51 -0800 Subject: [PATCH 063/119] make optimize() accept device graph --- .../neighbors/detail/cagra/cagra_build.cuh | 52 +++++++++++++------ cpp/src/neighbors/detail/cagra/graph_core.cuh | 23 +++++--- 2 files changed, 53 insertions(+), 22 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 9b41f70f64..f1f55c90a0 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1904,7 +1904,8 @@ void optimize( raft::resources const& res, raft::mdspan, raft::row_major, g_accessor> knn_graph, raft::host_matrix_view new_graph, - const bool guarantee_connectivity = false) + const bool guarantee_connectivity = false, + const IdxT* d_knn_graph_ptr = nullptr) { using internal_IdxT = typename std::make_unsigned::type; @@ -1922,7 +1923,12 @@ void optimize( knn_graph.extent(1)); cagra::detail::graph::optimize( - res, knn_graph_internal, new_graph_internal, guarantee_connectivity); + res, + knn_graph_internal, + new_graph_internal, + guarantee_connectivity, + true, + reinterpret_cast(d_knn_graph_ptr)); } // RAII wrapper for allocating memory with Transparent HugePage @@ -2034,14 +2040,20 @@ void search_and_optimize(raft::resources const& res, bool flag_last, const index_params& params) { - // Search. - // Since there are many queries, divide them into batches and search them. + auto stream = raft::resource::get_cuda_stream(res); + + // Accumulate search results on device to avoid D-to-H + H-to-D round-trip. 
+ auto dev_knn_graph = + raft::make_device_matrix(res, curr_query_size, curr_topk); + + // Search in batches, accumulate results on both device and host. + // Host copy is needed by optimize Phase 3 (edge selection) which currently runs on CPU. cuvs::spatial::knn::detail::utils::batch_load_iterator query_batch( dev_query_view.data_handle(), curr_query_size, dev_query_view.extent(1), max_chunk_size, - raft::resource::get_cuda_stream(res), + stream, raft::resource::get_workspace_resource(res)); for (const auto& batch : query_batch) { auto batch_dev_query_view = raft::make_device_matrix_view( @@ -2058,20 +2070,28 @@ void search_and_optimize(raft::resources const& res, batch_dev_neighbors_view, batch_dev_distances_view); - auto batch_neighbors_view = raft::make_host_matrix_view( - neighbors_view.data_handle() + batch.offset() * curr_topk, batch.size(), curr_topk); - raft::copy(batch_neighbors_view.data_handle(), + // D-to-D: accumulate into device knn_graph + raft::copy(dev_knn_graph.data_handle() + batch.offset() * curr_topk, + batch_dev_neighbors_view.data_handle(), + batch.size() * curr_topk, + stream); + + // D-to-H: still needed for optimize Phase 3 (host edge selection) + raft::copy(neighbors_view.data_handle() + batch.offset() * curr_topk, batch_dev_neighbors_view.data_handle(), - batch_neighbors_view.size(), - raft::resource::get_cuda_stream(res)); + batch.size() * curr_topk, + stream); } - // Optimize graph + // Optimize graph, passing device knn_graph to skip H-to-D copy inside optimize Phase 2. auto next_graph_size = curr_query_size; - cagra_graph = raft::make_host_matrix(0, 0); // delete existing grahp + cagra_graph = raft::make_host_matrix(0, 0); cagra_graph = raft::make_host_matrix(next_graph_size, next_graph_degree); - optimize( - res, neighbors_view, cagra_graph.view(), flag_last ? params.guarantee_connectivity : 0); + optimize(res, + neighbors_view, + cagra_graph.view(), + flag_last ? 
params.guarantee_connectivity : 0, + dev_knn_graph.data_handle()); } template (end - start).count(); RAFT_LOG_INFO("# VPQ compression time: %.3lf sec", (double)elapsed_ms / 1000); } + bool do_skip = true; while (true) { auto start = std::chrono::high_resolution_clock::now(); auto curr_query_size = std::min(2 * curr_graph_size, final_graph_size); @@ -2205,7 +2226,8 @@ auto iterative_build_graph( curr_itopk_size = curr_topk + 32; } - bool do_skip = false;//params.skip_graph_optimization && !flag_last; + do_skip = false;//params.skip_graph_optimization && !flag_last; + RAFT_LOG_INFO("# do_skip = %s", do_skip ? "true" : "false"); cuvs::neighbors::cagra::search_params search_params; search_params.algo = cuvs::neighbors::cagra::search_algo::AUTO; diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 69175f152f..89f9db082f 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -31,6 +31,7 @@ #include #include #include +#include #include namespace cuvs::neighbors::cagra::detail::graph { @@ -1143,7 +1144,8 @@ void optimize( raft::mdspan, raft::row_major, g_accessor> knn_graph, raft::host_matrix_view new_graph, const bool guarantee_connectivity = true, - const bool use_gpu = true) + const bool use_gpu = true, + const IdxT* d_knn_graph_ptr = nullptr) { RAFT_LOG_DEBUG( "# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1)); @@ -1248,11 +1250,18 @@ void optimize( RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r"); - // Copy knn_graph over to device if necessary - device_matrix_view_from_host d_input_graph( - res, - raft::make_host_matrix_view( - knn_graph.data_handle(), graph_size, knn_graph_degree)); + // Use device knn_graph directly if provided; otherwise copy from host. 
+ std::optional> d_input_graph_copy; + const IdxT* d_input_graph_handle; + if (d_knn_graph_ptr != nullptr) { + d_input_graph_handle = d_knn_graph_ptr; + } else { + d_input_graph_copy.emplace( + res, + raft::make_host_matrix_view( + knn_graph.data_handle(), graph_size, knn_graph_degree)); + d_input_graph_handle = d_input_graph_copy->data_handle(); + } constexpr int MAX_DEGREE = 1024; if (knn_graph_degree > MAX_DEGREE) { @@ -1273,7 +1282,7 @@ void optimize( for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { kern_prune <<>>( - d_input_graph.data_handle(), + d_input_graph_handle, graph_size, knn_graph_degree, output_graph_degree, From c540cbc5ba7a7522568851e5e24695d6981d134b Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Thu, 26 Feb 2026 03:37:40 -0800 Subject: [PATCH 064/119] use reconstructed queries and free the original dataset in cagraq iterative graph construction --- .../neighbors/detail/cagra/cagra_build.cuh | 88 +++++++++++++++++-- 1 file changed, 80 insertions(+), 8 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index f1f55c90a0..eb452fa445 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1977,6 +1977,58 @@ struct mmap_owner { size_t size_; }; +template +__global__ void kern_reconstruct_vpq_queries(const uint8_t* encoded_data, + uint32_t encoded_row_len, + const MathT* vq_codebook, + const MathT* pq_codebook, + uint32_t dim, + uint32_t pq_len, + uint64_t offset, + uint32_t batch_size, + T* output) +{ + const uint64_t batch_idx = blockIdx.x; + if (batch_idx >= batch_size) return; + const uint64_t vec_idx = offset + batch_idx; + const uint8_t* vec_data = encoded_data + vec_idx * encoded_row_len; + const uint32_t vq_code = *reinterpret_cast(vec_data); + const uint8_t* pq_codes = vec_data + sizeof(uint32_t); + const MathT* vq_centroid_ptr = vq_codebook + static_cast(vq_code) * dim; + + for (uint32_t d = 
threadIdx.x; d < dim; d += blockDim.x) { + uint32_t j = d / pq_len; + uint32_t k = d % pq_len; + float val = static_cast(vq_centroid_ptr[d]) + + static_cast(pq_codebook[static_cast(pq_codes[j]) * pq_len + k]); + output[batch_idx * dim + d] = static_cast(val); + } +} + +template +void reconstruct_vpq_queries(raft::resources const& res, + const vpq_dataset& vpq_dset, + uint64_t offset, + uint32_t batch_size, + raft::device_matrix_view output) +{ + const uint32_t dim = vpq_dset.dim(); + const uint32_t pq_len = vpq_dset.pq_len(); + const uint32_t threads = std::min(dim, 256u); + + kern_reconstruct_vpq_queries + <<>>( + vpq_dset.data.data_handle(), + vpq_dset.encoded_row_length(), + vpq_dset.vq_code_book.data_handle(), + vpq_dset.pq_code_book.data_handle(), + dim, + pq_len, + offset, + batch_size, + output.data_handle()); +} + template void search_to_device_graph(raft::resources const& res, const cuvs::neighbors::cagra::search_params& search_params, @@ -2192,13 +2244,13 @@ auto iterative_build_graph( bool use_device_graph = false; // Generate the compressed index once if compression is enabled + const uint64_t dataset_dim = dev_dataset.extent(1); std::optional> idx_opt; if (params.compression.has_value()) { auto start = std::chrono::high_resolution_clock::now(); RAFT_EXPECTS(params.metric == cuvs::distance::DistanceType::L2Expanded, "VPQ compression is only supported with L2Expanded distance mertric"); idx_opt.emplace(res, params.metric); - // idx_opt->update_graph(res, raft::make_const_mdspan(cagra_graph.view())); idx_opt->update_dataset( res, // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later @@ -2207,6 +2259,12 @@ auto iterative_build_graph( auto end = std::chrono::high_resolution_clock::now(); auto elapsed_ms = std::chrono::duration_cast(end - start).count(); RAFT_LOG_INFO("# VPQ compression time: %.3lf sec", (double)elapsed_ms / 1000); + + // Free the original dataset -- queries will be reconstructed from VPQ codes. 
+ dev_aligned_dataset.reset(); + RAFT_LOG_INFO( + "# Freed original dataset from device (%.1f MiB); queries will use VPQ reconstruction", + to_mib(final_graph_size * dataset_dim * sizeof(T))); } bool do_skip = true; while (true) { @@ -2236,12 +2294,10 @@ auto iterative_build_graph( search_params.max_iterations = 8; search_params.search_width = 1; - // Create an index (idx), a query view (dev_query_view), and a mdarray for - // search results (neighbors). - auto dev_dataset_view = raft::make_device_matrix_view( - dev_dataset.data_handle(), (int64_t)curr_graph_size, dev_dataset.extent(1)); - + // Create index and query views. if (!params.compression.has_value()) { + auto dev_dataset_view = raft::make_device_matrix_view( + dev_dataset.data_handle(), (int64_t)curr_graph_size, dev_dataset.extent(1)); if (use_device_graph) { idx_opt.emplace( res, params.metric, dev_dataset_view, raft::make_const_mdspan(dev_graph.view())); @@ -2258,8 +2314,24 @@ auto iterative_build_graph( } const auto& idx = *idx_opt; - auto dev_query_view = raft::make_device_matrix_view( - dev_dataset.data_handle(), (int64_t)curr_query_size, dev_dataset.extent(1)); + // When compression is enabled, reconstruct queries from VPQ codes instead of + // reading from the (freed) original dataset. + auto dev_reconstructed_queries = + params.compression.has_value() + ? raft::make_device_matrix(res, curr_query_size, dataset_dim) + : raft::make_device_matrix(res, 0, 0); + if (params.compression.has_value()) { + auto* vpq_dset = + dynamic_cast*>(&idx.data()); + RAFT_EXPECTS(vpq_dset != nullptr, "Expected VPQ dataset in compressed index"); + reconstruct_vpq_queries( + res, *vpq_dset, 0, curr_query_size, dev_reconstructed_queries.view()); + } + auto dev_query_view = params.compression.has_value() + ? 
raft::make_device_matrix_view( + dev_reconstructed_queries.data_handle(), (int64_t)curr_query_size, dataset_dim) + : raft::make_device_matrix_view( + dev_dataset.data_handle(), (int64_t)curr_query_size, dev_dataset.extent(1)); if (do_skip) { auto dev_graph_next = From 099a8d71537134c16f1d1f7530d4e00c5fb1198a Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Thu, 26 Feb 2026 05:40:46 -0800 Subject: [PATCH 065/119] moved edge selection to gpu --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 188 +++++++++++++----- 1 file changed, 136 insertions(+), 52 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 89f9db082f..a9b033c855 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -242,6 +242,53 @@ __global__ void kern_make_rev_graph(const IdxT* const dest_nodes, // [graph_ } } +template +__global__ void kern_select_edges(const uint8_t* const d_detour_count, + const IdxT* const d_knn_graph, + IdxT* const d_output_graph, + const uint32_t graph_size, + const uint32_t knn_graph_degree, + const uint32_t output_graph_degree, + uint32_t* const d_invalid_count) +{ + const uint64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i >= graph_size) return; + + const uint8_t* my_detour = d_detour_count + i * knn_graph_degree; + const IdxT* my_knn = d_knn_graph + i * knn_graph_degree; + IdxT* my_output = d_output_graph + i * output_graph_degree; + + uint32_t pk = 0; + uint32_t num_detour = 0; + for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) { + uint32_t next_num_detour = 0xFFFFFFFFu; + for (uint32_t k = 0; k < knn_graph_degree; k++) { + const uint32_t d = my_detour[k]; + if (d > num_detour) { next_num_detour = min(next_num_detour, d); } + if (d != num_detour) { continue; } + + const IdxT candidate = my_knn[k]; + bool dup = false; + for (uint32_t dk = 0; dk < pk; dk++) { + if (candidate == my_output[dk]) { 
+ dup = true; + break; + } + } + if (!dup && candidate < static_cast(graph_size)) { + my_output[pk] = candidate; + pk++; + } + if (pk >= output_graph_degree) break; + } + if (pk >= output_graph_degree) break; + if (next_num_detour == 0xFFFFFFFFu) break; + num_detour = next_num_detour; + } + + if (pk != output_graph_degree) { atomicAdd(d_invalid_count, 1); } +} + template __device__ __host__ LabelT get_root_label(IdxT i, const LabelT* label) { @@ -1299,8 +1346,6 @@ void optimize( raft::resource::sync_stream(res); RAFT_LOG_DEBUG("\n"); - raft::copy(res, detour_count.view(), raft::make_const_mdspan(d_detour_count.view())); - raft::copy(res, host_stats.view(), raft::make_const_mdspan(dev_stats.view())); num_keep = host_stats.data_handle()[0]; num_full = host_stats.data_handle()[1]; @@ -1314,6 +1359,45 @@ void optimize( (double)num_keep / graph_size, output_graph_degree, (double)num_full / graph_size * 100); + + // GPU edge selection: pick output_graph_degree edges per node with lowest detour counts. 
+ { + raft::common::nvtx::range select_scope( + "cagra::graph::optimize/prune/edge-selection-by-GPU"); + auto d_output_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); + auto d_invalid_count = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(1)); + raft::matrix::fill(res, d_invalid_count.view(), uint32_t(0)); + + const uint32_t select_threads = 256; + const uint32_t select_blocks = (graph_size + select_threads - 1) / select_threads; + kern_select_edges + <<>>( + d_detour_count.data_handle(), + d_input_graph_handle, + d_output_graph.data_handle(), + graph_size, + knn_graph_degree, + output_graph_degree, + d_invalid_count.data_handle()); + raft::resource::sync_stream(res); + + auto h_invalid_count = raft::make_host_vector(1); + raft::copy(res, h_invalid_count.view(), raft::make_const_mdspan(d_invalid_count.view())); + raft::resource::sync_stream(res); + RAFT_EXPECTS( + h_invalid_count.data_handle()[0] == 0, + "Could not generate an intermediate CAGRA graph because the initial kNN graph " + "contains too many invalid or duplicated neighbor nodes. 
(%u nodes failed)", + h_invalid_count.data_handle()[0]); + + raft::copy(output_graph_ptr, + d_output_graph.data_handle(), + graph_size * output_graph_degree, + raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + } } else { // Count 2-hop detours on CPU raft::common::nvtx::range block_scope( @@ -1325,66 +1409,66 @@ void optimize( const double time_2hop_count_end = cur_time(); RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec", time_2hop_count_end - time_2hop_count_start); - } - // Create pruned kNN graph - bool invalid_neighbor_list = false; + // Create pruned kNN graph + bool invalid_neighbor_list = false; #pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable - // count of the neighbors while increasing the target detourable count from zero. - uint64_t pk = 0; - uint32_t num_detour = 0; - for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) { - uint32_t next_num_detour = std::numeric_limits::max(); - for (uint64_t k = 0; k < knn_graph_degree; k++) { - const auto num_detour_k = detour_count(i, k); - // Find the detourable count to check in the next iteration - if (num_detour_k > num_detour) { - next_num_detour = std::min(static_cast(num_detour_k), next_num_detour); - } - - // Store the neighbor index if its detourable count is equal to `num_detour`. - if (num_detour_k != num_detour) { continue; } + for (uint64_t i = 0; i < graph_size; i++) { + // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable + // count of the neighbors while increasing the target detourable count from zero. 
+ uint64_t pk = 0; + uint32_t num_detour = 0; + for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) { + uint32_t next_num_detour = std::numeric_limits::max(); + for (uint64_t k = 0; k < knn_graph_degree; k++) { + const auto num_detour_k = detour_count(i, k); + // Find the detourable count to check in the next iteration + if (num_detour_k > num_detour) { + next_num_detour = std::min(static_cast(num_detour_k), next_num_detour); + } - // Check duplication and append - const auto candidate_node = knn_graph(i, k); - bool dup = false; - for (uint32_t dk = 0; dk < pk; dk++) { - if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) { - dup = true; - break; + // Store the neighbor index if its detourable count is equal to `num_detour`. + if (num_detour_k != num_detour) { continue; } + + // Check duplication and append + const auto candidate_node = knn_graph(i, k); + bool dup = false; + for (uint32_t dk = 0; dk < pk; dk++) { + if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) { + dup = true; + break; + } } - } - if (!dup && candidate_node < graph_size) { - output_graph_ptr[i * output_graph_degree + pk] = candidate_node; - pk += 1; + if (!dup && candidate_node < graph_size) { + output_graph_ptr[i * output_graph_degree + pk] = candidate_node; + pk += 1; + } + if (pk >= output_graph_degree) break; } if (pk >= output_graph_degree) break; - } - if (pk >= output_graph_degree) break; - if (next_num_detour == std::numeric_limits::max()) { - // There are no valid edges enough in the initial kNN graph. Break the loop here and catch - // the error at the next validation (pk != output_graph_degree). - break; + if (next_num_detour == std::numeric_limits::max()) { + // There are no valid edges enough in the initial kNN graph. Break the loop here and + // catch the error at the next validation (pk != output_graph_degree). 
+ break; + } + num_detour = next_num_detour; + } + if (pk != output_graph_degree) { + RAFT_LOG_DEBUG( + "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for " + "node %lu in the rank-based node reranking process", + output_graph_degree, + i); + invalid_neighbor_list = true; } - num_detour = next_num_detour; - } - if (pk != output_graph_degree) { - RAFT_LOG_DEBUG( - "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for " - "node %lu in the rank-based node reranking process", - output_graph_degree, - i); - invalid_neighbor_list = true; } + RAFT_EXPECTS( + !invalid_neighbor_list, + "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too " + "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many " + "overflows occur during the norm computation between the dataset vectors."); } - RAFT_EXPECTS( - !invalid_neighbor_list, - "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too " - "many invalid or duplicated neighbor nodes. 
This error can occur, for example, if too many " - "overflows occur during the norm computation between the dataset vectors."); const double time_prune_end = cur_time(); RAFT_LOG_DEBUG("# Pruning time: %.1lf ms", (time_prune_end - time_prune_start) * 1000.0); From 764aa64ce5dd411b1abe3907574f31010fb214ec Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Thu, 26 Feb 2026 07:37:09 -0800 Subject: [PATCH 066/119] Moved reverse graph construction to GPU --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 138 +++++++++++++----- 1 file changed, 101 insertions(+), 37 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index a9b033c855..e8a73b46cb 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -242,6 +242,20 @@ __global__ void kern_make_rev_graph(const IdxT* const dest_nodes, // [graph_ } } +template +__global__ void kern_extract_column(const IdxT* const d_matrix, + IdxT* const d_column, + const uint32_t n_rows, + const uint32_t n_cols, + const uint32_t col_idx) +{ + const uint32_t tid = threadIdx.x + (blockDim.x * blockIdx.x); + const uint32_t tnum = blockDim.x * gridDim.x; + for (uint32_t i = tid; i < n_rows; i += tnum) { + d_column[i] = d_matrix[col_idx + (static_cast(n_cols) * i)]; + } +} + template __global__ void kern_select_edges(const uint8_t* const d_detour_count, const IdxT* const d_knn_graph, @@ -1234,6 +1248,10 @@ void optimize( } } + // Device pruned graph: populated by GPU edge selection, reused by GPU reverse graph. + auto d_pruned_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(0, 0)); + { raft::common::nvtx::range block_scope( "cagra::graph::optimize/prune"); @@ -1361,11 +1379,11 @@ void optimize( (double)num_full / graph_size * 100); // GPU edge selection: pick output_graph_degree edges per node with lowest detour counts. 
+ d_pruned_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); { raft::common::nvtx::range select_scope( "cagra::graph::optimize/prune/edge-selection-by-GPU"); - auto d_output_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); auto d_invalid_count = raft::make_device_mdarray( res, large_tmp_mr, raft::make_extents(1)); raft::matrix::fill(res, d_invalid_count.view(), uint32_t(0)); @@ -1376,7 +1394,7 @@ void optimize( <<>>( d_detour_count.data_handle(), d_input_graph_handle, - d_output_graph.data_handle(), + d_pruned_graph.data_handle(), graph_size, knn_graph_degree, output_graph_degree, @@ -1393,7 +1411,7 @@ void optimize( h_invalid_count.data_handle()[0]); raft::copy(output_graph_ptr, - d_output_graph.data_handle(), + d_pruned_graph.data_handle(), graph_size * output_graph_degree, raft::resource::get_cuda_stream(res)); raft::resource::sync_stream(res); @@ -1485,48 +1503,94 @@ void optimize( // const double time_make_start = cur_time(); - device_matrix_view_from_host d_rev_graph(res, rev_graph.view()); - raft::matrix::fill(res, - raft::make_device_vector_view( - d_rev_graph.data_handle(), graph_size * output_graph_degree), - IdxT(-1)); - - auto d_rev_graph_count = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size)); - raft::matrix::fill(res, d_rev_graph_count.view(), uint32_t(0)); + if (d_pruned_graph.extent(0) > 0) { + // GPU path: d_pruned_graph is on device; extract columns on device to preserve + // column-priority ordering (earlier columns get priority in the reverse graph). 
+ auto d_rev_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); + raft::matrix::fill(res, + raft::make_device_vector_view( + d_rev_graph.data_handle(), graph_size * output_graph_degree), + IdxT(-1)); + + auto d_rev_graph_count = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(graph_size)); + raft::matrix::fill(res, d_rev_graph_count.view(), uint32_t(0)); - auto dest_nodes = raft::make_host_vector(graph_size); - auto d_dest_nodes = - raft::make_device_mdarray(res, large_tmp_mr, raft::make_extents(graph_size)); + auto d_dest_nodes = + raft::make_device_mdarray(res, large_tmp_mr, raft::make_extents(graph_size)); - for (uint64_t k = 0; k < output_graph_degree; k++) { -#pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - // dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)]; - dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)]; + for (uint64_t k = 0; k < output_graph_degree; k++) { + dim3 ext_threads(256, 1, 1); + dim3 ext_blocks(std::min(static_cast((graph_size + 255) / 256), 65535u), 1, 1); + kern_extract_column + <<>>( + d_pruned_graph.data_handle(), + d_dest_nodes.data_handle(), + graph_size, + output_graph_degree, + k); + + dim3 threads(256, 1, 1); + dim3 blocks(1024, 1, 1); + kern_make_rev_graph<<>>( + d_dest_nodes.data_handle(), + d_rev_graph.data_handle(), + d_rev_graph_count.data_handle(), + graph_size, + output_graph_degree); } raft::resource::sync_stream(res); - raft::copy(res, d_dest_nodes.view(), raft::make_const_mdspan(dest_nodes.view())); + d_pruned_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(0, 0)); - dim3 threads(256, 1, 1); - dim3 blocks(1024, 1, 1); - kern_make_rev_graph<<>>( - d_dest_nodes.data_handle(), - d_rev_graph.data_handle(), - d_rev_graph_count.data_handle(), - graph_size, - output_graph_degree); - RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u \r", k, 
output_graph_degree); - } + raft::copy(res, rev_graph.view(), raft::make_const_mdspan(d_rev_graph.view())); + raft::copy(res, rev_graph_count.view(), raft::make_const_mdspan(d_rev_graph_count.view())); + } else { + // CPU fallback: per-column H-to-D copy approach. + device_matrix_view_from_host d_rev_graph(res, rev_graph.view()); + raft::matrix::fill(res, + raft::make_device_vector_view( + d_rev_graph.data_handle(), graph_size * output_graph_degree), + IdxT(-1)); + + auto d_rev_graph_count = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(graph_size)); + raft::matrix::fill(res, d_rev_graph_count.view(), uint32_t(0)); - raft::resource::sync_stream(res); - RAFT_LOG_DEBUG("\n"); + auto dest_nodes = raft::make_host_vector(graph_size); + auto d_dest_nodes = + raft::make_device_mdarray(res, large_tmp_mr, raft::make_extents(graph_size)); - if (d_rev_graph.allocated_memory()) { - raft::copy(res, rev_graph.view(), raft::make_const_mdspan(d_rev_graph.view())); + for (uint64_t k = 0; k < output_graph_degree; k++) { +#pragma omp parallel for + for (uint64_t i = 0; i < graph_size; i++) { + dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)]; + } + raft::resource::sync_stream(res); + + raft::copy(res, d_dest_nodes.view(), raft::make_const_mdspan(dest_nodes.view())); + + dim3 threads(256, 1, 1); + dim3 blocks(1024, 1, 1); + kern_make_rev_graph<<>>( + d_dest_nodes.data_handle(), + d_rev_graph.data_handle(), + d_rev_graph_count.data_handle(), + graph_size, + output_graph_degree); + RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u \r", k, output_graph_degree); + } + + raft::resource::sync_stream(res); + RAFT_LOG_DEBUG("\n"); + + if (d_rev_graph.allocated_memory()) { + raft::copy(res, rev_graph.view(), raft::make_const_mdspan(d_rev_graph.view())); + } + raft::copy(res, rev_graph_count.view(), raft::make_const_mdspan(d_rev_graph_count.view())); } - raft::copy(res, rev_graph_count.view(), raft::make_const_mdspan(d_rev_graph_count.view())); const 
double time_make_end = cur_time(); RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms", From ecf3b1db009a78adaeea268ff51d1c2d79763eb9 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 27 Feb 2026 15:39:01 +0000 Subject: [PATCH 067/119] extract prune into separate function --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 497 +++++++++--------- 1 file changed, 245 insertions(+), 252 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 8705f555b6..70cd29aa4a 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -1343,6 +1343,246 @@ void count_2hop_detours(raft::host_matrix_view k } } +// +// Prune unimportant edges based on 2-hop detour counts. +// +// The edge to be retained is determined without explicitly considering distance or angle. +// Suppose the edge is the k-th edge of some node-A to node-B (A->B). Among the edges +// originating at node-A, there are k-1 edges shorter than the edge A->B. Each of these +// k-1 edges are connected to a different k-1 nodes. Among these k-1 nodes, count the +// number of nodes with edges to node-B, which is the number of 2-hop detours for the +// edge A->B. Once the number of 2-hop detours has been counted for all edges, the +// specified number of edges are picked up for each node, starting with the edge with +// the lowest number of 2-hop detours. 
+// +template +void prune_graph(raft::resources const& res, + InputMatrixView knn_graph, + OutputMatrixView output_graph, + bool use_gpu) +{ + const uint64_t graph_size = output_graph.extent(0); + const uint64_t knn_graph_degree = knn_graph.extent(1); + const uint64_t output_graph_degree = output_graph.extent(1); + auto output_graph_ptr = output_graph.data_handle(); + + auto large_tmp_mr = raft::resource::get_large_workspace_resource(res); + + uint32_t batch_size = + std::min(static_cast(graph_size), static_cast(256 * 1024)); + const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size; + + bool use_gpu_prune = use_gpu; + if (use_gpu_prune) { + try { + auto d_detour_count = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(batch_size, knn_graph_degree)); + auto d_num_no_detour_edges = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(batch_size)); + auto d_output_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(batch_size, output_graph_degree)); + auto d_input_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(graph_size, knn_graph_degree)); + } catch (std::bad_alloc& e) { + RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU"); + use_gpu_prune = false; + } catch (raft::logic_error& e) { + RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU (logic error)"); + use_gpu_prune = false; + } + } + + raft::common::nvtx::range block_scope( + "cagra::graph::optimize/prune"); + const double time_prune_start = cur_time(); + + if (use_gpu_prune) { + RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r"); + + constexpr int MAX_DEGREE = 1024; + if (knn_graph_degree > MAX_DEGREE) { + RAFT_FAIL( + "The degree of input knn graph is too large (%zu). 
" + "It must be equal to or smaller than %d.", + knn_graph_degree, + MAX_DEGREE); + } + + const double prune_start = cur_time(); + + uint64_t num_keep __attribute__((unused)) = 0; + uint64_t num_full __attribute__((unused)) = 0; + auto dev_stats = raft::make_device_vector(res, 2); + auto host_stats = raft::make_host_vector(2); + RAFT_CUDA_TRY(cudaMemsetAsync( + dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, raft::resource::get_cuda_stream(res))); + + device_matrix_view_from_host d_input_graph( + res, + raft::make_host_matrix_view( + knn_graph.data_handle(), graph_size, knn_graph_degree)); + + auto d_detour_count = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(batch_size, knn_graph_degree)); + auto d_num_no_detour_edges = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(batch_size)); + auto d_output_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(batch_size, output_graph_degree)); + auto d_invalid_neighbor_list = raft::make_device_scalar(res, 0u); + + for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { + RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(), + 0xff, + batch_size * knn_graph_degree * sizeof(uint8_t), + raft::resource::get_cuda_stream(res))); + + RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(), + 0x00, + batch_size * sizeof(uint32_t), + raft::resource::get_cuda_stream(res))); + + const dim3 threads_prune(32, 1, 1); + const dim3 blocks_prune(batch_size, 1, 1); + const size_t prune_smem_size = knn_graph_degree * sizeof(IdxT); + kern_prune + <<>>( + d_input_graph.data_handle(), + graph_size, + knn_graph_degree, + output_graph_degree, + batch_size, + i_batch, + d_detour_count.data_handle(), + d_num_no_detour_edges.data_handle(), + dev_stats.data_handle()); + + const size_t select_smem_size = (knn_graph_degree) * (sizeof(uint16_t) + sizeof(IdxT)); + const dim3 threads_select(32, 1, 1); + const dim3 blocks_select(batch_size, 1, 1); + 
kern_select_smallest_detour_neighbors + <<>>( + d_input_graph.data_handle(), + graph_size, + knn_graph_degree, + output_graph_degree, + d_detour_count.data_handle(), + d_output_graph.data_handle(), + batch_size, + i_batch, + d_invalid_neighbor_list.data_handle()); + + size_t copy_size = + std::min(static_cast(batch_size), graph_size - i_batch * batch_size) * + output_graph_degree; + raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree, + d_output_graph.data_handle(), + copy_size, + raft::resource::get_cuda_stream(res)); + + raft::resource::sync_stream(res); + RAFT_LOG_DEBUG( + "# Pruning kNN Graph on GPUs (%.1lf %%)\r", + (double)std::min((i_batch + 1) * batch_size, graph_size) / graph_size * 100); + } + raft::resource::sync_stream(res); + RAFT_LOG_DEBUG("\n"); + + uint32_t invalid_neighbor_list = 0; + raft::copy(&invalid_neighbor_list, + d_invalid_neighbor_list.data_handle(), + 1, + raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + RAFT_EXPECTS( + invalid_neighbor_list == 0, + "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too " + "many invalid or duplicated neighbor nodes. 
This error can occur, for example, if too many " + "overflows occur during the norm computation between the dataset vectors."); + + raft::copy( + host_stats.data_handle(), dev_stats.data_handle(), 2, raft::resource::get_cuda_stream(res)); + num_keep = host_stats.data_handle()[0]; + num_full = host_stats.data_handle()[1]; + + const double prune_end = cur_time(); + RAFT_LOG_DEBUG( + "# Time for pruning on GPU: %.1lf sec, " + "avg_no_detour_edges_per_node: %.2lf/%u, " + "nodes_with_no_detour_at_all_edges: %.1lf%%", + prune_end - prune_start, + (double)num_keep / graph_size, + output_graph_degree, + (double)num_full / graph_size * 100); + } else { + auto detour_count = raft::make_host_matrix(graph_size, knn_graph_degree); + + { + raft::common::nvtx::range block_scope( + "cagra::graph::optimize/prune/2-hop-counting-by-CPU"); + const double time_2hop_count_start = cur_time(); + + auto knn_graph_view = raft::make_host_matrix_view( + knn_graph.data_handle(), knn_graph.extent(0), knn_graph.extent(1)); + count_2hop_detours(knn_graph_view, detour_count.view()); + + const double time_2hop_count_end = cur_time(); + RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec", + time_2hop_count_end - time_2hop_count_start); + } + bool invalid_neighbor_list = false; +#pragma omp parallel for + for (uint64_t i = 0; i < graph_size; i++) { + uint64_t pk = 0; + uint32_t num_detour = 0; + for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) { + uint32_t next_num_detour = std::numeric_limits::max(); + for (uint64_t k = 0; k < knn_graph_degree; k++) { + const auto num_detour_k = detour_count(i, k); + if (num_detour_k > num_detour) { + next_num_detour = std::min(static_cast(num_detour_k), next_num_detour); + } + + if (num_detour_k != num_detour) { continue; } + + const auto candidate_node = knn_graph(i, k); + bool dup = false; + for (uint32_t dk = 0; dk < pk; dk++) { + if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) { + dup = true; + 
break; + } + } + if (!dup && candidate_node < graph_size) { + output_graph_ptr[i * output_graph_degree + pk] = candidate_node; + pk += 1; + } + if (pk >= output_graph_degree) break; + } + if (pk >= output_graph_degree) break; + + if (next_num_detour == std::numeric_limits::max()) { break; } + num_detour = next_num_detour; + } + if (pk != output_graph_degree) { + RAFT_LOG_DEBUG( + "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for " + "node %lu in the rank-based node reranking process", + output_graph_degree, + i); + invalid_neighbor_list = true; + } + } + RAFT_EXPECTS( + !invalid_neighbor_list, + "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too " + "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many " + "overflows occur during the norm computation between the dataset vectors."); + } + + const double time_prune_end = cur_time(); + RAFT_LOG_DEBUG("# Pruning time: %.1lf ms", (time_prune_end - time_prune_start) * 1000.0); +} + // TODO allow pinned input for both knn_graph and new_graph template (graph_size), static_cast(256 * 1024)); - const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size; - - // - // If the available device memory is insufficient, do not use the GPU to count - // the number of 2-hop detours, but use the CPU. 
- // - // TODO: we should decide on a global strategy for this in a single place - // it comes down to input memory type and available memory which data should be copied to GPU - bool _use_gpu_prune = use_gpu; - if (_use_gpu_prune) { - try { - auto d_detour_count = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(batch_size, knn_graph_degree)); - auto d_num_no_detour_edges = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(batch_size)); - auto d_output_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(batch_size, output_graph_degree)); - // TODO we also want to consider pinned memory in case we are short on memory - auto d_input_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size, knn_graph_degree)); - } catch (std::bad_alloc& e) { - RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU"); - _use_gpu_prune = false; - } catch (raft::logic_error& e) { - RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU (logic error)"); - _use_gpu_prune = false; - } - } - - { - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/prune"); - const double time_prune_start = cur_time(); - - // - // Prune unimportant edges. - // - // The edge to be retained is determined without explicitly considering - // distance or angle. Suppose the edge is the k-th edge of some node-A to - // node-B (A->B). Among the edges originating at node-A, there are k-1 edges - // shorter than the edge A->B. Each of these k-1 edges are connected to a - // different k-1 nodes. Among these k-1 nodes, count the number of nodes with - // edges to node-B, which is the number of 2-hop detours for the edge A->B. - // Once the number of 2-hop detours has been counted for all edges, the - // specified number of edges are picked up for each node, starting with the - // edge with the lowest number of 2-hop detours. 
- // - if (_use_gpu_prune) { - // Pruning on GPU - RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r"); - - constexpr int MAX_DEGREE = 1024; - if (knn_graph_degree > MAX_DEGREE) { - RAFT_FAIL( - "The degree of input knn graph is too large (%zu). " - "It must be equal to or smaller than %d.", - knn_graph_degree, - MAX_DEGREE); - } - - const double prune_start = cur_time(); - - uint64_t num_keep __attribute__((unused)) = 0; - uint64_t num_full __attribute__((unused)) = 0; - auto dev_stats = raft::make_device_vector(res, 2); - auto host_stats = raft::make_host_vector(2); - RAFT_CUDA_TRY(cudaMemsetAsync( - dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, raft::resource::get_cuda_stream(res))); - - // Copy knn_graph over to device if necessary - // TODO: should we use pinned memory if we have issues fitting on GPU? - device_matrix_view_from_host d_input_graph( - res, - raft::make_host_matrix_view( - knn_graph.data_handle(), graph_size, knn_graph_degree)); - - // data structures per batch - auto d_detour_count = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(batch_size, knn_graph_degree)); - auto d_num_no_detour_edges = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(batch_size)); - auto d_output_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(batch_size, output_graph_degree)); - auto d_invalid_neighbor_list = raft::make_device_scalar(res, 0u); - - for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { - // initialize the detour_count and num_no_detour_edges for the current batch - RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(), - 0xff, - batch_size * knn_graph_degree * sizeof(uint8_t), - raft::resource::get_cuda_stream(res))); - - RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(), - 0x00, - batch_size * sizeof(uint32_t), - raft::resource::get_cuda_stream(res))); - - // count 2-hop detours for the current batch - const dim3 threads_prune(32, 1, 1); - const dim3 
blocks_prune(batch_size, 1, 1); - const size_t prune_smem_size = knn_graph_degree * sizeof(IdxT); - kern_prune - <<>>( - d_input_graph.data_handle(), - graph_size, - knn_graph_degree, - output_graph_degree, - batch_size, - i_batch, - d_detour_count.data_handle(), - d_num_no_detour_edges.data_handle(), - dev_stats.data_handle()); - - // select smallest-detour neighbors for the current batch - const size_t select_smem_size = (knn_graph_degree) * (sizeof(uint16_t) + sizeof(IdxT)); - const dim3 threads_select(32, 1, 1); - const dim3 blocks_select(batch_size, 1, 1); - kern_select_smallest_detour_neighbors - <<>>(d_input_graph.data_handle(), - graph_size, - knn_graph_degree, - output_graph_degree, - d_detour_count.data_handle(), - d_output_graph.data_handle(), - batch_size, - i_batch, - d_invalid_neighbor_list.data_handle()); - - size_t copy_size = - std::min(static_cast(batch_size), graph_size - i_batch * batch_size) * - output_graph_degree; - raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree, - d_output_graph.data_handle(), - copy_size, - raft::resource::get_cuda_stream(res)); - - raft::resource::sync_stream(res); - RAFT_LOG_DEBUG( - "# Pruning kNN Graph on GPUs (%.1lf %%)\r", - (double)std::min((i_batch + 1) * batch_size, graph_size) / graph_size * 100); - } - raft::resource::sync_stream(res); - RAFT_LOG_DEBUG("\n"); - - uint32_t invalid_neighbor_list = 0; - raft::copy(&invalid_neighbor_list, - d_invalid_neighbor_list.data_handle(), - 1, - raft::resource::get_cuda_stream(res)); - raft::resource::sync_stream(res); - RAFT_EXPECTS( - invalid_neighbor_list == 0, - "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too " - "many invalid or duplicated neighbor nodes. 
This error can occur, for example, if too many " - "overflows occur during the norm computation between the dataset vectors."); - - raft::copy( - host_stats.data_handle(), dev_stats.data_handle(), 2, raft::resource::get_cuda_stream(res)); - num_keep = host_stats.data_handle()[0]; - num_full = host_stats.data_handle()[1]; - - const double prune_end = cur_time(); - RAFT_LOG_DEBUG( - "# Time for pruning on GPU: %.1lf sec, " - "avg_no_detour_edges_per_node: %.2lf/%u, " - "nodes_with_no_detour_at_all_edges: %.1lf%%", - prune_end - prune_start, - (double)num_keep / graph_size, - output_graph_degree, - (double)num_full / graph_size * 100); - } else { - // Pruning on CPU - auto detour_count = raft::make_host_matrix(graph_size, knn_graph_degree); - - { - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/prune/2-hop-counting-by-CPU"); - const double time_2hop_count_start = cur_time(); - - count_2hop_detours(knn_graph, detour_count.view()); - - const double time_2hop_count_end = cur_time(); - RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec", - time_2hop_count_end - time_2hop_count_start); - } - bool invalid_neighbor_list = false; -#pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable - // count of the neighbors while increasing the target detourable count from zero. - uint64_t pk = 0; - uint32_t num_detour = 0; - for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) { - uint32_t next_num_detour = std::numeric_limits::max(); - for (uint64_t k = 0; k < knn_graph_degree; k++) { - const auto num_detour_k = detour_count(i, k); - // Find the detourable count to check in the next iteration - if (num_detour_k > num_detour) { - next_num_detour = std::min(static_cast(num_detour_k), next_num_detour); - } - - // Store the neighbor index if its detourable count is equal to `num_detour`. 
- if (num_detour_k != num_detour) { continue; } - - // Check duplication and append - const auto candidate_node = knn_graph(i, k); - bool dup = false; - for (uint32_t dk = 0; dk < pk; dk++) { - if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) { - dup = true; - break; - } - } - if (!dup && candidate_node < graph_size) { - output_graph_ptr[i * output_graph_degree + pk] = candidate_node; - pk += 1; - } - if (pk >= output_graph_degree) break; - } - if (pk >= output_graph_degree) break; - - if (next_num_detour == std::numeric_limits::max()) { - // There are no valid edges enough in the initial kNN graph. Break the loop here and - // catch the error at the next validation (pk != output_graph_degree). - break; - } - num_detour = next_num_detour; - } - if (pk != output_graph_degree) { - RAFT_LOG_DEBUG( - "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for " - "node %lu in the rank-based node reranking process", - output_graph_degree, - i); - invalid_neighbor_list = true; - } - } - RAFT_EXPECTS( - !invalid_neighbor_list, - "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too " - "many invalid or duplicated neighbor nodes. 
This error can occur, for example, if too many " - "overflows occur during the norm computation between the dataset vectors."); - } - - const double time_prune_end = cur_time(); - RAFT_LOG_DEBUG("# Pruning time: %.1lf ms", (time_prune_end - time_prune_start) * 1000.0); - } + prune_graph(res, knn_graph, new_graph, use_gpu); auto rev_graph = raft::make_host_matrix(graph_size, output_graph_degree); auto rev_graph_count = raft::make_host_vector(graph_size); @@ -1760,6 +1749,10 @@ void optimize( // Create a boolean variable on the GPU using RAFT device allocator auto d_check_num_protected_edges = raft::make_device_scalar(res, true); + uint32_t batch_size = + std::min(static_cast(graph_size), static_cast(256 * 1024)); + const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size; + const dim3 threads_merge(32, 1, 1); const dim3 blocks_merge(batch_size, 1, 1); const size_t merge_smem_size = (output_graph_degree + output_graph_degree) * sizeof(IdxT); From 972d278c77c05add60e4518150e00b0f0f7898cf Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 2 Mar 2026 14:41:22 +0000 Subject: [PATCH 068/119] extract optimize components --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 1174 +++++++++-------- 1 file changed, 616 insertions(+), 558 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 70cd29aa4a..713b03ca20 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -674,6 +674,316 @@ void shift_array(T* array, uint64_t num) array[i] = array[i - 1]; } } + +template +void log_replaced_edges_stats(const IdxT* output_graph_ptr, + uint64_t graph_size, + uint64_t output_graph_degree) +{ + raft::common::nvtx::range block_scope( + "cagra::graph::optimize/stats"); + uint64_t num_replaced_edges = 0; +#pragma omp parallel for reduction(+ : num_replaced_edges) + for (uint64_t i = 0; i < graph_size; i++) { + for (uint64_t k = 0; k < 
output_graph_degree; k++) { + const uint64_t j = output_graph_ptr[k + (output_graph_degree * i)]; + const uint64_t pos = + pos_in_array(j, output_graph_ptr + (output_graph_degree * i), output_graph_degree); + if (pos == output_graph_degree) { num_replaced_edges += 1; } + } + } + RAFT_LOG_DEBUG("# Average number of replaced edges per node: %.2f", + (double)num_replaced_edges / graph_size); +} + +template +void log_incoming_edges_histogram(const IdxT* output_graph_ptr, + uint64_t graph_size, + uint64_t output_graph_degree) +{ + raft::common::nvtx::range block_scope( + "cagra::graph::optimize/check_edges"); + auto in_edge_count = raft::make_host_vector(graph_size); + auto in_edge_count_ptr = in_edge_count.data_handle(); +#pragma omp parallel for + for (uint64_t i = 0; i < graph_size; i++) { + in_edge_count_ptr[i] = 0; + } +#pragma omp parallel for + for (uint64_t i = 0; i < graph_size; i++) { + for (uint64_t k = 0; k < output_graph_degree; k++) { + const uint64_t j = output_graph_ptr[k + (output_graph_degree * i)]; + if (j >= graph_size) continue; +#pragma omp atomic + in_edge_count_ptr[j] += 1; + } + } + auto hist = raft::make_host_vector(output_graph_degree); + auto hist_ptr = hist.data_handle(); + for (uint64_t k = 0; k < output_graph_degree; k++) { + hist_ptr[k] = 0; + } +#pragma omp parallel for + for (uint64_t i = 0; i < graph_size; i++) { + uint32_t count = in_edge_count_ptr[i]; + if (count >= output_graph_degree) continue; +#pragma omp atomic + hist_ptr[count] += 1; + } + RAFT_LOG_DEBUG("# Histogram for number of incoming edges\n"); + uint32_t sum_hist = 0; + for (uint64_t k = 0; k < output_graph_degree; k++) { + sum_hist += hist_ptr[k]; + RAFT_LOG_DEBUG("# %3lu, %8u, %lf, (%8u, %lf)\n", + k, + hist_ptr[k], + (double)hist_ptr[k] / graph_size, + sum_hist, + (double)sum_hist / graph_size); + } +} + +template +void check_duplicates_and_out_of_range(const IdxT* output_graph_ptr, + uint64_t graph_size, + uint64_t output_graph_degree) +{ + raft::common::nvtx::range 
block_scope( + "cagra::graph::optimize/check_duplicates"); + uint64_t num_dup = 0; + uint64_t num_oor = 0; +#pragma omp parallel for reduction(+ : num_dup) reduction(+ : num_oor) + for (uint64_t i = 0; i < graph_size; i++) { + auto my_out_graph = output_graph_ptr + (output_graph_degree * i); + for (uint32_t j = 0; j < output_graph_degree; j++) { + const auto neighbor_a = my_out_graph[j]; + + if (neighbor_a > graph_size) { + num_oor++; + continue; + } + + for (uint32_t k = j + 1; k < output_graph_degree; k++) { + const auto neighbor_b = my_out_graph[k]; + if (neighbor_a == neighbor_b) { num_dup++; } + } + } + } + RAFT_EXPECTS( + num_dup == 0, "%lu duplicated node(s) are found in the generated CAGRA graph", num_dup); + RAFT_EXPECTS( + num_oor == 0, "%lu out-of-range index node(s) are found in the generated CAGRA graph", num_oor); +} + +template +void merge_graph_gpu(raft::resources const& res, + IdxT* output_graph_ptr, + const IdxT* d_rev_graph, + uint32_t* d_rev_graph_count, + const IdxT* mst_graph_ptr, + const uint32_t* mst_graph_num_edges_ptr, + uint64_t graph_size, + uint64_t output_graph_degree, + bool guarantee_connectivity) +{ + raft::common::nvtx::range block_scope( + "cagra::graph::optimize/combine"); + + const double merge_graph_start = cur_time(); + + device_matrix_view_from_host d_output_graph( + res, + raft::make_host_matrix_view(output_graph_ptr, graph_size, output_graph_degree)); + + auto d_check_num_protected_edges = raft::make_device_scalar(res, true); + + uint32_t batch_size = + std::min(static_cast(graph_size), static_cast(256 * 1024)); + const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size; + + const dim3 threads_merge(32, 1, 1); + const dim3 blocks_merge(batch_size, 1, 1); + const size_t merge_smem_size = (output_graph_degree + output_graph_degree) * sizeof(IdxT); + for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { + kern_merge_graph + <<>>( + d_output_graph.data_handle(), + d_rev_graph, + d_rev_graph_count, + 
static_cast(graph_size), + static_cast(output_graph_degree), + mst_graph_ptr, + static_cast(output_graph_degree), + mst_graph_num_edges_ptr, + batch_size, + i_batch, + guarantee_connectivity, + d_check_num_protected_edges.data_handle()); + } + + bool check_num_protected_edges = true; + raft::copy(&check_num_protected_edges, + d_check_num_protected_edges.data_handle(), + 1, + raft::resource::get_cuda_stream(res)); + + if (d_output_graph.allocated_memory()) { + raft::copy(output_graph_ptr, + d_output_graph.data_handle(), + graph_size * output_graph_degree, + raft::resource::get_cuda_stream(res)); + } + + raft::resource::sync_stream(res); + + const auto merge_graph_end = cur_time(); + RAFT_EXPECTS(check_num_protected_edges, + "Failed to merge the MST, pruned, and reverse edge graphs. " + "Some nodes have too " + "many MST optimization edges."); + + RAFT_LOG_DEBUG("# Time for merging graphs: %.1lf ms", + (merge_graph_end - merge_graph_start) * 1000.0); +} + +template +void merge_graph_cpu(IdxT* output_graph_ptr, + const IdxT* rev_graph_ptr, + const uint32_t* rev_graph_count_ptr, + const IdxT* mst_graph_ptr, + const uint32_t* mst_graph_num_edges_ptr, + uint64_t graph_size, + uint64_t output_graph_degree, + bool guarantee_connectivity) +{ + raft::common::nvtx::range block_scope( + "cagra::graph::optimize/combine"); + + const double time_replace_start = cur_time(); + + bool check_num_protected_edges = true; +#pragma omp parallel for + for (uint64_t i = 0; i < graph_size; i++) { + auto my_rev_graph = rev_graph_ptr + (output_graph_degree * i); + auto my_out_graph = output_graph_ptr + (output_graph_degree * i); + + std::vector temp_output_neighbor_list; + if (guarantee_connectivity) { + temp_output_neighbor_list.resize(output_graph_degree); + my_out_graph = temp_output_neighbor_list.data(); + const auto mst_graph_num_edges = mst_graph_num_edges_ptr[i]; + + for (uint32_t j = 0; j < mst_graph_num_edges; j++) { + my_out_graph[j] = mst_graph_ptr[i * output_graph_degree + j]; + } 
+ + for (uint32_t pruned_j = 0, output_j = mst_graph_num_edges; + (pruned_j < output_graph_degree) && (output_j < output_graph_degree); + pruned_j++) { + const auto v = output_graph_ptr[output_graph_degree * i + pruned_j]; + + bool dup = false; + for (uint32_t m = 0; m < output_j; m++) { + if (v == my_out_graph[m]) { + dup = true; + break; + } + } + + if (!dup) { + my_out_graph[output_j] = v; + output_j++; + } + } + } + + const auto num_protected_edges = + std::max(mst_graph_num_edges_ptr[i], output_graph_degree / 2); + if (num_protected_edges > output_graph_degree) { check_num_protected_edges = false; } + if (num_protected_edges == output_graph_degree) continue; + + auto kr = std::min(rev_graph_count_ptr[i], output_graph_degree); + while (kr) { + kr -= 1; + if (my_rev_graph[kr] < graph_size) { + uint64_t pos = pos_in_array(my_rev_graph[kr], my_out_graph, output_graph_degree); + if (pos < num_protected_edges) { continue; } + uint64_t num_shift = pos - num_protected_edges; + if (pos >= output_graph_degree) { + num_shift = output_graph_degree - num_protected_edges - 1; + } + shift_array(my_out_graph + num_protected_edges, num_shift); + my_out_graph[num_protected_edges] = my_rev_graph[kr]; + } + } + + if (guarantee_connectivity) { + for (uint32_t j = 0; j < output_graph_degree; j++) { + output_graph_ptr[(output_graph_degree * i) + j] = my_out_graph[j]; + } + } + } + RAFT_EXPECTS(check_num_protected_edges, + "Failed to merge the MST, pruned, and reverse edge graphs. 
Some nodes have too " + "many MST optimization edges."); + + const double time_replace_end = cur_time(); + RAFT_LOG_DEBUG("# Replacing edges time: %.1lf ms", + (time_replace_end - time_replace_start) * 1000.0); +} + +template +void make_reverse_graph_gpu(raft::resources const& res, + IdxT* d_rev_graph, + uint32_t* d_rev_graph_count, + raft::host_matrix_view new_graph) +{ + const uint64_t graph_size = new_graph.extent(0); + const uint64_t output_graph_degree = new_graph.extent(1); + const IdxT* output_graph_ptr = new_graph.data_handle(); + + raft::common::nvtx::range block_scope( + "cagra::graph::optimize/reverse"); + + auto large_tmp_mr = raft::resource::get_large_workspace_resource(res); + auto dest_nodes = raft::make_host_vector(graph_size); + auto d_dest_nodes = + raft::make_device_mdarray(res, large_tmp_mr, raft::make_extents(graph_size)); + + RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph, + 0xff, + graph_size * output_graph_degree * sizeof(IdxT), + raft::resource::get_cuda_stream(res))); + + RAFT_CUDA_TRY(cudaMemsetAsync( + d_rev_graph_count, 0x00, graph_size * sizeof(uint32_t), raft::resource::get_cuda_stream(res))); + + for (uint64_t k = 0; k < output_graph_degree; k++) { +#pragma omp parallel for + for (uint64_t i = 0; i < graph_size; i++) { + dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)]; + } + raft::resource::sync_stream(res); + + raft::copy(d_dest_nodes.data_handle(), + dest_nodes.data_handle(), + graph_size, + raft::resource::get_cuda_stream(res)); + + dim3 threads(256, 1, 1); + dim3 blocks(1024, 1, 1); + kern_make_rev_graph<<>>( + d_dest_nodes.data_handle(), + d_rev_graph, + d_rev_graph_count, + static_cast(graph_size), + static_cast(output_graph_degree)); + RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %lu \r", k, output_graph_degree); + } + + raft::resource::sync_stream(res); + RAFT_LOG_DEBUG("\n"); +} } // namespace template k // specified number of edges are picked up for each node, starting with the edge with // the lowest 
number of 2-hop detours. // -template -void prune_graph(raft::resources const& res, - InputMatrixView knn_graph, - OutputMatrixView output_graph, - bool use_gpu) +template +void prune_graph_gpu(raft::resources const& res, + IdxT* knn_graph_ptr, + uint64_t graph_size, + uint64_t knn_graph_degree, + IdxT* output_graph_ptr, + uint64_t output_graph_degree) { - const uint64_t graph_size = output_graph.extent(0); - const uint64_t knn_graph_degree = knn_graph.extent(1); - const uint64_t output_graph_degree = output_graph.extent(1); - auto output_graph_ptr = output_graph.data_handle(); - - auto large_tmp_mr = raft::resource::get_large_workspace_resource(res); + auto default_ws_mr = raft::resource::get_workspace_resource(res); uint32_t batch_size = std::min(static_cast(graph_size), static_cast(256 * 1024)); const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size; - bool use_gpu_prune = use_gpu; - if (use_gpu_prune) { - try { - auto d_detour_count = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(batch_size, knn_graph_degree)); - auto d_num_no_detour_edges = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(batch_size)); - auto d_output_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(batch_size, output_graph_degree)); - auto d_input_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size, knn_graph_degree)); - } catch (std::bad_alloc& e) { - RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU"); - use_gpu_prune = false; - } catch (raft::logic_error& e) { - RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU (logic error)"); - use_gpu_prune = false; - } - } - - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/prune"); - const double time_prune_start = cur_time(); + RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r"); - if (use_gpu_prune) { - RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r"); + constexpr int MAX_DEGREE = 1024; + if (knn_graph_degree > 
MAX_DEGREE) { + RAFT_FAIL( + "The degree of input knn graph is too large (%zu). " + "It must be equal to or smaller than %d.", + knn_graph_degree, + MAX_DEGREE); + } - constexpr int MAX_DEGREE = 1024; - if (knn_graph_degree > MAX_DEGREE) { - RAFT_FAIL( - "The degree of input knn graph is too large (%zu). " - "It must be equal to or smaller than %d.", + const double prune_start = cur_time(); + + uint64_t num_keep __attribute__((unused)) = 0; + uint64_t num_full __attribute__((unused)) = 0; + auto dev_stats = raft::make_device_vector(res, 2); + auto host_stats = raft::make_host_vector(2); + RAFT_CUDA_TRY(cudaMemsetAsync( + dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, raft::resource::get_cuda_stream(res))); + + device_matrix_view_from_host d_input_graph( + res, raft::make_host_matrix_view(knn_graph_ptr, graph_size, knn_graph_degree)); + + auto d_detour_count = raft::make_device_mdarray( + res, default_ws_mr, raft::make_extents(batch_size, knn_graph_degree)); + auto d_num_no_detour_edges = raft::make_device_mdarray( + res, default_ws_mr, raft::make_extents(batch_size)); + auto d_output_graph = raft::make_device_mdarray( + res, default_ws_mr, raft::make_extents(batch_size, output_graph_degree)); + auto d_invalid_neighbor_list = raft::make_device_scalar(res, 0u); + + for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { + RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(), + 0xff, + batch_size * knn_graph_degree * sizeof(uint8_t), + raft::resource::get_cuda_stream(res))); + + RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(), + 0x00, + batch_size * sizeof(uint32_t), + raft::resource::get_cuda_stream(res))); + + const dim3 threads_prune(32, 1, 1); + const dim3 blocks_prune(batch_size, 1, 1); + const size_t prune_smem_size = knn_graph_degree * sizeof(IdxT); + kern_prune + <<>>( + d_input_graph.data_handle(), + graph_size, knn_graph_degree, - MAX_DEGREE); - } + output_graph_degree, + batch_size, + i_batch, + d_detour_count.data_handle(), 
+ d_num_no_detour_edges.data_handle(), + dev_stats.data_handle()); + + const size_t select_smem_size = (knn_graph_degree) * (sizeof(uint16_t) + sizeof(IdxT)); + const dim3 threads_select(32, 1, 1); + const dim3 blocks_select(batch_size, 1, 1); + kern_select_smallest_detour_neighbors + <<>>( + d_input_graph.data_handle(), + graph_size, + knn_graph_degree, + output_graph_degree, + d_detour_count.data_handle(), + d_output_graph.data_handle(), + batch_size, + i_batch, + d_invalid_neighbor_list.data_handle()); + + size_t copy_size = + std::min(static_cast(batch_size), graph_size - i_batch * batch_size) * + output_graph_degree; + raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree, + d_output_graph.data_handle(), + copy_size, + raft::resource::get_cuda_stream(res)); - const double prune_start = cur_time(); + raft::resource::sync_stream(res); + RAFT_LOG_DEBUG( + "# Pruning kNN Graph on GPUs (%.1lf %%)\r", + (double)std::min((i_batch + 1) * batch_size, graph_size) / graph_size * 100); + } + raft::resource::sync_stream(res); + RAFT_LOG_DEBUG("\n"); - uint64_t num_keep __attribute__((unused)) = 0; - uint64_t num_full __attribute__((unused)) = 0; - auto dev_stats = raft::make_device_vector(res, 2); - auto host_stats = raft::make_host_vector(2); - RAFT_CUDA_TRY(cudaMemsetAsync( - dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, raft::resource::get_cuda_stream(res))); + uint32_t invalid_neighbor_list = 0; + raft::copy(&invalid_neighbor_list, + d_invalid_neighbor_list.data_handle(), + 1, + raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + RAFT_EXPECTS( + invalid_neighbor_list == 0, + "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too " + "many invalid or duplicated neighbor nodes. 
This error can occur, for example, if too many " + "overflows occur during the norm computation between the dataset vectors."); - device_matrix_view_from_host d_input_graph( - res, - raft::make_host_matrix_view( - knn_graph.data_handle(), graph_size, knn_graph_degree)); + raft::copy( + host_stats.data_handle(), dev_stats.data_handle(), 2, raft::resource::get_cuda_stream(res)); + num_keep = host_stats.data_handle()[0]; + num_full = host_stats.data_handle()[1]; - auto d_detour_count = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(batch_size, knn_graph_degree)); - auto d_num_no_detour_edges = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(batch_size)); - auto d_output_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(batch_size, output_graph_degree)); - auto d_invalid_neighbor_list = raft::make_device_scalar(res, 0u); - - for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { - RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(), - 0xff, - batch_size * knn_graph_degree * sizeof(uint8_t), - raft::resource::get_cuda_stream(res))); - - RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(), - 0x00, - batch_size * sizeof(uint32_t), - raft::resource::get_cuda_stream(res))); - - const dim3 threads_prune(32, 1, 1); - const dim3 blocks_prune(batch_size, 1, 1); - const size_t prune_smem_size = knn_graph_degree * sizeof(IdxT); - kern_prune - <<>>( - d_input_graph.data_handle(), - graph_size, - knn_graph_degree, - output_graph_degree, - batch_size, - i_batch, - d_detour_count.data_handle(), - d_num_no_detour_edges.data_handle(), - dev_stats.data_handle()); - - const size_t select_smem_size = (knn_graph_degree) * (sizeof(uint16_t) + sizeof(IdxT)); - const dim3 threads_select(32, 1, 1); - const dim3 blocks_select(batch_size, 1, 1); - kern_select_smallest_detour_neighbors - <<>>( - d_input_graph.data_handle(), - graph_size, - knn_graph_degree, - output_graph_degree, - 
d_detour_count.data_handle(), - d_output_graph.data_handle(), - batch_size, - i_batch, - d_invalid_neighbor_list.data_handle()); - - size_t copy_size = - std::min(static_cast(batch_size), graph_size - i_batch * batch_size) * - output_graph_degree; - raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree, - d_output_graph.data_handle(), - copy_size, - raft::resource::get_cuda_stream(res)); + const double prune_end = cur_time(); + RAFT_LOG_DEBUG( + "# Time for pruning on GPU: %.1lf sec, " + "avg_no_detour_edges_per_node: %.2lf/%u, " + "nodes_with_no_detour_at_all_edges: %.1lf%%", + prune_end - prune_start, + (double)num_keep / graph_size, + output_graph_degree, + (double)num_full / graph_size * 100); +} - raft::resource::sync_stream(res); - RAFT_LOG_DEBUG( - "# Pruning kNN Graph on GPUs (%.1lf %%)\r", - (double)std::min((i_batch + 1) * batch_size, graph_size) / graph_size * 100); - } - raft::resource::sync_stream(res); - RAFT_LOG_DEBUG("\n"); +template +void prune_graph_cpu(IdxT* knn_graph_ptr, + uint64_t graph_size, + uint64_t knn_graph_degree, + IdxT* output_graph_ptr, + uint64_t output_graph_degree) +{ + auto detour_count = raft::make_host_matrix(graph_size, knn_graph_degree); - uint32_t invalid_neighbor_list = 0; - raft::copy(&invalid_neighbor_list, - d_invalid_neighbor_list.data_handle(), - 1, - raft::resource::get_cuda_stream(res)); - raft::resource::sync_stream(res); - RAFT_EXPECTS( - invalid_neighbor_list == 0, - "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too " - "many invalid or duplicated neighbor nodes. 
This error can occur, for example, if too many " - "overflows occur during the norm computation between the dataset vectors."); - - raft::copy( - host_stats.data_handle(), dev_stats.data_handle(), 2, raft::resource::get_cuda_stream(res)); - num_keep = host_stats.data_handle()[0]; - num_full = host_stats.data_handle()[1]; - - const double prune_end = cur_time(); - RAFT_LOG_DEBUG( - "# Time for pruning on GPU: %.1lf sec, " - "avg_no_detour_edges_per_node: %.2lf/%u, " - "nodes_with_no_detour_at_all_edges: %.1lf%%", - prune_end - prune_start, - (double)num_keep / graph_size, - output_graph_degree, - (double)num_full / graph_size * 100); - } else { - auto detour_count = raft::make_host_matrix(graph_size, knn_graph_degree); + auto knn_graph_view = + raft::make_host_matrix_view(knn_graph_ptr, graph_size, knn_graph_degree); - { - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/prune/2-hop-counting-by-CPU"); - const double time_2hop_count_start = cur_time(); + { + raft::common::nvtx::range block_scope( + "cagra::graph::optimize/prune/2-hop-counting-by-CPU"); + const double time_2hop_count_start = cur_time(); - auto knn_graph_view = raft::make_host_matrix_view( - knn_graph.data_handle(), knn_graph.extent(0), knn_graph.extent(1)); - count_2hop_detours(knn_graph_view, detour_count.view()); + count_2hop_detours(knn_graph_view, detour_count.view()); - const double time_2hop_count_end = cur_time(); - RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec", - time_2hop_count_end - time_2hop_count_start); - } - bool invalid_neighbor_list = false; + const double time_2hop_count_end = cur_time(); + RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec", + time_2hop_count_end - time_2hop_count_start); + } + bool invalid_neighbor_list = false; #pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - uint64_t pk = 0; - uint32_t num_detour = 0; - for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) { - 
uint32_t next_num_detour = std::numeric_limits::max(); - for (uint64_t k = 0; k < knn_graph_degree; k++) { - const auto num_detour_k = detour_count(i, k); - if (num_detour_k > num_detour) { - next_num_detour = std::min(static_cast(num_detour_k), next_num_detour); - } + for (uint64_t i = 0; i < graph_size; i++) { + uint64_t pk = 0; + uint32_t num_detour = 0; + for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) { + uint32_t next_num_detour = std::numeric_limits::max(); + for (uint64_t k = 0; k < knn_graph_degree; k++) { + const auto num_detour_k = detour_count(i, k); + if (num_detour_k > num_detour) { + next_num_detour = std::min(static_cast(num_detour_k), next_num_detour); + } - if (num_detour_k != num_detour) { continue; } + if (num_detour_k != num_detour) { continue; } - const auto candidate_node = knn_graph(i, k); - bool dup = false; - for (uint32_t dk = 0; dk < pk; dk++) { - if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) { - dup = true; - break; - } - } - if (!dup && candidate_node < graph_size) { - output_graph_ptr[i * output_graph_degree + pk] = candidate_node; - pk += 1; + const auto candidate_node = knn_graph_view(i, k); + bool dup = false; + for (uint32_t dk = 0; dk < pk; dk++) { + if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) { + dup = true; + break; } - if (pk >= output_graph_degree) break; + } + if (!dup && candidate_node < graph_size) { + output_graph_ptr[i * output_graph_degree + pk] = candidate_node; + pk += 1; } if (pk >= output_graph_degree) break; - - if (next_num_detour == std::numeric_limits::max()) { break; } - num_detour = next_num_detour; - } - if (pk != output_graph_degree) { - RAFT_LOG_DEBUG( - "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for " - "node %lu in the rank-based node reranking process", - output_graph_degree, - i); - invalid_neighbor_list = true; } + if (pk >= output_graph_degree) break; + + if (next_num_detour == 
std::numeric_limits::max()) { break; } + num_detour = next_num_detour; + } + if (pk != output_graph_degree) { + RAFT_LOG_DEBUG( + "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for " + "node %lu in the rank-based node reranking process", + output_graph_degree, + i); + invalid_neighbor_list = true; } - RAFT_EXPECTS( - !invalid_neighbor_list, - "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too " - "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many " - "overflows occur during the norm computation between the dataset vectors."); } + RAFT_EXPECTS( + !invalid_neighbor_list, + "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too " + "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many " + "overflows occur during the norm computation between the dataset vectors."); +} - const double time_prune_end = cur_time(); - RAFT_LOG_DEBUG("# Pruning time: %.1lf ms", (time_prune_end - time_prune_start) * 1000.0); +template +bool is_gpu_accessible(T* ptr) +{ + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, ptr)); + return attr.devicePointer != nullptr; } // TODO allow pinned input for both knn_graph and new_graph @@ -1600,7 +1893,7 @@ void optimize( // large temporary memory for large arrays, e.g. everything >= O(graph_size) auto large_tmp_mr = raft::resource::get_large_workspace_resource(res); // temporary memory for small arrays, e.g. 
everything <= O(batchsize * graph_degree) - // auto tmp_mr = raft::resource::get_tmp_workspace_resource(res); + auto default_ws_mr = raft::resource::get_workspace_resource(res); RAFT_EXPECTS(knn_graph.extent(0) == new_graph.extent(0), "Each input array is expected to have the same number of rows"); @@ -1611,409 +1904,174 @@ void optimize( const uint64_t output_graph_degree = new_graph.extent(1); const uint64_t graph_size = new_graph.extent(0); // auto input_graph_ptr = knn_graph.data_handle(); - auto output_graph_ptr = new_graph.data_handle(); raft::common::nvtx::range fun_scope( "cagra::graph::optimize(%zu, %zu, %u)", graph_size, knn_graph_degree, output_graph_degree); - // MST optimization - auto mst_graph = raft::make_pinned_matrix(res, 0, 0); - auto mst_graph_num_edges = raft::make_pinned_vector(res, graph_size); - auto mst_graph_num_edges_ptr = mst_graph_num_edges.data_handle(); + // check if input and output are both device accessible + // in this case we assume data to be ONLY device accessible and not host accessible + // furthermore we ensure all large allocations to go to the large workspace resource + // and all small allocations to go to the default workspace resource + bool inout_device_accessible = false; + { + bool input_device_accessible = is_gpu_accessible(knn_graph.data_handle()); + bool output_device_accessible = is_gpu_accessible(new_graph.data_handle()); + RAFT_EXPECTS(input_device_accessible == output_device_accessible, + "Input and output must be either both device accessible or both host accessible"); + inout_device_accessible = input_device_accessible && output_device_accessible; + } + // MST optimization + // currently, only using GPU path for MST optimization + auto p_mst_graph = raft::make_pinned_matrix(res, 0, 0); + auto p_mst_graph_num_edges = raft::make_pinned_vector(res, graph_size); + auto p_mst_graph_num_edges_ptr = p_mst_graph_num_edges.data_handle(); #pragma omp parallel for for (uint64_t i = 0; i < graph_size; i++) { - 
mst_graph_num_edges_ptr[i] = 0; + p_mst_graph_num_edges_ptr[i] = 0; } if (guarantee_connectivity) { raft::common::nvtx::range block_scope( "cagra::graph::optimize/check_connectivity"); - mst_graph = raft::make_pinned_matrix( + p_mst_graph = raft::make_pinned_matrix( res, graph_size, output_graph_degree); RAFT_LOG_INFO("MST optimization is used to guarantee graph connectivity."); - mst_optimization(res, knn_graph, mst_graph.view(), mst_graph_num_edges.view(), use_gpu); + mst_optimization( + res, knn_graph, p_mst_graph.view(), p_mst_graph_num_edges.view(), use_gpu); for (uint64_t i = 0; i < graph_size; i++) { if (i < 8 || i >= graph_size - 8) { - RAFT_LOG_DEBUG("# mst_graph_num_edges_ptr[%lu]: %u\n", i, mst_graph_num_edges_ptr[i]); + RAFT_LOG_DEBUG("# p_mst_graph_num_edges_ptr[%lu]: %u\n", i, p_mst_graph_num_edges_ptr[i]); } } } - prune_graph(res, knn_graph, new_graph, use_gpu); - - auto rev_graph = raft::make_host_matrix(graph_size, output_graph_degree); - auto rev_graph_count = raft::make_host_vector(graph_size); - - bool _use_gpu_rev_graph = use_gpu; - // TODO: should we use pinned memory if we have issues fitting on GPU? 
- if (_use_gpu_rev_graph) { + // prune graph -- will use GPU path if possible, otherwise CPU path + // we only need to check in case input is not alreadydevice accessible + bool use_gpu_prune = use_gpu; + if (!inout_device_accessible) { try { - auto d_rev_graph_count = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size)); - auto d_dest_nodes = - raft::make_device_mdarray(res, large_tmp_mr, raft::make_extents(graph_size)); - auto d_rev_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); - auto d_output_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); + auto d_input_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(graph_size, knn_graph_degree)); } catch (std::bad_alloc& e) { - RAFT_LOG_DEBUG("Insufficient memory for reverse graph on GPU"); - _use_gpu_rev_graph = false; + RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU"); + use_gpu_prune = false; } catch (raft::logic_error& e) { - RAFT_LOG_DEBUG("Insufficient memory for reverse graph on GPU (logic error)"); - _use_gpu_rev_graph = false; + RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU (logic error)"); + use_gpu_prune = false; } } - - const double time_make_start = cur_time(); - if (_use_gpu_rev_graph) { - // - // Make reverse graph on GPU - // - auto d_rev_graph_count = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size)); - - device_matrix_view_from_host d_rev_graph(res, rev_graph.view()); - device_matrix_view_from_host d_output_graph( + if (use_gpu_prune) { + // should be noop in case input is already device accessible + device_matrix_view_from_host d_input_graph( res, raft::make_host_matrix_view( - output_graph_ptr, graph_size, output_graph_degree)); - - { - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/reverse"); - auto dest_nodes = raft::make_host_vector(graph_size); - auto 
d_dest_nodes = - raft::make_device_mdarray(res, large_tmp_mr, raft::make_extents(graph_size)); - - RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph.data_handle(), - 0xff, - graph_size * output_graph_degree * sizeof(IdxT), - raft::resource::get_cuda_stream(res))); - - RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph_count.data_handle(), - 0x00, - graph_size * sizeof(uint32_t), - raft::resource::get_cuda_stream(res))); - - for (uint64_t k = 0; k < output_graph_degree; k++) { -#pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - // dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)]; - dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)]; - } - raft::resource::sync_stream(res); - - raft::copy(d_dest_nodes.data_handle(), - dest_nodes.data_handle(), - graph_size, - raft::resource::get_cuda_stream(res)); - - dim3 threads(256, 1, 1); - dim3 blocks(1024, 1, 1); - kern_make_rev_graph<<>>( - d_dest_nodes.data_handle(), - d_rev_graph.data_handle(), - d_rev_graph_count.data_handle(), - graph_size, - output_graph_degree); - RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u \r", k, output_graph_degree); - } - - raft::resource::sync_stream(res); - RAFT_LOG_DEBUG("\n"); - - if (d_rev_graph.allocated_memory()) { - raft::copy(rev_graph.data_handle(), - d_rev_graph.data_handle(), - graph_size * output_graph_degree, - raft::resource::get_cuda_stream(res)); - } - raft::copy(rev_graph_count.data_handle(), - d_rev_graph_count.data_handle(), - graph_size, - raft::resource::get_cuda_stream(res)); - - const double time_make_end = cur_time(); - RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms", - (time_make_end - time_make_start) * 1000.0); - } - - { - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/combine"); - - // Merging the prunned graph and the reverse graph - const double merge_graph_start = cur_time(); - - // Create a boolean variable on the GPU using RAFT device allocator - auto d_check_num_protected_edges = 
raft::make_device_scalar(res, true); - - uint32_t batch_size = - std::min(static_cast(graph_size), static_cast(256 * 1024)); - const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size; - - const dim3 threads_merge(32, 1, 1); - const dim3 blocks_merge(batch_size, 1, 1); - const size_t merge_smem_size = (output_graph_degree + output_graph_degree) * sizeof(IdxT); - for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { - kern_merge_graph - <<>>( - d_output_graph.data_handle(), - d_rev_graph.data_handle(), - d_rev_graph_count.data_handle(), - graph_size, - output_graph_degree, - mst_graph.data_handle(), - output_graph_degree, - mst_graph_num_edges_ptr, - batch_size, - i_batch, - guarantee_connectivity, - d_check_num_protected_edges.data_handle()); - } - - bool check_num_protected_edges = true; - raft::copy(&check_num_protected_edges, - d_check_num_protected_edges.data_handle(), - 1, - raft::resource::get_cuda_stream(res)); - - if (d_output_graph.allocated_memory()) { - raft::copy(output_graph_ptr, - d_output_graph.data_handle(), - graph_size * output_graph_degree, - raft::resource::get_cuda_stream(res)); - } - - raft::resource::sync_stream(res); + knn_graph.data_handle(), graph_size, knn_graph_degree)); - const auto merge_graph_end = cur_time(); - RAFT_EXPECTS(check_num_protected_edges, - "Failed to merge the MST, pruned, and reverse edge graphs. 
" - "Some nodes have too " - "many MST optimization edges."); + prune_graph_gpu(res, + d_input_graph.data_handle(), + graph_size, + knn_graph_degree, + new_graph.data_handle(), + output_graph_degree); - RAFT_LOG_DEBUG("# Time for merging graphs: %.1lf ms", - (merge_graph_end - merge_graph_start) * 1000.0); - } } else { - { - // Make reverse graph on CPU - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/reverse"); - - auto rev_graph_ptr = rev_graph.data_handle(); - auto rev_graph_count_ptr = rev_graph_count.data_handle(); - -#pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - rev_graph_count_ptr[i] = 0; - } - - for (uint32_t k = 0; k < output_graph_degree; k++) { -#pragma omp parallel for - for (uint64_t src_id = 0; src_id < graph_size; src_id++) { - const IdxT dest_id = - output_graph_ptr[k + (static_cast(output_graph_degree) * src_id)]; - if (dest_id >= graph_size) continue; - uint32_t pos; -#pragma omp atomic capture - pos = rev_graph_count_ptr[dest_id]++; - if (pos < output_graph_degree) { - rev_graph_ptr[(static_cast(output_graph_degree) * dest_id) + pos] = - static_cast(src_id); - } - } - } + prune_graph_cpu(knn_graph.data_handle(), + graph_size, + knn_graph_degree, + new_graph.data_handle(), + output_graph_degree); + } - const double time_make_end = cur_time(); - RAFT_LOG_DEBUG("# Making reverse graph time (CPU): %.1lf ms", - (time_make_end - time_make_start) * 1000.0); - } + // reverse graph creation will always use the GPU + auto d_rev_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); - { - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/combine"); - // - // Create search graphs from MST and pruned and reverse graphs - // - const double time_replace_start = cur_time(); + // This should use the default workspace resource for random access / atomics + auto d_rev_graph_count = raft::make_device_mdarray( + res, default_ws_mr, 
raft::make_extents(graph_size)); - bool check_num_protected_edges = true; -#pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - auto my_rev_graph = rev_graph.data_handle() + (output_graph_degree * i); - auto my_out_graph = output_graph_ptr + (output_graph_degree * i); - - // If guarantee_connectivity == true, use a temporal list to merge the neighbor lists of the - // graphs. - std::vector temp_output_neighbor_list; - if (guarantee_connectivity) { - temp_output_neighbor_list.resize(output_graph_degree); - my_out_graph = temp_output_neighbor_list.data(); - const auto mst_graph_num_edges = mst_graph_num_edges_ptr[i]; - - // Set MST graph edges - for (uint32_t j = 0; j < mst_graph_num_edges; j++) { - my_out_graph[j] = mst_graph(i, j); - } - - // Set pruned graph edges - for (uint32_t pruned_j = 0, output_j = mst_graph_num_edges; - (pruned_j < output_graph_degree) && (output_j < output_graph_degree); - pruned_j++) { - const auto v = output_graph_ptr[output_graph_degree * i + pruned_j]; - - // duplication check - bool dup = false; - for (uint32_t m = 0; m < output_j; m++) { - if (v == my_out_graph[m]) { - dup = true; - break; - } - } + const double time_make_start = cur_time(); - if (!dup) { - my_out_graph[output_j] = v; - output_j++; - } - } - } + make_reverse_graph_gpu( + res, d_rev_graph.data_handle(), d_rev_graph_count.data_handle(), new_graph); - const auto num_protected_edges = - std::max(mst_graph_num_edges_ptr[i], output_graph_degree / 2); - if (num_protected_edges > output_graph_degree) { check_num_protected_edges = false; } - if (num_protected_edges == output_graph_degree) continue; - - // Replace some edges of the output graph with edges of the reverse graph. 
- auto kr = std::min(rev_graph_count.data_handle()[i], output_graph_degree); - while (kr) { - kr -= 1; - if (my_rev_graph[kr] < graph_size) { - uint64_t pos = pos_in_array(my_rev_graph[kr], my_out_graph, output_graph_degree); - if (pos < num_protected_edges) { continue; } - uint64_t num_shift = pos - num_protected_edges; - if (pos >= output_graph_degree) { - num_shift = output_graph_degree - num_protected_edges - 1; - } - shift_array(my_out_graph + num_protected_edges, num_shift); - my_out_graph[num_protected_edges] = my_rev_graph[kr]; - } - } + const double time_make_end = cur_time(); + RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms", + (time_make_end - time_make_start) * 1000.0); - // If guarantee_connectivity == true, move the output neighbor list from the temporal list - // to the output list. If false, the copy is not needed because my_out_graph is a pointer to - // the output buffer. - if (guarantee_connectivity) { - for (uint32_t j = 0; j < output_graph_degree; j++) { - output_graph_ptr[(output_graph_degree * i) + j] = my_out_graph[j]; - } - } - } - RAFT_EXPECTS(check_num_protected_edges, - "Failed to merge the MST, pruned, and reverse edge graphs. 
Some nodes have too " - "many MST optimization edges."); - - const double time_replace_end = cur_time(); - RAFT_LOG_DEBUG("# Replacing edges time: %.1lf ms", - (time_replace_end - time_replace_start) * 1000.0); + // merge graph -- will use GPU path if possible, otherwise CPU path + // we only need to check in case output is not already device accessible + bool use_gpu_merge = use_gpu; + if (!inout_device_accessible) { + try { + auto d_new_graph = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); + } catch (std::bad_alloc& e) { + RAFT_LOG_DEBUG("Insufficient memory for merging on GPU"); + use_gpu_merge = false; + } catch (raft::logic_error& e) { + RAFT_LOG_DEBUG("Insufficient memory for merging on GPU (logic error)"); + use_gpu_merge = false; } } - // Check stats - { - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/stats"); - /* stats */ - uint64_t num_replaced_edges = 0; -#pragma omp parallel for reduction(+ : num_replaced_edges) - for (uint64_t i = 0; i < graph_size; i++) { - for (uint64_t k = 0; k < output_graph_degree; k++) { - const uint64_t j = output_graph_ptr[k + (output_graph_degree * i)]; - const uint64_t pos = - pos_in_array(j, output_graph_ptr + (output_graph_degree * i), output_graph_degree); - if (pos == output_graph_degree) { num_replaced_edges += 1; } - } + if (use_gpu_merge) { + // should be noop in case output is already device accessible + device_matrix_view_from_host d_new_graph( + res, + raft::make_host_matrix_view( + new_graph.data_handle(), graph_size, output_graph_degree)); + + merge_graph_gpu(res, + d_new_graph.data_handle(), + d_rev_graph.data_handle(), + d_rev_graph_count.data_handle(), + p_mst_graph.data_handle(), + p_mst_graph_num_edges.data_handle(), + graph_size, + output_graph_degree, + guarantee_connectivity); + + if (d_new_graph.allocated_memory()) { + raft::copy(new_graph.data_handle(), + d_new_graph.data_handle(), + graph_size * output_graph_degree, + 
raft::resource::get_cuda_stream(res)); } - RAFT_LOG_DEBUG("# Average number of replaced edges per node: %.2f", - (double)num_replaced_edges / graph_size); - } + } else { + auto rev_graph = raft::make_host_matrix(graph_size, output_graph_degree); + auto rev_graph_count = raft::make_host_vector(graph_size); + auto mst_graph = raft::make_host_matrix(0, 0); + raft::copy(rev_graph.data_handle(), + d_rev_graph.data_handle(), + graph_size * output_graph_degree, + raft::resource::get_cuda_stream(res)); + raft::copy(rev_graph_count.data_handle(), + d_rev_graph_count.data_handle(), + graph_size, + raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); - // Check number of incoming edges - { - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/check_edges"); - auto in_edge_count = raft::make_host_vector(graph_size); - auto in_edge_count_ptr = in_edge_count.data_handle(); -#pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - in_edge_count_ptr[i] = 0; - } -#pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - for (uint64_t k = 0; k < output_graph_degree; k++) { - const uint64_t j = output_graph_ptr[k + (output_graph_degree * i)]; - if (j >= graph_size) continue; -#pragma omp atomic - in_edge_count_ptr[j] += 1; - } - } - auto hist = raft::make_host_vector(output_graph_degree); - auto hist_ptr = hist.data_handle(); - for (uint64_t k = 0; k < output_graph_degree; k++) { - hist_ptr[k] = 0; - } -#pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - uint32_t count = in_edge_count_ptr[i]; - if (count >= output_graph_degree) continue; -#pragma omp atomic - hist_ptr[count] += 1; - } - RAFT_LOG_DEBUG("# Histogram for number of incoming edges\n"); - uint32_t sum_hist = 0; - for (uint64_t k = 0; k < output_graph_degree; k++) { - sum_hist += hist_ptr[k]; - RAFT_LOG_DEBUG("# %3lu, %8u, %lf, (%8u, %lf)\n", - k, - hist_ptr[k], - (double)hist_ptr[k] / graph_size, - sum_hist, - (double)sum_hist / 
graph_size); - } + merge_graph_cpu(new_graph.data_handle(), + rev_graph.data_handle(), + rev_graph_count.data_handle(), + p_mst_graph.data_handle(), + p_mst_graph_num_edges_ptr, + graph_size, + output_graph_degree, + guarantee_connectivity); } - // Check duplication and out-of-range indices - { - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/check_duplicates"); - uint64_t num_dup = 0; - uint64_t num_oor = 0; -#pragma omp parallel for reduction(+ : num_dup) reduction(+ : num_oor) - for (uint64_t i = 0; i < graph_size; i++) { - auto my_out_graph = output_graph_ptr + (output_graph_degree * i); - for (uint32_t j = 0; j < output_graph_degree; j++) { - const auto neighbor_a = my_out_graph[j]; + if (!inout_device_accessible) { + // following checks require host access + log_replaced_edges_stats(new_graph.data_handle(), graph_size, output_graph_degree); - // Check oor - if (neighbor_a > graph_size) { - num_oor++; - continue; - } + log_incoming_edges_histogram(new_graph.data_handle(), graph_size, output_graph_degree); - // Check duplication - for (uint32_t k = j + 1; k < output_graph_degree; k++) { - const auto neighbor_b = my_out_graph[k]; - if (neighbor_a == neighbor_b) { num_dup++; } - } - } - } - RAFT_EXPECTS( - num_dup == 0, "%lu duplicated node(s) are found in the generated CAGRA graph", num_dup); - RAFT_EXPECTS(num_oor == 0, - "%lu out-of-range index node(s) are found in the generated CAGRA graph", - num_oor); + check_duplicates_and_out_of_range( + new_graph.data_handle(), graph_size, output_graph_degree); + } else { + RAFT_LOG_DEBUG("Output graph is on GPU, skipping checks"); } } From 5e9ebc53950e472c8ee0035f280905dc5b1984b5 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 2 Mar 2026 17:34:17 +0000 Subject: [PATCH 069/119] enable both host/device inout graphs for optimize --- .../neighbors/detail/cagra/cagra_build.cuh | 23 ++--- cpp/src/neighbors/detail/cagra/graph_core.cuh | 97 +++++++++++-------- cpp/src/neighbors/detail/cagra/utils.hpp 
| 18 +++- 3 files changed, 86 insertions(+), 52 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index a1c16250c5..009362aa96 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -822,8 +822,6 @@ inline std::pair optimize_workspace_size(size_t n_rows, size_t index_size, bool mst_optimize = false) { - // TODO: MODIFY!! - // MST optimization memory (host only) size_t mst_host = n_rows * index_size; // mst_graph_num_edges if (mst_optimize) { @@ -835,27 +833,26 @@ inline std::pair optimize_workspace_size(size_t n_rows, // Prune stage memory // We neglect 8 bytes (both on host and device) for stats - size_t prune_host = n_rows * intermediate_degree * sizeof(uint8_t); // detour count + size_t batch_size = std::min(static_cast(256 * 1024), n_rows); - size_t prune_dev = n_rows * intermediate_degree * 1; // detour count (uint8_t) - prune_dev += n_rows * sizeof(uint32_t); // d_num_detour_edges - prune_dev += n_rows * intermediate_degree * index_size; // d_input_graph + size_t prune_dev = batch_size * intermediate_degree * 1; // detour count (uint8_t) + prune_dev += batch_size * sizeof(uint32_t); // d_num_detour_edges + prune_dev += n_rows * intermediate_degree * index_size; // d_input_graph // Reverse graph stage memory - size_t rev_host = n_rows * graph_degree * index_size; // rev_graph - rev_host += n_rows * sizeof(uint32_t); // rev_graph_count - rev_host += n_rows * index_size; // dest_nodes - size_t rev_dev = n_rows * graph_degree * index_size; // d_rev_graph rev_dev += n_rows * sizeof(uint32_t); // d_rev_graph_count rev_dev += n_rows * sizeof(uint32_t); // d_dest_nodes - // Memory for merging graphs (host only) + // Memory for merging graphs (host only optional) size_t combine_host = n_rows * sizeof(uint32_t) + graph_degree * sizeof(uint32_t); // in_edge_count + hist - size_t total_host = mst_host + std::max({prune_host, rev_host, 
combine_host}); - size_t total_dev = std::max(prune_dev, rev_dev); + // additional memory for combine stage on device + size_t combine_dev = n_rows * graph_degree * index_size; // d_output_graph + + size_t total_host = mst_host + combine_host; + size_t total_dev = std::max(prune_dev, rev_dev + combine_dev); return std::make_pair(total_host, total_dev); } diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 713b03ca20..96110c9613 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -246,6 +246,26 @@ __global__ void kern_make_rev_graph(const IdxT* const dest_nodes, // [graph_ } } +template +__global__ void kern_make_rev_graph_k(const IdxT* const dest_nodes, // [graph_size] + IdxT* const rev_graph, // [size, degree] + uint32_t* const rev_graph_count, // [graph_size] + const uint32_t graph_size, + const uint32_t degree, + uint64_t k) +{ + const uint64_t tid = threadIdx.x + (blockDim.x * blockIdx.x); + const uint64_t tnum = blockDim.x * gridDim.x; + + for (uint64_t src_id = tid; src_id < graph_size; src_id += tnum) { + IdxT dest_id = dest_nodes[k + (degree * src_id)]; + if (dest_id >= graph_size) continue; + + const uint32_t pos = atomicAdd(rev_graph_count + dest_id, 1); + if (pos < degree) { rev_graph[(degree * dest_id) + pos] = static_cast(src_id); } + } +} + // Based on the detour count, select the smallest detour count and its index // (Pruning Update Kernel) template @@ -932,11 +952,11 @@ void merge_graph_cpu(IdxT* output_graph_ptr, (time_replace_end - time_replace_start) * 1000.0); } -template +template void make_reverse_graph_gpu(raft::resources const& res, IdxT* d_rev_graph, uint32_t* d_rev_graph_count, - raft::host_matrix_view new_graph) + InOutMatrixView new_graph) { const uint64_t graph_size = new_graph.extent(0); const uint64_t output_graph_degree = new_graph.extent(1); @@ -958,26 +978,38 @@ void make_reverse_graph_gpu(raft::resources 
const& res, RAFT_CUDA_TRY(cudaMemsetAsync( d_rev_graph_count, 0x00, graph_size * sizeof(uint32_t), raft::resource::get_cuda_stream(res))); + bool output_graph_device_accessible = is_ptr_device_accessible(output_graph_ptr); + dim3 threads(256, 1, 1); + dim3 blocks(1024, 1, 1); + for (uint64_t k = 0; k < output_graph_degree; k++) { + if (output_graph_device_accessible) { + kern_make_rev_graph_k<<>>( + output_graph_ptr, + d_rev_graph, + d_rev_graph_count, + static_cast(graph_size), + static_cast(output_graph_degree), + k); + } else { #pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)]; - } - raft::resource::sync_stream(res); + for (uint64_t i = 0; i < graph_size; i++) { + dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)]; + } + raft::resource::sync_stream(res); - raft::copy(d_dest_nodes.data_handle(), - dest_nodes.data_handle(), - graph_size, - raft::resource::get_cuda_stream(res)); + raft::copy(d_dest_nodes.data_handle(), + dest_nodes.data_handle(), + graph_size, + raft::resource::get_cuda_stream(res)); - dim3 threads(256, 1, 1); - dim3 blocks(1024, 1, 1); - kern_make_rev_graph<<>>( - d_dest_nodes.data_handle(), - d_rev_graph, - d_rev_graph_count, - static_cast(graph_size), - static_cast(output_graph_degree)); + kern_make_rev_graph<<>>( + d_dest_nodes.data_handle(), + d_rev_graph, + d_rev_graph_count, + static_cast(graph_size), + static_cast(output_graph_degree)); + } RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %lu \r", k, output_graph_degree); } @@ -1868,24 +1900,13 @@ void prune_graph_cpu(IdxT* knn_graph_ptr, "overflows occur during the norm computation between the dataset vectors."); } -template -bool is_gpu_accessible(T* ptr) -{ - cudaPointerAttributes attr; - RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, ptr)); - return attr.devicePointer != nullptr; -} - // TODO allow pinned input for both knn_graph and new_graph -template , raft::memory_type::host>> 
-void optimize( - raft::resources const& res, - raft::mdspan, raft::row_major, g_accessor> knn_graph, - raft::host_matrix_view new_graph, - const bool guarantee_connectivity = true, - const bool use_gpu = true) +template +void optimize(raft::resources const& res, + InOutMatrixView knn_graph, + InOutMatrixView new_graph, + const bool guarantee_connectivity = true, + const bool use_gpu = true) { RAFT_LOG_DEBUG( "# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1)); @@ -1913,8 +1934,8 @@ void optimize( // and all small allocations to go to the default workspace resource bool inout_device_accessible = false; { - bool input_device_accessible = is_gpu_accessible(knn_graph.data_handle()); - bool output_device_accessible = is_gpu_accessible(new_graph.data_handle()); + bool input_device_accessible = is_ptr_device_accessible(knn_graph.data_handle()); + bool output_device_accessible = is_ptr_device_accessible(new_graph.data_handle()); RAFT_EXPECTS(input_device_accessible == output_device_accessible, "Input and output must be either both device accessible or both host accessible"); inout_device_accessible = input_device_accessible && output_device_accessible; @@ -2062,7 +2083,7 @@ void optimize( guarantee_connectivity); } - if (!inout_device_accessible) { + if (is_ptr_host_accessible(new_graph.data_handle())) { // following checks require host access log_replaced_edges_stats(new_graph.data_handle(), graph_size, output_graph_degree); diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp index 30c7287430..7889d6d9a9 100644 --- a/cpp/src/neighbors/detail/cagra/utils.hpp +++ b/cpp/src/neighbors/detail/cagra/utils.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -152,6 +152,22 @@ struct gen_index_msb_1_mask { }; } // namespace utils +template +bool is_ptr_device_accessible(T* ptr) +{ + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, ptr)); + return attr.devicePointer != nullptr; +} + +template +bool is_ptr_host_accessible(T* ptr) +{ + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, ptr)); + return attr.hostPointer != nullptr; +} + /** * Utility to sync memory from a host_matrix_view to a device_matrix_view * From 40977e2e456f2fd9ee32413be3590acfe2e7bdd4 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 2 Mar 2026 23:35:32 +0000 Subject: [PATCH 070/119] smaller fixes --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 61 +++++++++++-------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 6c0d6c747c..f77f6367e5 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -801,8 +801,8 @@ void check_duplicates_and_out_of_range(const IdxT* output_graph_ptr, template void merge_graph_gpu(raft::resources const& res, IdxT* output_graph_ptr, - const IdxT* d_rev_graph, - uint32_t* d_rev_graph_count, + const IdxT* d_rev_graph_ptr, + uint32_t* d_rev_graph_count_ptr, const IdxT* mst_graph_ptr, const uint32_t* mst_graph_num_edges_ptr, uint64_t graph_size, @@ -831,8 +831,8 @@ void merge_graph_gpu(raft::resources const& res, kern_merge_graph <<>>( d_output_graph.data_handle(), - d_rev_graph, - d_rev_graph_count, + d_rev_graph_ptr, + d_rev_graph_count_ptr, static_cast(graph_size), static_cast(output_graph_degree), mst_graph_ptr, @@ -955,8 +955,8 @@ void merge_graph_cpu(IdxT* output_graph_ptr, template void make_reverse_graph_gpu(raft::resources const& res, - IdxT* d_rev_graph, - uint32_t* d_rev_graph_count, + IdxT* d_rev_graph_ptr, + uint32_t* 
d_rev_graph_count_ptr, InOutMatrixView new_graph) { const uint64_t graph_size = new_graph.extent(0); @@ -966,18 +966,19 @@ void make_reverse_graph_gpu(raft::resources const& res, raft::common::nvtx::range block_scope( "cagra::graph::optimize/reverse"); - auto large_tmp_mr = raft::resource::get_large_workspace_resource(res); auto dest_nodes = raft::make_host_vector(graph_size); - auto d_dest_nodes = - raft::make_device_mdarray(res, large_tmp_mr, raft::make_extents(graph_size)); + auto d_dest_nodes = raft::make_device_mdarray( + res, raft::resource::get_workspace_resource(res), raft::make_extents(graph_size)); raft::matrix::fill( res, - raft::make_device_vector_view(d_rev_graph, graph_size * output_graph_degree), + raft::make_device_vector_view(d_rev_graph_ptr, graph_size * output_graph_degree), IdxT(-1)); raft::matrix::fill( - res, raft::make_device_vector_view(d_rev_graph_count, graph_size), uint32_t(0)); + res, + raft::make_device_vector_view(d_rev_graph_count_ptr, graph_size), + uint32_t(0)); bool output_graph_device_accessible = is_ptr_device_accessible(output_graph_ptr); dim3 threads(256, 1, 1); @@ -987,8 +988,8 @@ void make_reverse_graph_gpu(raft::resources const& res, if (output_graph_device_accessible) { kern_make_rev_graph_k<<>>( output_graph_ptr, - d_rev_graph, - d_rev_graph_count, + d_rev_graph_ptr, + d_rev_graph_count_ptr, static_cast(graph_size), static_cast(output_graph_degree), k); @@ -1003,8 +1004,8 @@ void make_reverse_graph_gpu(raft::resources const& res, kern_make_rev_graph<<>>( d_dest_nodes.data_handle(), - d_rev_graph, - d_rev_graph_count, + d_rev_graph_ptr, + d_rev_graph_count_ptr, static_cast(graph_size), static_cast(output_graph_degree)); } @@ -1679,6 +1680,8 @@ void prune_graph_gpu(raft::resources const& res, IdxT* output_graph_ptr, uint64_t output_graph_degree) { + raft::common::nvtx::range block_scope( + "cagra::graph::optimize/prune"); auto default_ws_mr = raft::resource::get_workspace_resource(res); uint32_t batch_size = @@ -1715,6 
+1718,8 @@ void prune_graph_gpu(raft::resources const& res, res, default_ws_mr, raft::make_extents(batch_size, output_graph_degree)); auto d_invalid_neighbor_list = raft::make_device_scalar(res, 0u); + bool output_device_accessible = is_ptr_device_accessible(output_graph_ptr); + for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { raft::matrix::fill(res, d_detour_count.view(), uint8_t(0xff)); raft::matrix::fill(res, d_num_no_detour_edges.view(), uint32_t(0)); @@ -1744,18 +1749,21 @@ void prune_graph_gpu(raft::resources const& res, knn_graph_degree, output_graph_degree, d_detour_count.data_handle(), - d_output_graph.data_handle(), + output_device_accessible ? d_output_graph.data_handle() + : output_graph_ptr + i_batch * batch_size * output_graph_degree, batch_size, i_batch, d_invalid_neighbor_list.data_handle()); - size_t copy_size = - std::min(static_cast(batch_size), graph_size - i_batch * batch_size) * - output_graph_degree; - raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree, - d_output_graph.data_handle(), - copy_size, - raft::resource::get_cuda_stream(res)); + if (!output_device_accessible) { + size_t copy_size = + std::min(static_cast(batch_size), graph_size - i_batch * batch_size) * + output_graph_degree; + raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree, + d_output_graph.data_handle(), + copy_size, + raft::resource::get_cuda_stream(res)); + } raft::resource::sync_stream(res); RAFT_LOG_DEBUG( @@ -1799,14 +1807,14 @@ void prune_graph_cpu(IdxT* knn_graph_ptr, IdxT* output_graph_ptr, uint64_t output_graph_degree) { + raft::common::nvtx::range block_scope( + "cagra::graph::optimize/prune"); auto detour_count = raft::make_host_matrix(graph_size, knn_graph_degree); auto knn_graph_view = raft::make_host_matrix_view(knn_graph_ptr, graph_size, knn_graph_degree); { - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/prune/2-hop-counting-by-CPU"); const double time_2hop_count_start = cur_time(); 
count_2hop_detours(knn_graph_view, detour_count.view()); @@ -2022,7 +2030,6 @@ void optimize(raft::resources const& res, } else { auto rev_graph = raft::make_host_matrix(graph_size, output_graph_degree); auto rev_graph_count = raft::make_host_vector(graph_size); - auto mst_graph = raft::make_host_matrix(0, 0); raft::copy(res, rev_graph.view(), d_rev_graph.view()); raft::copy(res, rev_graph_count.view(), d_rev_graph_count.view()); @@ -2036,6 +2043,8 @@ void optimize(raft::resources const& res, guarantee_connectivity); } + raft::resource::sync_stream(res); + if (is_ptr_host_accessible(new_graph.data_handle())) { // following checks require host access log_replaced_edges_stats(new_graph.data_handle(), graph_size, output_graph_degree); From 14e9f3ebc94aec7031d5c8eb685dc9b6fb36595d Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Tue, 3 Mar 2026 12:41:33 +0000 Subject: [PATCH 071/119] bugfix --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index f77f6367e5..5b2893e77f 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -1749,8 +1749,8 @@ void prune_graph_gpu(raft::resources const& res, knn_graph_degree, output_graph_degree, d_detour_count.data_handle(), - output_device_accessible ? d_output_graph.data_handle() - : output_graph_ptr + i_batch * batch_size * output_graph_degree, + output_device_accessible ? 
output_graph_ptr + i_batch * batch_size * output_graph_degree + : d_output_graph.data_handle(), batch_size, i_batch, d_invalid_neighbor_list.data_handle()); From 416558d40b1207ca7f1b8aad0aeda68b24e68aea Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 5 Mar 2026 21:43:49 +0000 Subject: [PATCH 072/119] fuse and simplify pruning, remove CPU path --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 331 +++++------------- 1 file changed, 92 insertions(+), 239 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 5b2893e77f..25be6ae393 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -157,79 +157,6 @@ __global__ void kern_sort(const DATA_T* const dataset, // [dataset_chunk_size, } } -template -__global__ void kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, graph_degree] - const uint32_t graph_size, - const uint32_t graph_degree, - const uint32_t degree, - const uint32_t batch_size, - const uint32_t batch_id, - uint8_t* const detour_count, // [batch_size, graph_degree] - uint32_t* const num_no_detour_edges, // [batch_size] - uint64_t* const stats) -{ - __shared__ uint32_t smem_num_detour[MAX_DEGREE]; - extern __shared__ unsigned char smem_buf[]; - IdxT* const smem_knn_iA_neighbors = reinterpret_cast(smem_buf); - - uint64_t* const num_retain = stats; - uint64_t* const num_full = stats + 1; - - const uint64_t iA = blockIdx.x + (batch_size * batch_id); - const uint64_t iA_batch = blockIdx.x; - - if (iA >= graph_size) { return; } - - // Load this node's neighbor row into shared memory to reduce global reads - for (uint32_t k = threadIdx.x; k < graph_degree; k += blockDim.x) { - smem_num_detour[k] = 0; - smem_knn_iA_neighbors[k] = knn_graph[k + ((uint64_t)graph_degree * iA)]; - if (smem_knn_iA_neighbors[k] == iA) { - // Lower the priority of self-edge - smem_num_detour[k] = graph_degree; - } - } - __syncthreads(); - - // 
count number of detours (A->D->B) - for (uint32_t kAD = 0; kAD < graph_degree - 1; kAD++) { - const uint64_t iD = smem_knn_iA_neighbors[kAD]; - if (iD >= graph_size) { continue; } - for (uint32_t kDB = threadIdx.x; kDB < graph_degree; kDB += blockDim.x) { - const uint64_t iB_candidate = knn_graph[kDB + ((uint64_t)graph_degree * iD)]; - for (uint32_t kAB = kAD + 1; kAB < graph_degree; kAB++) { - // if ( kDB < kAB ) - { - const uint64_t iB = smem_knn_iA_neighbors[kAB]; - if (iB == iB_candidate) { - atomicAdd(smem_num_detour + kAB, 1); - break; - } - } - } - } - __syncthreads(); - } - - uint32_t num_edges_no_detour = 0; - for (uint32_t k = threadIdx.x; k < graph_degree; k += blockDim.x) { - detour_count[k + (graph_degree * iA_batch)] = min(smem_num_detour[k], (uint32_t)255); - if (smem_num_detour[k] == 0) { num_edges_no_detour++; } - } - num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 1); - num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 2); - num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 4); - num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 8); - num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 16); - num_edges_no_detour = min(num_edges_no_detour, degree); - - if (threadIdx.x == 0) { - num_no_detour_edges[iA_batch] = num_edges_no_detour; - atomicAdd((unsigned long long int*)num_retain, (unsigned long long int)num_edges_no_detour); - if (num_edges_no_detour >= degree) { atomicAdd((unsigned long long int*)num_full, 1); } - } -} - template __global__ void kern_make_rev_graph(const IdxT* const dest_nodes, // [graph_size] IdxT* const rev_graph, // [size, degree] @@ -269,48 +196,98 @@ __global__ void kern_make_rev_graph_k(const IdxT* const dest_nodes, // [grap } } -// Based on the detour count, select the smallest detour count and its index -// (Pruning Update Kernel) -template -__global__ void kern_select_smallest_detour_neighbors( - const IdxT* const 
knn_graph, // [graph_chunk_size, graph_degree] - uint64_t graph_size, - uint64_t knn_graph_degree, - uint64_t output_graph_degree, - uint8_t* const d_detour_count, // [batch_size, graph_degree] - IdxT* output_graph_ptr, - const uint32_t batch_size, // [batch_size, output_graph_degree] - const uint32_t batch_id, - uint32_t* const d_invalid_neighbor_list) +template +__global__ void kern_fused_prune(const IdxT* const knn_graph, // [graph_chunk_size, graph_degree] + IdxT* const output_graph_ptr, + const uint32_t graph_size, + const uint32_t knn_graph_degree, + const uint32_t output_graph_degree, + const uint32_t batch_size, + const uint32_t batch_id, + uint32_t* const d_invalid_neighbor_list, + uint64_t* const stats) { - assert(blockDim.x == 32); + extern __shared__ unsigned char smem_buf[]; - // Allocate shared memory for detour counts and their indices - extern __shared__ IdxT smem_indices[]; - uint16_t* smem_detour_count = (uint16_t*)&smem_indices[knn_graph_degree]; + const uint32_t wid = threadIdx.x / raft::WarpSize; + const uint32_t lane_id = threadIdx.x % raft::WarpSize; - const uint64_t nid = blockIdx.x + (batch_size * batch_id); - const uint64_t nid_batch = blockIdx.x; + IdxT* const smem_indices = + reinterpret_cast(smem_buf + wid * knn_graph_degree * sizeof(IdxT)); + uint32_t* const smem_num_detour = reinterpret_cast( + smem_buf + wid * knn_graph_degree * sizeof(IdxT) + num_warps * knn_graph_degree * sizeof(IdxT)); + + uint64_t* const num_retain = stats; + uint64_t* const num_full = stats + 1; + + const unsigned warp_mask = 0xffffffff; const uint32_t maxval16 = 0x0000ffff; + const uint64_t nid_batch = blockIdx.x * num_warps + wid; + const uint64_t nid = nid_batch + (batch_size * batch_id); + if (nid >= graph_size) { return; } - // Load indices and detour counts for each neighbor; invalidate out-of-bounds entries - for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) { - smem_indices[k] = knn_graph[knn_graph_degree * nid + k]; - 
smem_detour_count[k] = (smem_indices[k] >= graph_size) - ? maxval16 - : (uint16_t)d_detour_count[nid_batch * knn_graph_degree + k]; + // Load this node's neighbor row into shared memory to reduce global reads + for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) { + smem_num_detour[k] = 0; + smem_indices[k] = knn_graph[k + ((uint64_t)knn_graph_degree * nid)]; + if (smem_indices[k] == nid) { + // Lower the priority of self-edge + smem_num_detour[k] = knn_graph_degree; + } } __syncwarp(); - const unsigned warp_mask = 0xffffffff; + // count number of detours (A->D->B) + for (uint32_t kAD = 0; kAD < knn_graph_degree - 1; kAD++) { + const uint64_t iD = smem_indices[kAD]; + if (iD >= graph_size) { continue; } + for (uint32_t kDB = lane_id; kDB < knn_graph_degree; kDB += raft::WarpSize) { + const uint64_t iB_candidate = knn_graph[kDB + ((uint64_t)knn_graph_degree * iD)]; + for (uint32_t kAB = kAD + 1; kAB < knn_graph_degree; kAB++) { + // if ( kDB < kAB ) + { + const uint64_t iB = smem_indices[kAB]; + if (iB == iB_candidate) { + atomicAdd(smem_num_detour + kAB, 1); + break; + } + } + } + } + __syncwarp(); + } + + uint32_t num_edges_no_detour = 0; + for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) { + smem_num_detour[k] = min(smem_num_detour[k], maxval16); + if (smem_num_detour[k] == 0) { num_edges_no_detour++; } + if (smem_indices[k] >= graph_size) { smem_num_detour[k] = maxval16; } + } + + __syncwarp(); + + num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 1); + num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 2); + num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 4); + num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 8); + num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 16); + num_edges_no_detour = min(num_edges_no_detour, output_graph_degree); + + if (lane_id == 0) { + atomicAdd((unsigned long long int*)num_retain, 
(unsigned long long int)num_edges_no_detour); + if (num_edges_no_detour >= output_graph_degree) { + atomicAdd((unsigned long long int*)num_full, 1); + } + } + for (uint32_t i = 0; i < output_graph_degree; i++) { uint32_t local_min = maxval16; uint32_t local_idx = maxval16; - for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) { - if (smem_detour_count[k] < local_min) { - local_min = smem_detour_count[k]; + for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) { + if (smem_num_detour[k] < local_min) { + local_min = smem_num_detour[k]; local_idx = k; } } @@ -321,18 +298,18 @@ __global__ void kern_select_smallest_detour_neighbors( uint32_t warp_local_idx = warp_min_with_tag & 0xffff; if (warp_min_count == maxval16 || warp_local_idx == maxval16) { - if (threadIdx.x == 0) { atomicExch(d_invalid_neighbor_list, 1u); } + if (lane_id == 0) { atomicExch(d_invalid_neighbor_list, 1u); } break; } IdxT selected_node = smem_indices[warp_local_idx]; - for (uint32_t k = threadIdx.x; k < knn_graph_degree; k += blockDim.x) { - if (smem_indices[k] == selected_node) { smem_detour_count[k] = maxval16; } + for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) { + if (smem_indices[k] == selected_node) { smem_num_detour[k] = maxval16; } } __syncwarp(warp_mask); - if (threadIdx.x == 0) { output_graph_ptr[nid_batch * output_graph_degree + i] = selected_node; } + if (lane_id == 0) { output_graph_ptr[nid_batch * output_graph_degree + i] = selected_node; } } } @@ -1690,15 +1667,6 @@ void prune_graph_gpu(raft::resources const& res, RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r"); - constexpr int MAX_DEGREE = 1024; - if (knn_graph_degree > MAX_DEGREE) { - RAFT_FAIL( - "The degree of input knn graph is too large (%zu). 
" - "It must be equal to or smaller than %d.", - knn_graph_degree, - MAX_DEGREE); - } - const double prune_start = cur_time(); uint64_t num_keep __attribute__((unused)) = 0; @@ -1710,10 +1678,6 @@ void prune_graph_gpu(raft::resources const& res, device_matrix_view_from_host d_input_graph( res, raft::make_host_matrix_view(knn_graph_ptr, graph_size, knn_graph_degree)); - auto d_detour_count = raft::make_device_mdarray( - res, default_ws_mr, raft::make_extents(batch_size, knn_graph_degree)); - auto d_num_no_detour_edges = raft::make_device_mdarray( - res, default_ws_mr, raft::make_extents(batch_size)); auto d_output_graph = raft::make_device_mdarray( res, default_ws_mr, raft::make_extents(batch_size, output_graph_degree)); auto d_invalid_neighbor_list = raft::make_device_scalar(res, 0u); @@ -1721,40 +1685,23 @@ void prune_graph_gpu(raft::resources const& res, bool output_device_accessible = is_ptr_device_accessible(output_graph_ptr); for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { - raft::matrix::fill(res, d_detour_count.view(), uint8_t(0xff)); - raft::matrix::fill(res, d_num_no_detour_edges.view(), uint32_t(0)); - - const dim3 threads_prune(32, 1, 1); + const uint32_t num_warps = 4; + const dim3 threads_prune(raft::WarpSize * num_warps, 1, 1); const dim3 blocks_prune(batch_size, 1, 1); - const size_t prune_smem_size = knn_graph_degree * sizeof(IdxT); - kern_prune + const size_t prune_smem_size = num_warps * knn_graph_degree * (sizeof(IdxT) + sizeof(uint32_t)); + kern_fused_prune <<>>( d_input_graph.data_handle(), + output_device_accessible ? 
output_graph_ptr + i_batch * batch_size * output_graph_degree + : d_output_graph.data_handle(), graph_size, knn_graph_degree, output_graph_degree, batch_size, i_batch, - d_detour_count.data_handle(), - d_num_no_detour_edges.data_handle(), + d_invalid_neighbor_list.data_handle(), dev_stats.data_handle()); - const size_t select_smem_size = (knn_graph_degree) * (sizeof(uint16_t) + sizeof(IdxT)); - const dim3 threads_select(32, 1, 1); - const dim3 blocks_select(batch_size, 1, 1); - kern_select_smallest_detour_neighbors - <<>>( - d_input_graph.data_handle(), - graph_size, - knn_graph_degree, - output_graph_degree, - d_detour_count.data_handle(), - output_device_accessible ? output_graph_ptr + i_batch * batch_size * output_graph_degree - : d_output_graph.data_handle(), - batch_size, - i_batch, - d_invalid_neighbor_list.data_handle()); - if (!output_device_accessible) { size_t copy_size = std::min(static_cast(batch_size), graph_size - i_batch * batch_size) * @@ -1800,79 +1747,6 @@ void prune_graph_gpu(raft::resources const& res, (double)num_full / graph_size * 100); } -template -void prune_graph_cpu(IdxT* knn_graph_ptr, - uint64_t graph_size, - uint64_t knn_graph_degree, - IdxT* output_graph_ptr, - uint64_t output_graph_degree) -{ - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/prune"); - auto detour_count = raft::make_host_matrix(graph_size, knn_graph_degree); - - auto knn_graph_view = - raft::make_host_matrix_view(knn_graph_ptr, graph_size, knn_graph_degree); - - { - const double time_2hop_count_start = cur_time(); - - count_2hop_detours(knn_graph_view, detour_count.view()); - - const double time_2hop_count_end = cur_time(); - RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec", - time_2hop_count_end - time_2hop_count_start); - } - bool invalid_neighbor_list = false; -#pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - uint64_t pk = 0; - uint32_t num_detour = 0; - for (uint32_t l = 0; l < knn_graph_degree && pk < 
output_graph_degree; l++) { - uint32_t next_num_detour = std::numeric_limits::max(); - for (uint64_t k = 0; k < knn_graph_degree; k++) { - const auto num_detour_k = detour_count(i, k); - if (num_detour_k > num_detour) { - next_num_detour = std::min(static_cast(num_detour_k), next_num_detour); - } - - if (num_detour_k != num_detour) { continue; } - - const auto candidate_node = knn_graph_view(i, k); - bool dup = false; - for (uint32_t dk = 0; dk < pk; dk++) { - if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) { - dup = true; - break; - } - } - if (!dup && candidate_node < graph_size) { - output_graph_ptr[i * output_graph_degree + pk] = candidate_node; - pk += 1; - } - if (pk >= output_graph_degree) break; - } - if (pk >= output_graph_degree) break; - - if (next_num_detour == std::numeric_limits::max()) { break; } - num_detour = next_num_detour; - } - if (pk != output_graph_degree) { - RAFT_LOG_DEBUG( - "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for " - "node %lu in the rank-based node reranking process", - output_graph_degree, - i); - invalid_neighbor_list = true; - } - } - RAFT_EXPECTS( - !invalid_neighbor_list, - "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too " - "many invalid or duplicated neighbor nodes. 
This error can occur, for example, if too many " - "overflows occur during the norm computation between the dataset vectors."); -} - // TODO allow pinned input for both knn_graph and new_graph template void optimize(raft::resources const& res, @@ -1939,22 +1813,8 @@ void optimize(raft::resources const& res, } } - // prune graph -- will use GPU path if possible, otherwise CPU path - // we only need to check in case input is not alreadydevice accessible - bool use_gpu_prune = use_gpu; - if (!inout_device_accessible) { - try { - auto d_input_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size, knn_graph_degree)); - } catch (std::bad_alloc& e) { - RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU"); - use_gpu_prune = false; - } catch (raft::logic_error& e) { - RAFT_LOG_DEBUG("Insufficient memory for pruning on GPU (logic error)"); - use_gpu_prune = false; - } - } - if (use_gpu_prune) { + // prune graph -- will always use GPU path + { // should be noop in case input is already device accessible device_matrix_view_from_host d_input_graph( res, @@ -1967,13 +1827,6 @@ void optimize(raft::resources const& res, knn_graph_degree, new_graph.data_handle(), output_graph_degree); - - } else { - prune_graph_cpu(knn_graph.data_handle(), - graph_size, - knn_graph_degree, - new_graph.data_handle(), - output_graph_degree); } // reverse graph creation will always use the GPU From d8d8bd877db9596720efaf67bb1373084dbf17c8 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 5 Mar 2026 22:49:16 +0000 Subject: [PATCH 073/119] cleanup merge, remove CPU path --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 339 +++++------------- 1 file changed, 85 insertions(+), 254 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 25be6ae393..392edc97d9 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -197,8 +197,8 @@ 
__global__ void kern_make_rev_graph_k(const IdxT* const dest_nodes, // [grap } template -__global__ void kern_fused_prune(const IdxT* const knn_graph, // [graph_chunk_size, graph_degree] - IdxT* const output_graph_ptr, +__global__ void kern_fused_prune(const IdxT* const knn_graph, // [graph_chunk_size, graph_degree] + IdxT* const output_graph_ptr, // [batch_size, output_graph_degree] const uint32_t graph_size, const uint32_t knn_graph_degree, const uint32_t output_graph_degree, @@ -337,8 +337,8 @@ __device__ void thread_shift_array(T* array, uint64_t num) } } -template -__global__ void kern_merge_graph(IdxT* output_graph, +template +__global__ void kern_merge_graph(IdxT* output_graph, // [batch_size, output_graph_degree] const IdxT* const rev_graph, uint32_t* const rev_graph_count, // [graph_size] const uint32_t graph_size, @@ -352,29 +352,32 @@ __global__ void kern_merge_graph(IdxT* output_graph, bool* check_num_protected_edges) { extern __shared__ unsigned char smem_buf[]; - IdxT* smem_sorted_output_graph = reinterpret_cast(smem_buf); - assert(blockDim.x == 32); + const uint32_t wid = threadIdx.x / raft::WarpSize; + const uint32_t lane_id = threadIdx.x % raft::WarpSize; - const uint64_t nid = blockIdx.x + (batch_size * batch_id); - if (nid >= graph_size) { return; } + IdxT* smem_sorted_output_graph = + reinterpret_cast(smem_buf + wid * output_graph_degree * sizeof(IdxT)); + + const uint64_t nid_batch = blockIdx.x * num_warps + wid; + const uint64_t nid = nid_batch + (batch_size * batch_id); - if (threadIdx.x == 0) check_num_protected_edges[0] = true; + if (nid >= graph_size) { return; } - const auto mst_graph_num_edges = mst_graph_num_edges_ptr[nid]; + const auto mst_graph_num_edges = guarantee_connectivity ? mst_graph_num_edges_ptr[nid] : 0; // If guarantee_connectivity == true, use a temporal list to merge the // neighbor lists of the graphs. 
if (guarantee_connectivity) { - for (uint32_t i = threadIdx.x; i < mst_graph_degree; i += 32) { + for (uint32_t i = lane_id; i < mst_graph_degree; i += raft::WarpSize) { smem_sorted_output_graph[i] = mst_graph[nid * mst_graph_degree + i]; } __syncwarp(); for (uint32_t pruned_j = 0, output_j = mst_graph_num_edges; (pruned_j < output_graph_degree) && (output_j < output_graph_degree); pruned_j++) { - const auto v = output_graph[output_graph_degree * nid + pruned_j]; + const auto v = output_graph[output_graph_degree * nid_batch + pruned_j]; unsigned int dup = 0; - for (uint32_t m = threadIdx.x; m < output_j; m += 32) { + for (uint32_t m = lane_id; m < output_j; m += raft::WarpSize) { if (v == smem_sorted_output_graph[m]) { dup = 1; break; @@ -383,7 +386,7 @@ __global__ void kern_merge_graph(IdxT* output_graph, unsigned int warp_dup = __ballot_sync(0xffffffff, dup); if (warp_dup == 0) { - if (threadIdx.x == 0) smem_sorted_output_graph[output_j] = v; + if (lane_id == 0) smem_sorted_output_graph[output_j] = v; output_j++; } __syncwarp(); @@ -391,8 +394,8 @@ __global__ void kern_merge_graph(IdxT* output_graph, } else { - for (uint32_t i = threadIdx.x; i < output_graph_degree; i += 32) { - smem_sorted_output_graph[i] = output_graph[output_graph_degree * nid + i]; + for (uint32_t i = lane_id; i < output_graph_degree; i += raft::WarpSize) { + smem_sorted_output_graph[i] = output_graph[output_graph_degree * nid_batch + i]; } __syncwarp(); } @@ -412,7 +415,7 @@ __global__ void kern_merge_graph(IdxT* output_graph, if (pos < num_protected_edges) { continue; } uint64_t num_shift = pos - num_protected_edges; if (pos >= output_graph_degree) { num_shift = output_graph_degree - num_protected_edges - 1; } - if (threadIdx.x == 0) { + if (lane_id == 0) { thread_shift_array(smem_sorted_output_graph + num_protected_edges, num_shift); smem_sorted_output_graph[num_protected_edges] = rev_graph[kr + (output_graph_degree * nid)]; } @@ -420,8 +423,8 @@ __global__ void kern_merge_graph(IdxT* 
output_graph, } } - for (uint32_t i = threadIdx.x; i < output_graph_degree; i += 32) { - output_graph[(output_graph_degree * nid) + i] = smem_sorted_output_graph[i]; + for (uint32_t i = lane_id; i < output_graph_degree; i += raft::WarpSize) { + output_graph[(output_graph_degree * nid_batch) + i] = smem_sorted_output_graph[i]; } } @@ -780,8 +783,8 @@ void merge_graph_gpu(raft::resources const& res, IdxT* output_graph_ptr, const IdxT* d_rev_graph_ptr, uint32_t* d_rev_graph_count_ptr, - const IdxT* mst_graph_ptr, - const uint32_t* mst_graph_num_edges_ptr, + IdxT* mst_graph_ptr, + uint32_t* mst_graph_num_edges_ptr, uint64_t graph_size, uint64_t output_graph_degree, bool guarantee_connectivity) @@ -789,36 +792,62 @@ void merge_graph_gpu(raft::resources const& res, raft::common::nvtx::range block_scope( "cagra::graph::optimize/combine"); + auto default_ws_mr = raft::resource::get_workspace_resource(res); const double merge_graph_start = cur_time(); - device_matrix_view_from_host d_output_graph( - res, - raft::make_host_matrix_view(output_graph_ptr, graph_size, output_graph_degree)); - auto d_check_num_protected_edges = raft::make_device_scalar(res, true); + auto d_invalid_neighbor_list = raft::make_device_scalar(res, 0u); uint32_t batch_size = std::min(static_cast(graph_size), static_cast(256 * 1024)); const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size; - const dim3 threads_merge(32, 1, 1); - const dim3 blocks_merge(batch_size, 1, 1); - const size_t merge_smem_size = (output_graph_degree + output_graph_degree) * sizeof(IdxT); + bool output_device_accessible = is_ptr_device_accessible(output_graph_ptr); + auto d_output_graph = raft::make_device_mdarray( + res, + default_ws_mr, + raft::make_extents(output_device_accessible ? 0 : batch_size, output_graph_degree)); + + device_matrix_view_from_host d_mst_graph( + res, + raft::make_host_matrix_view( + mst_graph_ptr, guarantee_connectivity ? 
graph_size : 0, output_graph_degree)); + + device_matrix_view_from_host d_mst_graph_num_edges( + res, + raft::make_host_matrix_view( + mst_graph_num_edges_ptr, guarantee_connectivity ? graph_size : 0, 1)); + + const uint32_t num_warps = 4; + const dim3 threads_merge(raft::WarpSize * num_warps, 1, 1); + const dim3 blocks_merge(batch_size / num_warps, 1, 1); + const size_t merge_smem_size = num_warps * output_graph_degree * sizeof(IdxT); for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { - kern_merge_graph + kern_merge_graph <<>>( - d_output_graph.data_handle(), + output_device_accessible ? output_graph_ptr + (i_batch * batch_size * output_graph_degree) + : d_output_graph.data_handle(), d_rev_graph_ptr, d_rev_graph_count_ptr, static_cast(graph_size), static_cast(output_graph_degree), - mst_graph_ptr, + d_mst_graph.data_handle(), static_cast(output_graph_degree), - mst_graph_num_edges_ptr, + d_mst_graph_num_edges.data_handle(), batch_size, i_batch, guarantee_connectivity, d_check_num_protected_edges.data_handle()); + + if (!output_device_accessible) { + size_t copy_size = + std::min(static_cast(batch_size), graph_size - i_batch * batch_size) * + output_graph_degree; + raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree, + d_output_graph.data_handle(), + copy_size, + raft::resource::get_cuda_stream(res)); + } } bool check_num_protected_edges = true; @@ -827,13 +856,6 @@ void merge_graph_gpu(raft::resources const& res, 1, raft::resource::get_cuda_stream(res)); - if (d_output_graph.allocated_memory()) { - raft::copy( - res, - raft::make_host_matrix_view(output_graph_ptr, graph_size, output_graph_degree), - d_output_graph.view()); - } - const auto merge_graph_end = cur_time(); RAFT_EXPECTS(check_num_protected_edges, "Failed to merge the MST, pruned, and reverse edge graphs. 
" @@ -844,92 +866,6 @@ void merge_graph_gpu(raft::resources const& res, (merge_graph_end - merge_graph_start) * 1000.0); } -template -void merge_graph_cpu(IdxT* output_graph_ptr, - const IdxT* rev_graph_ptr, - const uint32_t* rev_graph_count_ptr, - const IdxT* mst_graph_ptr, - const uint32_t* mst_graph_num_edges_ptr, - uint64_t graph_size, - uint64_t output_graph_degree, - bool guarantee_connectivity) -{ - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/combine"); - - const double time_replace_start = cur_time(); - - bool check_num_protected_edges = true; -#pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - auto my_rev_graph = rev_graph_ptr + (output_graph_degree * i); - auto my_out_graph = output_graph_ptr + (output_graph_degree * i); - - std::vector temp_output_neighbor_list; - if (guarantee_connectivity) { - temp_output_neighbor_list.resize(output_graph_degree); - my_out_graph = temp_output_neighbor_list.data(); - const auto mst_graph_num_edges = mst_graph_num_edges_ptr[i]; - - for (uint32_t j = 0; j < mst_graph_num_edges; j++) { - my_out_graph[j] = mst_graph_ptr[i * output_graph_degree + j]; - } - - for (uint32_t pruned_j = 0, output_j = mst_graph_num_edges; - (pruned_j < output_graph_degree) && (output_j < output_graph_degree); - pruned_j++) { - const auto v = output_graph_ptr[output_graph_degree * i + pruned_j]; - - bool dup = false; - for (uint32_t m = 0; m < output_j; m++) { - if (v == my_out_graph[m]) { - dup = true; - break; - } - } - - if (!dup) { - my_out_graph[output_j] = v; - output_j++; - } - } - } - - const auto num_protected_edges = - std::max(mst_graph_num_edges_ptr[i], output_graph_degree / 2); - if (num_protected_edges > output_graph_degree) { check_num_protected_edges = false; } - if (num_protected_edges == output_graph_degree) continue; - - auto kr = std::min(rev_graph_count_ptr[i], output_graph_degree); - while (kr) { - kr -= 1; - if (my_rev_graph[kr] < graph_size) { - uint64_t pos = 
pos_in_array(my_rev_graph[kr], my_out_graph, output_graph_degree); - if (pos < num_protected_edges) { continue; } - uint64_t num_shift = pos - num_protected_edges; - if (pos >= output_graph_degree) { - num_shift = output_graph_degree - num_protected_edges - 1; - } - shift_array(my_out_graph + num_protected_edges, num_shift); - my_out_graph[num_protected_edges] = my_rev_graph[kr]; - } - } - - if (guarantee_connectivity) { - for (uint32_t j = 0; j < output_graph_degree; j++) { - output_graph_ptr[(output_graph_degree * i) + j] = my_out_graph[j]; - } - } - } - RAFT_EXPECTS(check_num_protected_edges, - "Failed to merge the MST, pruned, and reverse edge graphs. Some nodes have too " - "many MST optimization edges."); - - const double time_replace_end = cur_time(); - RAFT_LOG_DEBUG("# Replacing edges time: %.1lf ms", - (time_replace_end - time_replace_start) * 1000.0); -} - template void make_reverse_graph_gpu(raft::resources const& res, IdxT* d_rev_graph_ptr, @@ -1585,58 +1521,6 @@ void mst_optimization(raft::resources const& res, RAFT_LOG_DEBUG("# MST optimization time: %.1lf sec", time_mst_opt_end - time_mst_opt_start); } -template -void count_2hop_detours(raft::host_matrix_view knn_graph, - raft::host_matrix_view detour_count) -{ - RAFT_EXPECTS(knn_graph.extent(0) == detour_count.extent(0), - "knn_graph and detour_count are expected to have the same number of rows"); - RAFT_EXPECTS(knn_graph.extent(1) == detour_count.extent(1), - "knn_graph and detour_count are expected to have the same number of cols"); - const uint64_t graph_size = knn_graph.extent(0); - const uint64_t graph_degree = knn_graph.extent(1); - -#pragma omp parallel for - for (IdxT iA = 0; iA < graph_size; iA++) { - // Create a list of nodes, iB_candidates, that can be reached in 2-hops from node A. 
- auto iB_candidates = - raft::make_host_vector((graph_degree - 1) * (graph_degree - 1)); - for (uint64_t kAC = 0; kAC < graph_degree - 1; kAC++) { - IdxT iC = knn_graph(iA, kAC); - for (uint64_t kCB = 0; kCB < graph_degree - 1; kCB++) { - IdxT iB_candidate; - if (iC == iA || iC >= graph_size) { - iB_candidate = graph_size; - } else { - iB_candidate = knn_graph(iC, kCB); - if (iB_candidate == iA || iB_candidate == iC) { iB_candidate = graph_size; } - } - uint64_t idx; - if (kAC < kCB) { - idx = (kCB * kCB) + kAC; - } else { - idx = (kAC * (kAC + 1)) + kCB; - } - iB_candidates(idx) = iB_candidate; - } - } - // Count how many 2-hop detours are on each edge of node A. - for (uint64_t kAB = 0; kAB < graph_degree; kAB++) { - constexpr uint32_t max_count = 255; - uint32_t count = 0; - IdxT iB = knn_graph(iA, kAB); - if (iB == iA) { - count = max_count; - } else { - for (uint64_t idx = 0; idx < kAB * kAB; idx++) { - if (iB_candidates(idx) == iB) { count += 1; } - } - } - detour_count(iA, kAB) = std::min(count, max_count); - } - } -} - // // Prune unimportant edges based on 2-hop detour counts. // @@ -1678,16 +1562,18 @@ void prune_graph_gpu(raft::resources const& res, device_matrix_view_from_host d_input_graph( res, raft::make_host_matrix_view(knn_graph_ptr, graph_size, knn_graph_degree)); - auto d_output_graph = raft::make_device_mdarray( - res, default_ws_mr, raft::make_extents(batch_size, output_graph_degree)); auto d_invalid_neighbor_list = raft::make_device_scalar(res, 0u); bool output_device_accessible = is_ptr_device_accessible(output_graph_ptr); + auto d_output_graph = raft::make_device_mdarray( + res, + default_ws_mr, + raft::make_extents(output_device_accessible ? 
0 : batch_size, output_graph_degree)); for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { const uint32_t num_warps = 4; const dim3 threads_prune(raft::WarpSize * num_warps, 1, 1); - const dim3 blocks_prune(batch_size, 1, 1); + const dim3 blocks_prune(batch_size / num_warps, 1, 1); const size_t prune_smem_size = num_warps * knn_graph_degree * (sizeof(IdxT) + sizeof(uint32_t)); kern_fused_prune <<>>( @@ -1775,54 +1661,36 @@ void optimize(raft::resources const& res, raft::common::nvtx::range fun_scope( "cagra::graph::optimize(%zu, %zu, %u)", graph_size, knn_graph_degree, output_graph_degree); - // check if input and output are both device accessible - // in this case we assume data to be ONLY device accessible and not host accessible - // furthermore we ensure all large allocations to go to the large workspace resource - // and all small allocations to go to the default workspace resource - bool inout_device_accessible = false; - { - bool input_device_accessible = is_ptr_device_accessible(knn_graph.data_handle()); - bool output_device_accessible = is_ptr_device_accessible(new_graph.data_handle()); - RAFT_EXPECTS(input_device_accessible == output_device_accessible, - "Input and output must be either both device accessible or both host accessible"); - inout_device_accessible = input_device_accessible && output_device_accessible; - } - // MST optimization // currently, only using GPU path for MST optimization - auto p_mst_graph = raft::make_pinned_matrix(res, 0, 0); - auto p_mst_graph_num_edges = raft::make_pinned_vector(res, graph_size); - auto p_mst_graph_num_edges_ptr = p_mst_graph_num_edges.data_handle(); -#pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - p_mst_graph_num_edges_ptr[i] = 0; - } + auto mst_graph = raft::make_host_matrix(0, 0); + auto mst_graph_num_edges = raft::make_host_vector(0); + if (guarantee_connectivity) { + auto mst_graph_num_edges = raft::make_host_vector(graph_size); + auto mst_graph_num_edges_ptr = 
mst_graph_num_edges.data_handle(); +#pragma omp parallel for + for (uint64_t i = 0; i < graph_size; i++) { + mst_graph_num_edges_ptr[i] = 0; + } raft::common::nvtx::range block_scope( "cagra::graph::optimize/check_connectivity"); - p_mst_graph = raft::make_pinned_matrix( - res, graph_size, output_graph_degree); + mst_graph = + raft::make_host_matrix(graph_size, output_graph_degree); RAFT_LOG_INFO("MST optimization is used to guarantee graph connectivity."); - mst_optimization( - res, knn_graph, p_mst_graph.view(), p_mst_graph_num_edges.view(), use_gpu); + mst_optimization(res, knn_graph, mst_graph.view(), mst_graph_num_edges.view(), use_gpu); for (uint64_t i = 0; i < graph_size; i++) { if (i < 8 || i >= graph_size - 8) { - RAFT_LOG_DEBUG("# p_mst_graph_num_edges_ptr[%lu]: %u\n", i, p_mst_graph_num_edges_ptr[i]); + RAFT_LOG_DEBUG("# mst_graph_num_edges_ptr[%lu]: %u\n", i, mst_graph_num_edges_ptr[i]); } } } // prune graph -- will always use GPU path { - // should be noop in case input is already device accessible - device_matrix_view_from_host d_input_graph( - res, - raft::make_host_matrix_view( - knn_graph.data_handle(), graph_size, knn_graph_degree)); - prune_graph_gpu(res, - d_input_graph.data_handle(), + knn_graph.data_handle(), graph_size, knn_graph_degree, new_graph.data_handle(), @@ -1846,51 +1714,14 @@ void optimize(raft::resources const& res, RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms", (time_make_end - time_make_start) * 1000.0); - // merge graph -- will use GPU path if possible, otherwise CPU path - // we only need to check in case output is not already device accessible - bool use_gpu_merge = use_gpu; - if (!inout_device_accessible) { - try { - auto d_new_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); - } catch (std::bad_alloc& e) { - RAFT_LOG_DEBUG("Insufficient memory for merging on GPU"); - use_gpu_merge = false; - } catch (raft::logic_error& e) { - RAFT_LOG_DEBUG("Insufficient 
memory for merging on GPU (logic error)"); - use_gpu_merge = false; - } - } - - if (use_gpu_merge) { - // should be noop in case output is already device accessible - device_matrix_view_from_host d_new_graph( - res, - raft::make_host_matrix_view( - new_graph.data_handle(), graph_size, output_graph_degree)); - + // merge graph -- will always use GPU path + { merge_graph_gpu(res, - d_new_graph.data_handle(), + new_graph.data_handle(), d_rev_graph.data_handle(), d_rev_graph_count.data_handle(), - p_mst_graph.data_handle(), - p_mst_graph_num_edges.data_handle(), - graph_size, - output_graph_degree, - guarantee_connectivity); - - if (d_new_graph.allocated_memory()) { raft::copy(res, new_graph, d_new_graph.view()); } - } else { - auto rev_graph = raft::make_host_matrix(graph_size, output_graph_degree); - auto rev_graph_count = raft::make_host_vector(graph_size); - raft::copy(res, rev_graph.view(), d_rev_graph.view()); - raft::copy(res, rev_graph_count.view(), d_rev_graph_count.view()); - - merge_graph_cpu(new_graph.data_handle(), - rev_graph.data_handle(), - rev_graph_count.data_handle(), - p_mst_graph.data_handle(), - p_mst_graph_num_edges_ptr, + mst_graph.data_handle(), + mst_graph_num_edges.data_handle(), graph_size, output_graph_degree, guarantee_connectivity); From 00c42045aa9f0f7d148865ec7e570078e5f16658 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 6 Mar 2026 00:01:10 +0000 Subject: [PATCH 074/119] batch reverse creation --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 116 ++++++++---------- 1 file changed, 52 insertions(+), 64 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 392edc97d9..9f2bc09d86 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -157,38 +157,24 @@ __global__ void kern_sort(const DATA_T* const dataset, // [dataset_chunk_size, } } -template -__global__ void kern_make_rev_graph(const IdxT* const 
dest_nodes, // [graph_size] - IdxT* const rev_graph, // [size, degree] - uint32_t* const rev_graph_count, // [graph_size] - const uint32_t graph_size, - const uint32_t degree) -{ - const uint32_t tid = threadIdx.x + (blockDim.x * blockIdx.x); - const uint32_t tnum = blockDim.x * gridDim.x; - - for (uint32_t src_id = tid; src_id < graph_size; src_id += tnum) { - const IdxT dest_id = dest_nodes[src_id]; - if (dest_id >= graph_size) continue; - - const uint32_t pos = atomicAdd(rev_graph_count + dest_id, 1); - if (pos < degree) { rev_graph[pos + ((uint64_t)degree * dest_id)] = src_id; } - } -} - template -__global__ void kern_make_rev_graph_k(const IdxT* const dest_nodes, // [graph_size] - IdxT* const rev_graph, // [size, degree] - uint32_t* const rev_graph_count, // [graph_size] - const uint32_t graph_size, - const uint32_t degree, - uint64_t k) +__global__ void kern_rev_graph_batched(const IdxT* const dest_nodes, // [batch_size, degree] + IdxT* const rev_graph, // [graph_size, degree] + uint32_t* const rev_graph_count, // [graph_size] + const uint32_t graph_size, + const uint32_t degree, + const uint32_t batch_size, + const uint32_t batch_id) { const uint64_t tid = threadIdx.x + (blockDim.x * blockIdx.x); const uint64_t tnum = blockDim.x * gridDim.x; - for (uint64_t src_id = tid; src_id < graph_size; src_id += tnum) { - IdxT dest_id = dest_nodes[k + (degree * src_id)]; + const uint64_t block_batch_size = min(batch_size, graph_size - batch_id * batch_size); + + for (uint64_t idx = tid; idx < block_batch_size * degree; idx += tnum) { + const IdxT dest_id = dest_nodes[idx]; + const uint32_t src_id = idx / degree; + if (dest_id >= graph_size) continue; const uint32_t pos = atomicAdd(rev_graph_count + dest_id, 1); @@ -866,22 +852,18 @@ void merge_graph_gpu(raft::resources const& res, (merge_graph_end - merge_graph_start) * 1000.0); } -template +template void make_reverse_graph_gpu(raft::resources const& res, + IdxT* output_graph_ptr, IdxT* d_rev_graph_ptr, uint32_t* 
d_rev_graph_count_ptr, - InOutMatrixView new_graph) + uint64_t graph_size, + uint64_t output_graph_degree) { - const uint64_t graph_size = new_graph.extent(0); - const uint64_t output_graph_degree = new_graph.extent(1); - const IdxT* output_graph_ptr = new_graph.data_handle(); - raft::common::nvtx::range block_scope( "cagra::graph::optimize/reverse"); - auto dest_nodes = raft::make_host_vector(graph_size); - auto d_dest_nodes = raft::make_device_mdarray( - res, raft::resource::get_workspace_resource(res), raft::make_extents(graph_size)); + auto default_ws_mr = raft::resource::get_workspace_resource(res); raft::matrix::fill( res, @@ -893,36 +875,38 @@ void make_reverse_graph_gpu(raft::resources const& res, raft::make_device_vector_view(d_rev_graph_count_ptr, graph_size), uint32_t(0)); - bool output_graph_device_accessible = is_ptr_device_accessible(output_graph_ptr); - dim3 threads(256, 1, 1); - dim3 blocks(1024, 1, 1); + const uint32_t batch_size = + std::min(static_cast(graph_size), static_cast(256 * 1024)); + const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size; - for (uint64_t k = 0; k < output_graph_degree; k++) { - if (output_graph_device_accessible) { - kern_make_rev_graph_k<<>>( - output_graph_ptr, - d_rev_graph_ptr, - d_rev_graph_count_ptr, - static_cast(graph_size), - static_cast(output_graph_degree), - k); - } else { -#pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)]; - } - raft::resource::sync_stream(res); + bool output_device_accessible = is_ptr_device_accessible(output_graph_ptr); + auto d_output_graph = raft::make_device_mdarray( + res, + default_ws_mr, + raft::make_extents(output_device_accessible ? 
0 : batch_size, output_graph_degree)); - raft::copy(res, d_dest_nodes.view(), dest_nodes.view()); + for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { + dim3 threads(256, 1, 1); + dim3 blocks(1024, 1, 1); - kern_make_rev_graph<<>>( - d_dest_nodes.data_handle(), - d_rev_graph_ptr, - d_rev_graph_count_ptr, - static_cast(graph_size), - static_cast(output_graph_degree)); + if (!output_device_accessible) { + size_t copy_size = + std::min(static_cast(batch_size), graph_size - i_batch * batch_size) * + output_graph_degree; + raft::copy(d_output_graph.data_handle(), + output_graph_ptr + i_batch * batch_size * output_graph_degree, + copy_size, + raft::resource::get_cuda_stream(res)); } - RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %lu \r", k, output_graph_degree); + kern_rev_graph_batched<<>>( + output_device_accessible ? output_graph_ptr + (i_batch * batch_size * output_graph_degree) + : d_output_graph.data_handle(), + d_rev_graph_ptr, + d_rev_graph_count_ptr, + static_cast(graph_size), + static_cast(output_graph_degree), + static_cast(batch_size), + static_cast(i_batch)); } raft::resource::sync_stream(res); @@ -1707,8 +1691,12 @@ void optimize(raft::resources const& res, const double time_make_start = cur_time(); - make_reverse_graph_gpu( - res, d_rev_graph.data_handle(), d_rev_graph_count.data_handle(), new_graph); + make_reverse_graph_gpu(res, + new_graph.data_handle(), + d_rev_graph.data_handle(), + d_rev_graph_count.data_handle(), + graph_size, + output_graph_degree); const double time_make_end = cur_time(); RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms", From 9e63a7c442d6725703cbb52e575b68e2625f0694 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 6 Mar 2026 12:05:48 +0000 Subject: [PATCH 075/119] add prefetch view to handle managed & host --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 17 +- cpp/src/neighbors/detail/cagra/utils.hpp | 300 +++++++++++++++++- 2 files changed, 313 insertions(+), 4 deletions(-) diff --git 
a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 9f2bc09d86..28006fa133 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -1543,8 +1543,19 @@ void prune_graph_gpu(raft::resources const& res, auto host_stats = raft::make_host_vector(2); raft::matrix::fill(res, dev_stats.view(), uint64_t(0)); - device_matrix_view_from_host d_input_graph( - res, raft::make_host_matrix_view(knn_graph_ptr, graph_size, knn_graph_degree)); + // device_matrix_view_from_host d_input_graph( + // res, raft::make_host_matrix_view(knn_graph_ptr, graph_size, + // knn_graph_degree)); + + batched_device_view_from_host d_input_graph( + res, + raft::make_host_matrix_view(knn_graph_ptr, graph_size, knn_graph_degree), + /*batch_size*/ graph_size, + /*read_only*/ true, + /*host_writeback*/ false, + /*initialize*/ true, + /*evict*/ true); + auto input_view = d_input_graph.next_view(); auto d_invalid_neighbor_list = raft::make_device_scalar(res, 0u); @@ -1561,7 +1572,7 @@ void prune_graph_gpu(raft::resources const& res, const size_t prune_smem_size = num_warps * knn_graph_degree * (sizeof(IdxT) + sizeof(uint32_t)); kern_fused_prune <<>>( - d_input_graph.data_handle(), + input_view.data_handle(), output_device_accessible ? 
output_graph_ptr + i_batch * batch_size * output_graph_degree : d_output_graph.data_handle(), graph_size, diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp index a59ac7fd57..c3d15e59f4 100644 --- a/cpp/src/neighbors/detail/cagra/utils.hpp +++ b/cpp/src/neighbors/detail/cagra/utils.hpp @@ -9,9 +9,13 @@ #include #include #include +#include +#include +#include +#include +#include #include #include - #include #include @@ -308,4 +312,298 @@ void copy_with_padding( raft::resource::get_cuda_stream(res))); } } + +/** + * Utility to create a batched device view from a host view + * + * This utility will create a batched device view from a host view and will handle the prefetch and + * writeback of the data Each batch can be referenced exactlyonce by calling the next_view() + * function + * + * @tparam T The type of the data + * @tparam IdxT The type of the index + * @param res The resources + * @param host_view The host view to create the batched device view from + * @param batch_size The batch size + * @param read_only Whether the data is read only (only for managed memory) + * @param host_writeback Whether to write back the data to the host (only for host memory) + * @param initialize Whether to initialize the data (only for managed memory) + * @param evict Whether to evict the data (only for managed memory) + * + * @return The batched device view + */ +template +class batched_device_view_from_host { + public: + batched_device_view_from_host(raft::resources const& res, + raft::host_matrix_view host_view, + uint64_t batch_size, + bool read_only = false, + bool host_writeback = false, + bool initialize = true, + bool evict = false) + : res_(res), + host_view_(host_view), + batch_size_(batch_size), + offset_(0), + batch_id_(0), + num_buffers_(2), + read_only_(read_only), + host_writeback_(host_writeback), + next_buffer_pos_(0), + evict_(evict), + initialize_(initialize) + { + cudaPointerAttributes attr; + 
RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle())); + mem_type_ = attr.type; + // cudaMemoryTypeUnregistered = 0 + // cudaMemoryTypeHost = 1 + // cudaMemoryTypeDevice = 2 + // cudaMemoryTypeManaged = 3 + + prefetch_stream_ = raft::resource::get_cuda_stream(res); + writeback_stream_ = raft::resource::get_cuda_stream(res); + if (res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL)) { + if (raft::resource::get_stream_pool_size(res) >= 1) { + prefetch_stream_ = raft::resource::get_stream_from_stream_pool(res); + writeback_stream_ = raft::resource::get_stream_from_stream_pool(res); + } + } + + // allocations + if (mem_type_ == cudaMemoryTypeHost || mem_type_ == cudaMemoryTypeUnregistered) { + device_mem_[0].emplace(raft::make_device_mdarray( + res, + raft::resource::get_large_workspace_resource(res), + raft::make_extents(batch_size, host_view.extent(1)))); + device_ptr[0] = device_mem_[0]->data_handle(); + if (batch_size < static_cast(host_view.extent(0))) { + device_mem_[1].emplace(raft::make_device_mdarray( + res, + raft::resource::get_large_workspace_resource(res), + raft::make_extents(batch_size, host_view.extent(1)))); + device_ptr[1] = device_mem_[1]->data_handle(); + } + if (host_writeback_ && batch_size * 2 < static_cast(host_view.extent(0))) { + num_buffers_ = 3; + device_mem_[2].emplace(raft::make_device_mdarray( + res, + raft::resource::get_large_workspace_resource(res), + raft::make_extents(batch_size, host_view.extent(1)))); + device_ptr[2] = device_mem_[2]->data_handle(); + } + } + + // if data is managed and not for_write_ we can set the attribute on the device ptr + if (mem_type_ == cudaMemoryTypeManaged) { + // location_.type = CU_MEM_LOCATION_TYPE_DEVICE; + location_.type = cudaMemLocationTypeDevice; + location_.id = static_cast(raft::resource::get_device_id(res_)); + if (read_only_) { +#if CUDA_VERSION >= 13000 + RAFT_CUDA_TRY(cudaMemAdvise(host_view_.data_handle(), + host_view_.extent(0) * 
host_view_.extent(1) * sizeof(T), + cudaMemAdviseSetReadMostly, + location_)); +#else + RAFT_CUDA_TRY(cudaMemAdvise_v2(host_view_.data_handle(), + host_view_.extent(0) * host_view_.extent(1) * sizeof(T), + cudaMemAdviseSetReadMostly, + location_)); +#endif + // TODO maybe also reset upon destruction + } + } + + // prefetch next batch (0) + prefetch_next_batch(); + } + + bool prefetch_next_batch() + { + // this function will ensure the device_ptr [next_buffer_pos_] is pointing to the correct memory + // after the next synchronization with the prefetch stream + + // if data is on host and we are writing to it we will have to copy it back + // if data is on host we will have to copy it to the device_ptr + + // if data is managed and evict_ is true we can evict the data from device memory + // if data is managed we have to prefetch it + + bool next_batch_exists = offset_ < static_cast(host_view_.extent(0)); + + if (next_batch_exists) { + actual_batch_size_[next_buffer_pos_] = + next_batch_exists ? 
min(batch_size_, host_view_.extent(0) - offset_) : 0; + + switch (mem_type_) { + case cudaMemoryTypeManaged: +#if CUDA_VERSION >= 13000 + if (evict_ && batch_id_ > 1) { + // evict last active + CUdeviceptr dptrs[] = {device_ptr[next_buffer_pos_]}; + size_t sizes[] = {batch_size_ * host_view_.extent(1) * sizeof(T)}; + size_t prefetchLocIdxs[] = {0}; + RAFT_CUDA_TRY(cuMemDiscardBatchAsync( + dptrs, sizes, 1, &location_, prefetchLocIdxs, 1, 0, prefetch_stream_)); + } +#endif + // prefetch + device_ptr[next_buffer_pos_] = host_view_.data_handle() + offset_ * host_view_.extent(1); + if (initialize_) { + // managed API call to prefetch async +#if CUDA_VERSION >= 13000 + RAFT_CUDA_TRY(cudaMemPrefetchAsync( + device_ptr[next_buffer_pos_], + actual_batch_size_[next_buffer_pos_] * host_view_.extent(1) * sizeof(T), + location_, + 0, + prefetch_stream_)); +#else + RAFT_CUDA_TRY(cudaMemPrefetchAsync_v2( + device_ptr[next_buffer_pos_], + actual_batch_size_[next_buffer_pos_] * host_view_.extent(1) * sizeof(T), + location_, + 0, + prefetch_stream_)); +#endif + } else { + // managed API call to cuMemDiscardAndPrefetchBatchAsync (discard and prefetch batch) +#if CUDA_VERSION >= 13000 + CUdeviceptr dptrs[] = {device_ptr[next_buffer_pos_]}; + size_t sizes[] = {actual_batch_size_[next_buffer_pos_] * host_view_.extent(1) * + sizeof(T)}; + size_t prefetchLocIdxs[] = {0}; + RAFT_CUDA_TRY(cuMemDiscardAndPrefetchBatchAsync( + dptrs, sizes, 1, &location_, prefetchLocIdxs, 1, 0, prefetch_stream_)); +#endif + } + + break; + case cudaMemoryTypeHost: + case cudaMemoryTypeUnregistered: + if (host_writeback_ && batch_id_ > 1) { + writeback_stream_.synchronize(); + // copy back last active + uint32_t writeback_pos = (next_buffer_pos_ + num_buffers_ - 2) % num_buffers_; + uint64_t writeback_offset = (offset_ - 2 * batch_size_) * host_view_.extent(1); + raft::copy(host_view_.data_handle() + writeback_offset, + device_ptr[writeback_pos], + actual_batch_size_[writeback_pos] * host_view_.extent(1), + 
writeback_stream_); + } + if (initialize_) { + // prefetch next position + raft::copy(device_ptr[next_buffer_pos_], + host_view_.data_handle() + offset_ * host_view_.extent(1), + actual_batch_size_[next_buffer_pos_] * host_view_.extent(1), + prefetch_stream_); + } + + break; + case cudaMemoryTypeDevice: + // just move pointer to next position + device_ptr[next_buffer_pos_] = host_view_.data_handle() + offset_ * host_view_.extent(1); + break; + } + + offset_ += actual_batch_size_[next_buffer_pos_]; + // swap next_buffer_pos_ + next_buffer_pos_ = (next_buffer_pos_ + 1) % num_buffers_; + } + + return next_batch_exists; + } + + ~batched_device_view_from_host() noexcept + { + prefetch_stream_.synchronize(); + writeback_stream_.synchronize(); + raft::resource::sync_stream(res_); + + // if data is on host and for_write --> make sure to copy back last active + // if data is managed and evict --> evict last active + + // make sure to sync on prefetch & writeback stream & res + switch (mem_type_) { + case cudaMemoryTypeManaged: +#if CUDA_VERSION >= 13000 + if (evict_ && batch_id_ > 0) { + // managed API call to evict 2 + uint32_t evict_pos = (next_buffer_pos_ + num_buffers_ - 1) % num_buffers_; + CUdeviceptr dptrs[] = {device_ptr[evict_pos]}; + size_t sizes[] = {batch_size_ * host_view_.extent(1) * sizeof(T)}; + size_t prefetchLocIdxs[] = {0}; + RAFT_CUDA_TRY(cuMemDiscardBatchAsync( + dptrs, sizes, 1, &location_, prefetchLocIdxs, 1, 0, prefetch_stream_)); + } + prefetch_stream_.synchronize(); +#endif + break; + case cudaMemoryTypeHost: + case cudaMemoryTypeUnregistered: + if (host_writeback_ && batch_id_ > 0) { + // TODO managed API call to copy back last active + uint32_t writeback_pos = (next_buffer_pos_ + num_buffers_ - 1) % num_buffers_; + uint64_t writeback_offset = + (offset_ - actual_batch_size_[writeback_pos]) * host_view_.extent(1); + raft::copy(host_view_.data_handle() + writeback_offset, + device_ptr[writeback_pos], + actual_batch_size_[writeback_pos] * 
host_view_.extent(1), + writeback_stream_); + } + writeback_stream_.synchronize(); + break; + case cudaMemoryTypeDevice: break; + } + } + + /** + * Returns the next view of the batch + * + * This function will ensure the next batch is ready and will trigger the prefetch of the + * subsequent next batch + * + * @return The next view of the batch + */ + raft::device_matrix_view next_view() + { + RAFT_EXPECTS(batch_id_ * batch_size_ < host_view_.extent(0), "Batch index out of bounds"); + + // ensure current batch is ready + prefetch_stream_.synchronize(); + + // trigger prefetch of next batch + bool next_batch_exists = prefetch_next_batch(); + + batch_id_++; + + uint32_t current_pos = + (next_buffer_pos_ + num_buffers_ - (next_batch_exists ? 2 : 1)) % num_buffers_; + return raft::make_device_matrix_view( + device_ptr[current_pos], actual_batch_size_[current_pos], host_view_.extent(1)); + } + + private: + cudaMemoryType mem_type_; + const raft::resources& res_; + uint64_t batch_size_; + uint64_t offset_; + uint64_t num_buffers_; + bool initialize_; + rmm::cuda_stream_view prefetch_stream_; + rmm::cuda_stream_view writeback_stream_; + bool read_only_; + bool host_writeback_; + bool evict_; + int32_t next_buffer_pos_; + int32_t batch_id_; + cudaMemLocation location_; + std::optional> device_mem_[3]; + raft::host_matrix_view host_view_; + T* device_ptr[3]; + uint32_t actual_batch_size_[3]; +}; + } // namespace cuvs::neighbors::cagra::detail From a38ad525570d31882a1c86ff04eb679a6b1c4476 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 9 Mar 2026 20:49:08 +0000 Subject: [PATCH 076/119] fix batched iterator --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 123 +++---- cpp/src/neighbors/detail/cagra/utils.hpp | 313 ++++++++++-------- 2 files changed, 233 insertions(+), 203 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 28006fa133..ef8b1f8daf 100644 --- 
a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -22,6 +22,7 @@ #include #include +#include #include @@ -324,14 +325,14 @@ __device__ void thread_shift_array(T* array, uint64_t num) } template -__global__ void kern_merge_graph(IdxT* output_graph, // [batch_size, output_graph_degree] - const IdxT* const rev_graph, +__global__ void kern_merge_graph(IdxT* output_graph, // [batch_size, output_graph_degree] + const IdxT* const rev_graph, // [graph_size, output_graph_degree] uint32_t* const rev_graph_count, // [graph_size] const uint32_t graph_size, const uint32_t output_graph_degree, - const IdxT* const mst_graph, + const IdxT* const mst_graph, // [batch_size, output_graph_degree] const uint32_t mst_graph_degree, - const uint32_t* const mst_graph_num_edges_ptr, + const uint32_t* const mst_graph_num_edges_ptr, // [batch_size] const uint32_t batch_size, const uint32_t batch_id, bool guarantee_connectivity, @@ -350,12 +351,12 @@ __global__ void kern_merge_graph(IdxT* output_graph, // [batch_size, output_gra if (nid >= graph_size) { return; } - const auto mst_graph_num_edges = guarantee_connectivity ? mst_graph_num_edges_ptr[nid] : 0; + const auto mst_graph_num_edges = guarantee_connectivity ? mst_graph_num_edges_ptr[nid_batch] : 0; // If guarantee_connectivity == true, use a temporal list to merge the // neighbor lists of the graphs. 
if (guarantee_connectivity) { for (uint32_t i = lane_id; i < mst_graph_degree; i += raft::WarpSize) { - smem_sorted_output_graph[i] = mst_graph[nid * mst_graph_degree + i]; + smem_sorted_output_graph[i] = mst_graph[nid_batch * mst_graph_degree + i]; } __syncwarp(); for (uint32_t pruned_j = 0, output_j = mst_graph_num_edges; @@ -788,52 +789,54 @@ void merge_graph_gpu(raft::resources const& res, std::min(static_cast(graph_size), static_cast(256 * 1024)); const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size; - bool output_device_accessible = is_ptr_device_accessible(output_graph_ptr); - auto d_output_graph = raft::make_device_mdarray( + batched_device_view_from_host d_output_graph( res, - default_ws_mr, - raft::make_extents(output_device_accessible ? 0 : batch_size, output_graph_degree)); + raft::make_host_matrix_view(output_graph_ptr, graph_size, output_graph_degree), + /*batch_size*/ batch_size, + /*host_writeback*/ true, + /*initialize*/ true, + /*hmm_as_managed*/ false); - device_matrix_view_from_host d_mst_graph( + batched_device_view_from_host d_mst_graph( res, raft::make_host_matrix_view( - mst_graph_ptr, guarantee_connectivity ? graph_size : 0, output_graph_degree)); + mst_graph_ptr, guarantee_connectivity ? graph_size : 0, output_graph_degree), + /*batch_size*/ batch_size, + /*host_writeback*/ false, + /*initialize*/ true, + /*hmm_as_managed*/ false); - device_matrix_view_from_host d_mst_graph_num_edges( + batched_device_view_from_host d_mst_graph_num_edges( res, - raft::make_host_matrix_view( - mst_graph_num_edges_ptr, guarantee_connectivity ? graph_size : 0, 1)); + raft::make_host_matrix_view( + mst_graph_ptr, guarantee_connectivity ? 
graph_size : 0, output_graph_degree), + /*batch_size*/ batch_size, + /*host_writeback*/ false, + /*initialize*/ true, + /*hmm_as_managed*/ false); const uint32_t num_warps = 4; const dim3 threads_merge(raft::WarpSize * num_warps, 1, 1); - const dim3 blocks_merge(batch_size / num_warps, 1, 1); + const dim3 blocks_merge(raft::ceildiv(batch_size, num_warps), 1, 1); const size_t merge_smem_size = num_warps * output_graph_degree * sizeof(IdxT); for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { + auto mst_graph_view = d_mst_graph.next_view(); + auto mst_graph_num_edges_view = d_mst_graph_num_edges.next_view(); + auto output_view = d_output_graph.next_view(); kern_merge_graph <<>>( - output_device_accessible ? output_graph_ptr + (i_batch * batch_size * output_graph_degree) - : d_output_graph.data_handle(), + output_view.data_handle(), d_rev_graph_ptr, d_rev_graph_count_ptr, static_cast(graph_size), static_cast(output_graph_degree), - d_mst_graph.data_handle(), + mst_graph_view.data_handle(), static_cast(output_graph_degree), - d_mst_graph_num_edges.data_handle(), + mst_graph_num_edges_view.data_handle(), batch_size, i_batch, guarantee_connectivity, d_check_num_protected_edges.data_handle()); - - if (!output_device_accessible) { - size_t copy_size = - std::min(static_cast(batch_size), graph_size - i_batch * batch_size) * - output_graph_degree; - raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree, - d_output_graph.data_handle(), - copy_size, - raft::resource::get_cuda_stream(res)); - } } bool check_num_protected_edges = true; @@ -879,28 +882,21 @@ void make_reverse_graph_gpu(raft::resources const& res, std::min(static_cast(graph_size), static_cast(256 * 1024)); const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size; - bool output_device_accessible = is_ptr_device_accessible(output_graph_ptr); - auto d_output_graph = raft::make_device_mdarray( + batched_device_view_from_host d_output_graph( res, - default_ws_mr, - 
raft::make_extents(output_device_accessible ? 0 : batch_size, output_graph_degree)); + raft::make_host_matrix_view(output_graph_ptr, graph_size, output_graph_degree), + /*batch_size*/ batch_size, + /*host_writeback*/ false, + /*initialize*/ true, + /*hmm_as_managed*/ false); for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { dim3 threads(256, 1, 1); dim3 blocks(1024, 1, 1); + auto output_view = d_output_graph.next_view(); - if (!output_device_accessible) { - size_t copy_size = - std::min(static_cast(batch_size), graph_size - i_batch * batch_size) * - output_graph_degree; - raft::copy(d_output_graph.data_handle(), - output_graph_ptr + i_batch * batch_size * output_graph_degree, - copy_size, - raft::resource::get_cuda_stream(res)); - } kern_rev_graph_batched<<>>( - output_device_accessible ? output_graph_ptr + (i_batch * batch_size * output_graph_degree) - : d_output_graph.data_handle(), + output_view.data_handle(), d_rev_graph_ptr, d_rev_graph_count_ptr, static_cast(graph_size), @@ -1543,38 +1539,35 @@ void prune_graph_gpu(raft::resources const& res, auto host_stats = raft::make_host_vector(2); raft::matrix::fill(res, dev_stats.view(), uint64_t(0)); - // device_matrix_view_from_host d_input_graph( - // res, raft::make_host_matrix_view(knn_graph_ptr, graph_size, - // knn_graph_degree)); - batched_device_view_from_host d_input_graph( res, raft::make_host_matrix_view(knn_graph_ptr, graph_size, knn_graph_degree), /*batch_size*/ graph_size, - /*read_only*/ true, /*host_writeback*/ false, /*initialize*/ true, - /*evict*/ true); + /*hmm_as_managed*/ true); auto input_view = d_input_graph.next_view(); - auto d_invalid_neighbor_list = raft::make_device_scalar(res, 0u); - - bool output_device_accessible = is_ptr_device_accessible(output_graph_ptr); - auto d_output_graph = raft::make_device_mdarray( + batched_device_view_from_host d_output_graph( res, - default_ws_mr, - raft::make_extents(output_device_accessible ? 
0 : batch_size, output_graph_degree)); + raft::make_host_matrix_view(output_graph_ptr, graph_size, output_graph_degree), + /*batch_size*/ batch_size, + /*host_writeback*/ true, + /*initialize*/ false, + /*hmm_as_managed*/ false); + + auto d_invalid_neighbor_list = raft::make_device_scalar(res, 0u); for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { + auto output_view = d_output_graph.next_view(); const uint32_t num_warps = 4; const dim3 threads_prune(raft::WarpSize * num_warps, 1, 1); - const dim3 blocks_prune(batch_size / num_warps, 1, 1); + const dim3 blocks_prune(raft::ceildiv(batch_size, num_warps), 1, 1); const size_t prune_smem_size = num_warps * knn_graph_degree * (sizeof(IdxT) + sizeof(uint32_t)); kern_fused_prune <<>>( input_view.data_handle(), - output_device_accessible ? output_graph_ptr + i_batch * batch_size * output_graph_degree - : d_output_graph.data_handle(), + output_view.data_handle(), graph_size, knn_graph_degree, output_graph_degree, @@ -1583,16 +1576,6 @@ void prune_graph_gpu(raft::resources const& res, d_invalid_neighbor_list.data_handle(), dev_stats.data_handle()); - if (!output_device_accessible) { - size_t copy_size = - std::min(static_cast(batch_size), graph_size - i_batch * batch_size) * - output_graph_degree; - raft::copy(output_graph_ptr + i_batch * batch_size * output_graph_degree, - d_output_graph.data_handle(), - copy_size, - raft::resource::get_cuda_stream(res)); - } - raft::resource::sync_stream(res); RAFT_LOG_DEBUG( "# Pruning kNN Graph on GPUs (%.1lf %%)\r", diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp index c3d15e59f4..df6ef1ce6f 100644 --- a/cpp/src/neighbors/detail/cagra/utils.hpp +++ b/cpp/src/neighbors/detail/cagra/utils.hpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -23,6 +24,7 @@ #include #include +#include #include namespace cuvs::neighbors::cagra::detail { @@ -328,7 +330,7 @@ void copy_with_padding( * @param read_only Whether 
the data is read only (only for managed memory) * @param host_writeback Whether to write back the data to the host (only for host memory) * @param initialize Whether to initialize the data (only for managed memory) - * @param evict Whether to evict the data (only for managed memory) + * @param discard Whether to discard the data (only for managed memory) * * @return The batched device view */ @@ -338,22 +340,24 @@ class batched_device_view_from_host { batched_device_view_from_host(raft::resources const& res, raft::host_matrix_view host_view, uint64_t batch_size, - bool read_only = false, bool host_writeback = false, bool initialize = true, - bool evict = false) + bool hmm_as_managed = false) : res_(res), host_view_(host_view), batch_size_(batch_size), offset_(0), - batch_id_(0), + batch_id_(-2), num_buffers_(2), - read_only_(read_only), host_writeback_(host_writeback), - next_buffer_pos_(0), - evict_(evict), - initialize_(initialize) + initialize_(initialize), + hmm_as_managed_(hmm_as_managed) { + if (host_view.extent(0) == 0) { + mem_type_ = cudaMemoryTypeDevice; + return; + } + cudaPointerAttributes attr; RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle())); mem_type_ = attr.type; @@ -361,27 +365,35 @@ class batched_device_view_from_host { // cudaMemoryTypeHost = 1 // cudaMemoryTypeDevice = 2 // cudaMemoryTypeManaged = 3 + // + // On HMM systems, unregistered (malloc) memory can have devicePointer != nullptr, + // meaning it's directly accessible from the GPU. 
Treat it like managed memory: + if (mem_type_ == cudaMemoryTypeUnregistered && attr.devicePointer != nullptr && + hmm_as_managed) { + mem_type_ = cudaMemoryTypeManaged; + } - prefetch_stream_ = raft::resource::get_cuda_stream(res); - writeback_stream_ = raft::resource::get_cuda_stream(res); - if (res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL)) { - if (raft::resource::get_stream_pool_size(res) >= 1) { - prefetch_stream_ = raft::resource::get_stream_from_stream_pool(res); - writeback_stream_ = raft::resource::get_stream_from_stream_pool(res); - } + if (res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL) && + raft::resource::get_stream_pool_size(res) >= 1) { + prefetch_stream_ = raft::resource::get_stream_from_stream_pool(res); + writeback_stream_ = raft::resource::get_stream_from_stream_pool(res); + } else { + local_stream_pool_ = std::make_shared(2); + prefetch_stream_ = local_stream_pool_.value()->get_stream(); + writeback_stream_ = local_stream_pool_.value()->get_stream(); } // allocations if (mem_type_ == cudaMemoryTypeHost || mem_type_ == cudaMemoryTypeUnregistered) { device_mem_[0].emplace(raft::make_device_mdarray( res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_workspace_resource(res), raft::make_extents(batch_size, host_view.extent(1)))); device_ptr[0] = device_mem_[0]->data_handle(); if (batch_size < static_cast(host_view.extent(0))) { device_mem_[1].emplace(raft::make_device_mdarray( res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_workspace_resource(res), raft::make_extents(batch_size, host_view.extent(1)))); device_ptr[1] = device_mem_[1]->data_handle(); } @@ -389,7 +401,7 @@ class batched_device_view_from_host { num_buffers_ = 3; device_mem_[2].emplace(raft::make_device_mdarray( res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_workspace_resource(res), raft::make_extents(batch_size, host_view.extent(1)))); 
device_ptr[2] = device_mem_[2]->data_handle(); } @@ -400,18 +412,9 @@ class batched_device_view_from_host { // location_.type = CU_MEM_LOCATION_TYPE_DEVICE; location_.type = cudaMemLocationTypeDevice; location_.id = static_cast(raft::resource::get_device_id(res_)); - if (read_only_) { -#if CUDA_VERSION >= 13000 - RAFT_CUDA_TRY(cudaMemAdvise(host_view_.data_handle(), - host_view_.extent(0) * host_view_.extent(1) * sizeof(T), - cudaMemAdviseSetReadMostly, - location_)); -#else - RAFT_CUDA_TRY(cudaMemAdvise_v2(host_view_.data_handle(), - host_view_.extent(0) * host_view_.extent(1) * sizeof(T), - cudaMemAdviseSetReadMostly, - location_)); -#endif + if (!host_writeback_) { + advise_read_mostly(host_view_.data_handle(), + host_view_.extent(0) * host_view_.extent(1) * sizeof(T)); // TODO maybe also reset upon destruction } } @@ -422,95 +425,72 @@ class batched_device_view_from_host { bool prefetch_next_batch() { - // this function will ensure the device_ptr [next_buffer_pos_] is pointing to the correct memory - // after the next synchronization with the prefetch stream + batch_id_++; + + // ensure previous batch at position batch_id_ is ready + prefetch_stream_.synchronize(); + if (host_writeback_) { writeback_stream_.synchronize(); } - // if data is on host and we are writing to it we will have to copy it back - // if data is on host we will have to copy it to the device_ptr + // this step will + // * write back data from batch_id_ - 1 + // * prefetch data for batch_id_ + 1 - // if data is managed and evict_ is true we can evict the data from device memory - // if data is managed we have to prefetch it + // if data is on host and host_writeback_ is true we will have to copy it back + // if data is on host and initialize_ is true we will have to copy it to the device_ptr + + // if data is managed and !host_writeback_ we can discard the data from device memory + // if data is managed and initialize_ is true we can prefetch it to the device + // if data is managed and 
!initialize_ we can discard and prefetch the data location + + // if data is on device only this is almost a noop, just prepping the pointers + + RAFT_EXPECTS(offset_ <= host_view_.extent(0), "Offset out of bounds"); bool next_batch_exists = offset_ < static_cast(host_view_.extent(0)); if (next_batch_exists) { - actual_batch_size_[next_buffer_pos_] = - next_batch_exists ? min(batch_size_, host_view_.extent(0) - offset_) : 0; + // synchronize to ensure all previous operations are completed + // in particular all work on batch_id_ - 1 + raft::resource::sync_stream(res_); + + int32_t prefetch_pos = (batch_id_ + 1) % num_buffers_; + actual_batch_size_[prefetch_pos] = min(batch_size_, host_view_.extent(0) - offset_); switch (mem_type_) { case cudaMemoryTypeManaged: -#if CUDA_VERSION >= 13000 - if (evict_ && batch_id_ > 1) { - // evict last active - CUdeviceptr dptrs[] = {device_ptr[next_buffer_pos_]}; - size_t sizes[] = {batch_size_ * host_view_.extent(1) * sizeof(T)}; - size_t prefetchLocIdxs[] = {0}; - RAFT_CUDA_TRY(cuMemDiscardBatchAsync( - dptrs, sizes, 1, &location_, prefetchLocIdxs, 1, 0, prefetch_stream_)); + if (!host_writeback_ && batch_id_ > 1) { + uint32_t discard_pos = (batch_id_ - 1) % num_buffers_; + size_t discard_size = batch_size_ * host_view_.extent(1) * sizeof(T); + discard_managed_region(device_ptr[discard_pos], discard_size); } -#endif - // prefetch - device_ptr[next_buffer_pos_] = host_view_.data_handle() + offset_ * host_view_.extent(1); - if (initialize_) { - // managed API call to prefetch async -#if CUDA_VERSION >= 13000 - RAFT_CUDA_TRY(cudaMemPrefetchAsync( - device_ptr[next_buffer_pos_], - actual_batch_size_[next_buffer_pos_] * host_view_.extent(1) * sizeof(T), - location_, - 0, - prefetch_stream_)); -#else - RAFT_CUDA_TRY(cudaMemPrefetchAsync_v2( - device_ptr[next_buffer_pos_], - actual_batch_size_[next_buffer_pos_] * host_view_.extent(1) * sizeof(T), - location_, - 0, - prefetch_stream_)); -#endif - } else { - // managed API call to 
cuMemDiscardAndPrefetchBatchAsync (discard and prefetch batch) -#if CUDA_VERSION >= 13000 - CUdeviceptr dptrs[] = {device_ptr[next_buffer_pos_]}; - size_t sizes[] = {actual_batch_size_[next_buffer_pos_] * host_view_.extent(1) * - sizeof(T)}; - size_t prefetchLocIdxs[] = {0}; - RAFT_CUDA_TRY(cuMemDiscardAndPrefetchBatchAsync( - dptrs, sizes, 1, &location_, prefetchLocIdxs, 1, 0, prefetch_stream_)); -#endif - } - + // prefetch next position + device_ptr[prefetch_pos] = host_view_.data_handle() + offset_ * host_view_.extent(1); + prefetch_managed_region( + device_ptr[prefetch_pos], + actual_batch_size_[prefetch_pos] * host_view_.extent(1) * sizeof(T)); break; case cudaMemoryTypeHost: case cudaMemoryTypeUnregistered: - if (host_writeback_ && batch_id_ > 1) { - writeback_stream_.synchronize(); + if (host_writeback_ && batch_id_ > 0) { // copy back last active - uint32_t writeback_pos = (next_buffer_pos_ + num_buffers_ - 2) % num_buffers_; - uint64_t writeback_offset = (offset_ - 2 * batch_size_) * host_view_.extent(1); - raft::copy(host_view_.data_handle() + writeback_offset, - device_ptr[writeback_pos], - actual_batch_size_[writeback_pos] * host_view_.extent(1), - writeback_stream_); + uint32_t writeback_pos = (batch_id_ - 1) % num_buffers_; + uint64_t writeback_offset = (batch_id_ - 1) * batch_size_; + writeback_from_device_to_host(device_ptr[writeback_pos], writeback_offset, batch_size_); } if (initialize_) { // prefetch next position - raft::copy(device_ptr[next_buffer_pos_], - host_view_.data_handle() + offset_ * host_view_.extent(1), - actual_batch_size_[next_buffer_pos_] * host_view_.extent(1), - prefetch_stream_); + prefetch_from_host_to_device( + device_ptr[prefetch_pos], offset_, actual_batch_size_[prefetch_pos]); } break; case cudaMemoryTypeDevice: // just move pointer to next position - device_ptr[next_buffer_pos_] = host_view_.data_handle() + offset_ * host_view_.extent(1); + device_ptr[prefetch_pos] = host_view_.data_handle() + offset_ * 
host_view_.extent(1); break; } - offset_ += actual_batch_size_[next_buffer_pos_]; - // swap next_buffer_pos_ - next_buffer_pos_ = (next_buffer_pos_ + 1) % num_buffers_; + offset_ += actual_batch_size_[prefetch_pos]; } return next_batch_exists; @@ -525,33 +505,36 @@ class batched_device_view_from_host { // if data is on host and for_write --> make sure to copy back last active // if data is managed and evict --> evict last active - // make sure to sync on prefetch & writeback stream & res + // make sure to sync on prefetch stream & res switch (mem_type_) { case cudaMemoryTypeManaged: -#if CUDA_VERSION >= 13000 - if (evict_ && batch_id_ > 0) { - // managed API call to evict 2 - uint32_t evict_pos = (next_buffer_pos_ + num_buffers_ - 1) % num_buffers_; - CUdeviceptr dptrs[] = {device_ptr[evict_pos]}; - size_t sizes[] = {batch_size_ * host_view_.extent(1) * sizeof(T)}; - size_t prefetchLocIdxs[] = {0}; - RAFT_CUDA_TRY(cuMemDiscardBatchAsync( - dptrs, sizes, 1, &location_, prefetchLocIdxs, 1, 0, prefetch_stream_)); + if (!host_writeback_) { + uint32_t discard_pos = batch_id_ % num_buffers_; + size_t discard_size_rows = actual_batch_size_[discard_pos]; + if (batch_id_ > 0) { + discard_pos = (batch_id_ - 1) % num_buffers_; + discard_size_rows += batch_size_; + } + discard_managed_region(device_ptr[discard_pos], + discard_size_rows * host_view_.extent(1) * sizeof(T)); } - prefetch_stream_.synchronize(); -#endif + writeback_stream_.synchronize(); break; case cudaMemoryTypeHost: case cudaMemoryTypeUnregistered: - if (host_writeback_ && batch_id_ > 0) { - // TODO managed API call to copy back last active - uint32_t writeback_pos = (next_buffer_pos_ + num_buffers_ - 1) % num_buffers_; - uint64_t writeback_offset = - (offset_ - actual_batch_size_[writeback_pos]) * host_view_.extent(1); - raft::copy(host_view_.data_handle() + writeback_offset, - device_ptr[writeback_pos], - actual_batch_size_[writeback_pos] * host_view_.extent(1), - writeback_stream_); + if (host_writeback_) { + 
uint32_t writeback_pos_last = batch_id_ % num_buffers_; + if (batch_id_ > 0) { + uint32_t writeback_pos = (batch_id_ - 1) % num_buffers_; + uint64_t writeback_offset = (batch_id_ - 1) * batch_size_; + writeback_from_device_to_host(device_ptr[writeback_pos], writeback_offset, batch_size_); + } + { + uint64_t writeback_offset_last = batch_id_ * batch_size_; + writeback_from_device_to_host(device_ptr[writeback_pos_last], + writeback_offset_last, + actual_batch_size_[writeback_pos_last]); + } } writeback_stream_.synchronize(); break; @@ -569,39 +552,103 @@ class batched_device_view_from_host { */ raft::device_matrix_view next_view() { - RAFT_EXPECTS(batch_id_ * batch_size_ < host_view_.extent(0), "Batch index out of bounds"); - - // ensure current batch is ready - prefetch_stream_.synchronize(); + // special case for empty host view + if (host_view_.extent(0) == 0) { + return raft::make_device_matrix_view(nullptr, 0, host_view_.extent(1)); + } // trigger prefetch of next batch bool next_batch_exists = prefetch_next_batch(); - batch_id_++; + RAFT_EXPECTS(batch_id_ * batch_size_ < host_view_.extent(0), "Batch index out of bounds"); - uint32_t current_pos = - (next_buffer_pos_ + num_buffers_ - (next_batch_exists ? 
2 : 1)) % num_buffers_; + uint32_t current_pos = batch_id_ % num_buffers_; return raft::make_device_matrix_view( device_ptr[current_pos], actual_batch_size_[current_pos], host_view_.extent(1)); } private: - cudaMemoryType mem_type_; - const raft::resources& res_; - uint64_t batch_size_; - uint64_t offset_; - uint64_t num_buffers_; - bool initialize_; + void advise_read_mostly(T* ptr, size_t size) + { +#if CUDA_VERSION >= 13000 + RAFT_CUDA_TRY(cudaMemAdvise(ptr, size, cudaMemAdviseSetReadMostly, location_)); +#else + RAFT_CUDA_TRY(cudaMemAdvise_v2(ptr, size, cudaMemAdviseSetReadMostly, location_)); +#endif + } + + void discard_managed_region(T* dev_ptr, size_t size) + { +#if CUDA_VERSION >= 13000 + void* dptrs[1] = {dev_ptr}; + size_t sizes[1] = {size}; + RAFT_CUDA_TRY(cudaMemDiscardBatchAsync(dptrs, sizes, 1, 0, writeback_stream_)); +#endif + // FIXME: CUDA12 does not support discard + } + + void prefetch_managed_region(T* dev_ptr, size_t size) + { +#if CUDA_VERSION >= 13000 + if (initialize_) { + RAFT_CUDA_TRY(cudaMemPrefetchAsync(dev_ptr, size, location_, 0, prefetch_stream_)); + } else { + void* dptrs[1] = {dev_ptr}; + size_t sizes[1] = {size}; + RAFT_CUDA_TRY( + cudaMemDiscardAndPrefetchBatchAsync(dptrs, sizes, 1, location_, 0, prefetch_stream_)); + } +#else + // FIXME: CUDA12 does not support discard - so we just prefetch + if (initialize_) { + RAFT_CUDA_TRY(cudaMemPrefetchAsync_v2(dev_ptr, size, location_, 0, prefetch_stream_)); + } else { + RAFT_CUDA_TRY(cudaMemPrefetchAsync_v2(dev_ptr, size, location_, 0, prefetch_stream_)); + } +#endif + } + + void prefetch_from_host_to_device(T* dev_ptr, size_t src_row_offset, size_t num_rows) + { + raft::copy(dev_ptr, + host_view_.data_handle() + src_row_offset * host_view_.extent(1), + num_rows * host_view_.extent(1), + prefetch_stream_); + } + + void writeback_from_device_to_host(T* dev_ptr, size_t dst_row_offset, size_t num_rows) + { + raft::copy(host_view_.data_handle() + dst_row_offset * host_view_.extent(1), + 
dev_ptr, + num_rows * host_view_.extent(1), + writeback_stream_); + } + + // stream pool for local streams + std::optional> local_stream_pool_; rmm::cuda_stream_view prefetch_stream_; rmm::cuda_stream_view writeback_stream_; - bool read_only_; - bool host_writeback_; - bool evict_; - int32_t next_buffer_pos_; + + // configuration + const raft::resources& res_; + bool initialize_; // initialize the data on the device + bool host_writeback_; // write back the data to the host + bool hmm_as_managed_; // treat unregistered memory as managed memory + + // batch position information + uint64_t batch_size_; int32_t batch_id_; + uint64_t offset_; + cudaMemLocation location_; - std::optional> device_mem_[3]; + + // input pointer information + cudaMemoryType mem_type_; raft::host_matrix_view host_view_; + + // internal device buffers + uint64_t num_buffers_; + std::optional> device_mem_[3]; T* device_ptr[3]; uint32_t actual_batch_size_[3]; }; From 89b0d1c25bbff782cf906be7d9b2dc58a5927116 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 9 Mar 2026 21:57:25 +0000 Subject: [PATCH 077/119] implement fallback / simplify strategy --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 18 +-- cpp/src/neighbors/detail/cagra/utils.hpp | 110 ++++++++++-------- 2 files changed, 66 insertions(+), 62 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index ef8b1f8daf..a6e4c08350 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -794,8 +794,7 @@ void merge_graph_gpu(raft::resources const& res, raft::make_host_matrix_view(output_graph_ptr, graph_size, output_graph_degree), /*batch_size*/ batch_size, /*host_writeback*/ true, - /*initialize*/ true, - /*hmm_as_managed*/ false); + /*initialize*/ true); batched_device_view_from_host d_mst_graph( res, @@ -803,8 +802,7 @@ void merge_graph_gpu(raft::resources const& res, mst_graph_ptr, guarantee_connectivity ? 
graph_size : 0, output_graph_degree), /*batch_size*/ batch_size, /*host_writeback*/ false, - /*initialize*/ true, - /*hmm_as_managed*/ false); + /*initialize*/ true); batched_device_view_from_host d_mst_graph_num_edges( res, @@ -812,8 +810,7 @@ void merge_graph_gpu(raft::resources const& res, mst_graph_ptr, guarantee_connectivity ? graph_size : 0, output_graph_degree), /*batch_size*/ batch_size, /*host_writeback*/ false, - /*initialize*/ true, - /*hmm_as_managed*/ false); + /*initialize*/ true); const uint32_t num_warps = 4; const dim3 threads_merge(raft::WarpSize * num_warps, 1, 1); @@ -887,8 +884,7 @@ void make_reverse_graph_gpu(raft::resources const& res, raft::make_host_matrix_view(output_graph_ptr, graph_size, output_graph_degree), /*batch_size*/ batch_size, /*host_writeback*/ false, - /*initialize*/ true, - /*hmm_as_managed*/ false); + /*initialize*/ true); for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { dim3 threads(256, 1, 1); @@ -1544,8 +1540,7 @@ void prune_graph_gpu(raft::resources const& res, raft::make_host_matrix_view(knn_graph_ptr, graph_size, knn_graph_degree), /*batch_size*/ graph_size, /*host_writeback*/ false, - /*initialize*/ true, - /*hmm_as_managed*/ true); + /*initialize*/ true); auto input_view = d_input_graph.next_view(); batched_device_view_from_host d_output_graph( @@ -1553,8 +1548,7 @@ void prune_graph_gpu(raft::resources const& res, raft::make_host_matrix_view(output_graph_ptr, graph_size, output_graph_degree), /*batch_size*/ batch_size, /*host_writeback*/ true, - /*initialize*/ false, - /*hmm_as_managed*/ false); + /*initialize*/ false); auto d_invalid_neighbor_list = raft::make_device_scalar(res, 0u); diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp index df6ef1ce6f..8f6cfb063f 100644 --- a/cpp/src/neighbors/detail/cagra/utils.hpp +++ b/cpp/src/neighbors/detail/cagra/utils.hpp @@ -327,22 +327,25 @@ void copy_with_padding( * @param res The resources * @param host_view The 
host view to create the batched device view from * @param batch_size The batch size - * @param read_only Whether the data is read only (only for managed memory) * @param host_writeback Whether to write back the data to the host (only for host memory) * @param initialize Whether to initialize the data (only for managed memory) - * @param discard Whether to discard the data (only for managed memory) * * @return The batched device view */ template class batched_device_view_from_host { public: + enum class memory_strategy { + device_only, // data is on device only (no copy needed) + copy_device, // data is explicitly moved to/from device buffers + managed_only, // data is on managed memory (system managed) + }; + batched_device_view_from_host(raft::resources const& res, raft::host_matrix_view host_view, uint64_t batch_size, bool host_writeback = false, - bool initialize = true, - bool hmm_as_managed = false) + bool initialize = true) : res_(res), host_view_(host_view), batch_size_(batch_size), @@ -350,29 +353,23 @@ class batched_device_view_from_host { batch_id_(-2), num_buffers_(2), host_writeback_(host_writeback), - initialize_(initialize), - hmm_as_managed_(hmm_as_managed) + initialize_(initialize) { if (host_view.extent(0) == 0) { - mem_type_ = cudaMemoryTypeDevice; + mem_strategy_ = memory_strategy::device_only; return; } cudaPointerAttributes attr; RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle())); - mem_type_ = attr.type; - // cudaMemoryTypeUnregistered = 0 - // cudaMemoryTypeHost = 1 - // cudaMemoryTypeDevice = 2 - // cudaMemoryTypeManaged = 3 - // - // On HMM systems, unregistered (malloc) memory can have devicePointer != nullptr, - // meaning it's directly accessible from the GPU. 
Treat it like managed memory: - if (mem_type_ == cudaMemoryTypeUnregistered && attr.devicePointer != nullptr && - hmm_as_managed) { - mem_type_ = cudaMemoryTypeManaged; + switch (attr.type) { + case cudaMemoryTypeUnregistered: + case cudaMemoryTypeHost: + case cudaMemoryTypeManaged: mem_strategy_ = memory_strategy::copy_device; break; + case cudaMemoryTypeDevice: mem_strategy_ = memory_strategy::device_only; break; } + // setup streams if (res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL) && raft::resource::get_stream_pool_size(res) >= 1) { prefetch_stream_ = raft::resource::get_stream_from_stream_pool(res); @@ -383,32 +380,48 @@ class batched_device_view_from_host { writeback_stream_ = local_stream_pool_.value()->get_stream(); } - // allocations - if (mem_type_ == cudaMemoryTypeHost || mem_type_ == cudaMemoryTypeUnregistered) { - device_mem_[0].emplace(raft::make_device_mdarray( - res, - raft::resource::get_workspace_resource(res), - raft::make_extents(batch_size, host_view.extent(1)))); - device_ptr[0] = device_mem_[0]->data_handle(); - if (batch_size < static_cast(host_view.extent(0))) { - device_mem_[1].emplace(raft::make_device_mdarray( - res, - raft::resource::get_workspace_resource(res), - raft::make_extents(batch_size, host_view.extent(1)))); - device_ptr[1] = device_mem_[1]->data_handle(); - } - if (host_writeback_ && batch_size * 2 < static_cast(host_view.extent(0))) { - num_buffers_ = 3; - device_mem_[2].emplace(raft::make_device_mdarray( + // buffer allocations + if (mem_strategy_ == memory_strategy::copy_device) { + try { + device_mem_[0].emplace(raft::make_device_mdarray( res, raft::resource::get_workspace_resource(res), raft::make_extents(batch_size, host_view.extent(1)))); - device_ptr[2] = device_mem_[2]->data_handle(); + device_ptr[0] = device_mem_[0]->data_handle(); + if (batch_size < static_cast(host_view.extent(0))) { + device_mem_[1].emplace(raft::make_device_mdarray( + res, + 
raft::resource::get_workspace_resource(res), + raft::make_extents(batch_size, host_view.extent(1)))); + device_ptr[1] = device_mem_[1]->data_handle(); + } + if (host_writeback_ && batch_size * 2 < static_cast(host_view.extent(0))) { + num_buffers_ = 3; + device_mem_[2].emplace(raft::make_device_mdarray( + res, + raft::resource::get_workspace_resource(res), + raft::make_extents(batch_size, host_view.extent(1)))); + device_ptr[2] = device_mem_[2]->data_handle(); + } + } catch (std::bad_alloc& e) { + RAFT_LOG_DEBUG("Insufficient memory for device buffers"); + if (attr.devicePointer != nullptr) { + mem_strategy_ = memory_strategy::managed_only; + } else { + throw std::bad_alloc(); + } + } catch (raft::logic_error& e) { + RAFT_LOG_DEBUG("Insufficient memory for device buffers (logic error)"); + if (attr.devicePointer != nullptr) { + mem_strategy_ = memory_strategy::managed_only; + } else { + throw raft::logic_error("Insufficient memory for device buffers (logic error)"); + } } } // if data is managed and not for_write_ we can set the attribute on the device ptr - if (mem_type_ == cudaMemoryTypeManaged) { + if (mem_strategy_ == memory_strategy::managed_only) { // location_.type = CU_MEM_LOCATION_TYPE_DEVICE; location_.type = cudaMemLocationTypeDevice; location_.id = static_cast(raft::resource::get_device_id(res_)); @@ -428,7 +441,7 @@ class batched_device_view_from_host { batch_id_++; // ensure previous batch at position batch_id_ is ready - prefetch_stream_.synchronize(); + if (initialize_) { prefetch_stream_.synchronize(); } if (host_writeback_) { writeback_stream_.synchronize(); } // this step will @@ -456,8 +469,8 @@ class batched_device_view_from_host { int32_t prefetch_pos = (batch_id_ + 1) % num_buffers_; actual_batch_size_[prefetch_pos] = min(batch_size_, host_view_.extent(0) - offset_); - switch (mem_type_) { - case cudaMemoryTypeManaged: + switch (mem_strategy_) { + case memory_strategy::managed_only: if (!host_writeback_ && batch_id_ > 1) { uint32_t 
discard_pos = (batch_id_ - 1) % num_buffers_; size_t discard_size = batch_size_ * host_view_.extent(1) * sizeof(T); @@ -469,8 +482,7 @@ class batched_device_view_from_host { device_ptr[prefetch_pos], actual_batch_size_[prefetch_pos] * host_view_.extent(1) * sizeof(T)); break; - case cudaMemoryTypeHost: - case cudaMemoryTypeUnregistered: + case memory_strategy::copy_device: if (host_writeback_ && batch_id_ > 0) { // copy back last active uint32_t writeback_pos = (batch_id_ - 1) % num_buffers_; @@ -484,7 +496,7 @@ class batched_device_view_from_host { } break; - case cudaMemoryTypeDevice: + case memory_strategy::device_only: // just move pointer to next position device_ptr[prefetch_pos] = host_view_.data_handle() + offset_ * host_view_.extent(1); break; @@ -506,8 +518,8 @@ class batched_device_view_from_host { // if data is managed and evict --> evict last active // make sure to sync on prefetch stream & res - switch (mem_type_) { - case cudaMemoryTypeManaged: + switch (mem_strategy_) { + case memory_strategy::managed_only: if (!host_writeback_) { uint32_t discard_pos = batch_id_ % num_buffers_; size_t discard_size_rows = actual_batch_size_[discard_pos]; @@ -520,8 +532,7 @@ class batched_device_view_from_host { } writeback_stream_.synchronize(); break; - case cudaMemoryTypeHost: - case cudaMemoryTypeUnregistered: + case memory_strategy::copy_device: if (host_writeback_) { uint32_t writeback_pos_last = batch_id_ % num_buffers_; if (batch_id_ > 0) { @@ -538,7 +549,7 @@ class batched_device_view_from_host { } writeback_stream_.synchronize(); break; - case cudaMemoryTypeDevice: break; + case memory_strategy::device_only: break; } } @@ -630,10 +641,10 @@ class batched_device_view_from_host { rmm::cuda_stream_view writeback_stream_; // configuration + memory_strategy mem_strategy_; const raft::resources& res_; bool initialize_; // initialize the data on the device bool host_writeback_; // write back the data to the host - bool hmm_as_managed_; // treat unregistered memory 
as managed memory // batch position information uint64_t batch_size_; @@ -643,7 +654,6 @@ class batched_device_view_from_host { cudaMemLocation location_; // input pointer information - cudaMemoryType mem_type_; raft::host_matrix_view host_view_; // internal device buffers From d0e3daefdfc7fcdec3ceaaa62a8d95134a726f15 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Tue, 10 Mar 2026 17:31:23 +0000 Subject: [PATCH 078/119] add logging / remove stats compute --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 51 +++------------ cpp/src/neighbors/detail/cagra/utils.hpp | 62 ++++++++++++------- 2 files changed, 46 insertions(+), 67 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index a6e4c08350..b5e055820d 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -648,44 +648,6 @@ __global__ void kern_mst_opt_postprocessing(IdxT* outgoing_num_edges, // [graph } } -template -uint64_t pos_in_array(T val, const T* array, uint64_t num) -{ - for (uint64_t i = 0; i < num; i++) { - if (val == array[i]) { return i; } - } - return num; -} - -template -void shift_array(T* array, uint64_t num) -{ - for (uint64_t i = num; i > 0; i--) { - array[i] = array[i - 1]; - } -} - -template -void log_replaced_edges_stats(const IdxT* output_graph_ptr, - uint64_t graph_size, - uint64_t output_graph_degree) -{ - raft::common::nvtx::range block_scope( - "cagra::graph::optimize/stats"); - uint64_t num_replaced_edges = 0; -#pragma omp parallel for reduction(+ : num_replaced_edges) - for (uint64_t i = 0; i < graph_size; i++) { - for (uint64_t k = 0; k < output_graph_degree; k++) { - const uint64_t j = output_graph_ptr[k + (output_graph_degree * i)]; - const uint64_t pos = - pos_in_array(j, output_graph_ptr + (output_graph_degree * i), output_graph_degree); - if (pos == output_graph_degree) { num_replaced_edges += 1; } - } - } - RAFT_LOG_DEBUG("# Average number of 
replaced edges per node: %.2f", - (double)num_replaced_edges / graph_size); -} - template void log_incoming_edges_histogram(const IdxT* output_graph_ptr, uint64_t graph_size, @@ -755,7 +717,10 @@ void check_duplicates_and_out_of_range(const IdxT* output_graph_ptr, for (uint32_t k = j + 1; k < output_graph_degree; k++) { const auto neighbor_b = my_out_graph[k]; - if (neighbor_a == neighbor_b) { num_dup++; } + if (neighbor_a == neighbor_b) { + num_dup++; + break; + } } } } @@ -1606,10 +1571,10 @@ void prune_graph_gpu(raft::resources const& res, } // TODO allow pinned input for both knn_graph and new_graph -template +template void optimize(raft::resources const& res, - InOutMatrixView knn_graph, - InOutMatrixView new_graph, + InputMatrixView knn_graph, + OutputMatrixView new_graph, const bool guarantee_connectivity = true, const bool use_gpu = true) { @@ -1707,8 +1672,6 @@ void optimize(raft::resources const& res, if (is_ptr_host_accessible(new_graph.data_handle())) { // following checks require host access - log_replaced_edges_stats(new_graph.data_handle(), graph_size, output_graph_degree); - log_incoming_edges_histogram(new_graph.data_handle(), graph_size, output_graph_degree); check_duplicates_and_out_of_range( diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp index 8f6cfb063f..75883a9636 100644 --- a/cpp/src/neighbors/detail/cagra/utils.hpp +++ b/cpp/src/neighbors/detail/cagra/utils.hpp @@ -360,25 +360,18 @@ class batched_device_view_from_host { return; } - cudaPointerAttributes attr; - RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle())); - switch (attr.type) { + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr_, host_view.data_handle())); + switch (attr_.type) { case cudaMemoryTypeUnregistered: case cudaMemoryTypeHost: case cudaMemoryTypeManaged: mem_strategy_ = memory_strategy::copy_device; break; case cudaMemoryTypeDevice: mem_strategy_ = memory_strategy::device_only; break; } - // setup streams - 
if (res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL) && - raft::resource::get_stream_pool_size(res) >= 1) { - prefetch_stream_ = raft::resource::get_stream_from_stream_pool(res); - writeback_stream_ = raft::resource::get_stream_from_stream_pool(res); - } else { - local_stream_pool_ = std::make_shared(2); - prefetch_stream_ = local_stream_pool_.value()->get_stream(); - writeback_stream_ = local_stream_pool_.value()->get_stream(); - } + RAFT_LOG_DEBUG("Memory strategy: %d for type %d, size %zu", + static_cast(mem_strategy_), + static_cast(attr_.type), + host_view.extent(0) * host_view.extent(1) * sizeof(T)); // buffer allocations if (mem_strategy_ == memory_strategy::copy_device) { @@ -405,14 +398,14 @@ class batched_device_view_from_host { } } catch (std::bad_alloc& e) { RAFT_LOG_DEBUG("Insufficient memory for device buffers"); - if (attr.devicePointer != nullptr) { + if (attr_.devicePointer != nullptr) { mem_strategy_ = memory_strategy::managed_only; } else { throw std::bad_alloc(); } } catch (raft::logic_error& e) { RAFT_LOG_DEBUG("Insufficient memory for device buffers (logic error)"); - if (attr.devicePointer != nullptr) { + if (attr_.devicePointer != nullptr) { mem_strategy_ = memory_strategy::managed_only; } else { throw raft::logic_error("Insufficient memory for device buffers (logic error)"); @@ -420,6 +413,17 @@ class batched_device_view_from_host { } } + // setup streams + if (res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL) && + raft::resource::get_stream_pool_size(res) >= 1) { + prefetch_stream_ = raft::resource::get_stream_from_stream_pool(res); + writeback_stream_ = raft::resource::get_stream_from_stream_pool(res); + } else { + local_stream_pool_ = std::make_shared(2); + prefetch_stream_ = local_stream_pool_.value()->get_stream(); + writeback_stream_ = local_stream_pool_.value()->get_stream(); + } + // if data is managed and not for_write_ we can set the attribute on the device ptr if (mem_strategy_ == 
memory_strategy::managed_only) { // location_.type = CU_MEM_LOCATION_TYPE_DEVICE; @@ -621,18 +625,29 @@ class batched_device_view_from_host { void prefetch_from_host_to_device(T* dev_ptr, size_t src_row_offset, size_t num_rows) { - raft::copy(dev_ptr, - host_view_.data_handle() + src_row_offset * host_view_.extent(1), - num_rows * host_view_.extent(1), - prefetch_stream_); + const size_t n_elem = num_rows * host_view_.extent(1); + const size_t n_bytes = n_elem * sizeof(T); + RAFT_CUDA_TRY(cudaHostRegister(host_view_.data_handle() + src_row_offset * host_view_.extent(1), + n_bytes, + cudaHostRegisterDefault)); + // use memcpy instead of raft::copy to avoid strange behavior with HMM/ATS memory + RAFT_CUDA_TRY(cudaMemcpyAsync(dev_ptr, + host_view_.data_handle() + src_row_offset * host_view_.extent(1), + n_bytes, + cudaMemcpyHostToDevice, + prefetch_stream_)); } void writeback_from_device_to_host(T* dev_ptr, size_t dst_row_offset, size_t num_rows) { - raft::copy(host_view_.data_handle() + dst_row_offset * host_view_.extent(1), - dev_ptr, - num_rows * host_view_.extent(1), - writeback_stream_); + const size_t n_elem = num_rows * host_view_.extent(1); + const size_t n_bytes = n_elem * sizeof(T); + // use memcpy instead of raft::copy to avoid strange behavior with HMM/ATS memory + RAFT_CUDA_TRY(cudaMemcpyAsync(host_view_.data_handle() + dst_row_offset * host_view_.extent(1), + dev_ptr, + n_bytes, + cudaMemcpyDeviceToHost, + writeback_stream_)); } // stream pool for local streams @@ -655,6 +670,7 @@ class batched_device_view_from_host { // input pointer information raft::host_matrix_view host_view_; + cudaPointerAttributes attr_; // internal device buffers uint64_t num_buffers_; From ec45fd251d90cd8713c58252d8258ebee3b700a8 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Tue, 10 Mar 2026 22:46:18 +0000 Subject: [PATCH 079/119] add test, persist stream pool, cleanup --- cpp/src/neighbors/detail/cagra/utils.hpp | 214 ++++++++++-------- cpp/tests/CMakeLists.txt | 1 + 
.../test_batched_device_view_from_host.cu | 205 +++++++++++++++++ 3 files changed, 326 insertions(+), 94 deletions(-) create mode 100644 cpp/tests/neighbors/ann_cagra/test_batched_device_view_from_host.cu diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp index 75883a9636..44d87d2993 100644 --- a/cpp/src/neighbors/detail/cagra/utils.hpp +++ b/cpp/src/neighbors/detail/cagra/utils.hpp @@ -322,15 +322,22 @@ void copy_with_padding( * writeback of the data Each batch can be referenced exactlyonce by calling the next_view() * function * + * Usage: + * ``` + * batched_device_view_from_host view(res, host_view, batch_size, host_writeback, + * initialize); while (view.next_view().extent(0) > 0) { auto device_view = view.next_view(); + * // use device_view + * } + * ``` + * + * The call to next_view() will + * * synchronize on all previous operations / increments batch_id_ + * * (optionally) write back the data of the previous batch to the host + * * (optionally) prefetch the data of the next batch + * * return the view of the current batch + * * @tparam T The type of the data * @tparam IdxT The type of the index - * @param res The resources - * @param host_view The host view to create the batched device view from - * @param batch_size The batch size - * @param host_writeback Whether to write back the data to the host (only for host memory) - * @param initialize Whether to initialize the data (only for managed memory) - * - * @return The batched device view */ template class batched_device_view_from_host { @@ -341,6 +348,18 @@ class batched_device_view_from_host { managed_only, // data is on managed memory (system managed) }; + /** + * Create a batched device view from a host view and will handle the prefetch and + * writeback of the data. Each batch can be referenced exactly once by calling the next_view() + * method. 
+ * + * @param res The resources to use + * @param host_view The host view to create the batched device view from + * @param batch_size The batch size + * @param host_writeback Whether to write back the data to the host (only for host memory) + * (default: false) + * @param initialize Whether to initialize the data (only for managed memory) (default: true) + */ batched_device_view_from_host(raft::resources const& res, raft::host_matrix_view host_view, uint64_t batch_size, @@ -360,6 +379,9 @@ class batched_device_view_from_host { return; } + RAFT_EXPECTS(host_writeback_ || initialize_, + "At least one of host_writeback or initialize must be true"); + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr_, host_view.data_handle())); switch (attr_.type) { case cudaMemoryTypeUnregistered: @@ -388,7 +410,8 @@ class batched_device_view_from_host { raft::make_extents(batch_size, host_view.extent(1)))); device_ptr[1] = device_mem_[1]->data_handle(); } - if (host_writeback_ && batch_size * 2 < static_cast(host_view.extent(0))) { + if (host_writeback_ && initialize_ && + batch_size * 2 < static_cast(host_view.extent(0))) { num_buffers_ = 3; device_mem_[2].emplace(raft::make_device_mdarray( res, @@ -397,15 +420,16 @@ class batched_device_view_from_host { device_ptr[2] = device_mem_[2]->data_handle(); } } catch (std::bad_alloc& e) { - RAFT_LOG_DEBUG("Insufficient memory for device buffers"); if (attr_.devicePointer != nullptr) { + RAFT_LOG_DEBUG("Insufficient memory for device buffers, switching to managed memory"); mem_strategy_ = memory_strategy::managed_only; } else { throw std::bad_alloc(); } } catch (raft::logic_error& e) { - RAFT_LOG_DEBUG("Insufficient memory for device buffers (logic error)"); if (attr_.devicePointer != nullptr) { + RAFT_LOG_DEBUG( + "Insufficient memory for device buffers (logic error), switching to managed memory"); mem_strategy_ = memory_strategy::managed_only; } else { throw raft::logic_error("Insufficient memory for device buffers (logic error)"); @@ 
-413,20 +437,18 @@ class batched_device_view_from_host { } } - // setup streams - if (res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL) && - raft::resource::get_stream_pool_size(res) >= 1) { - prefetch_stream_ = raft::resource::get_stream_from_stream_pool(res); - writeback_stream_ = raft::resource::get_stream_from_stream_pool(res); - } else { - local_stream_pool_ = std::make_shared(2); - prefetch_stream_ = local_stream_pool_.value()->get_stream(); - writeback_stream_ = local_stream_pool_.value()->get_stream(); + // setup stream pool if not already present + size_t required_streams = host_writeback_ && initialize_ ? 2 : 1; + if (!res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL) || + raft::resource::get_stream_pool_size(res) < required_streams) { + // always create at least 2 streams to account for subsequent iterator calls + raft::resource::set_cuda_stream_pool(res, std::make_shared(2)); } + prefetch_stream_ = raft::resource::get_stream_from_stream_pool(res); + writeback_stream_ = raft::resource::get_stream_from_stream_pool(res); // if data is managed and not for_write_ we can set the attribute on the device ptr if (mem_strategy_ == memory_strategy::managed_only) { - // location_.type = CU_MEM_LOCATION_TYPE_DEVICE; location_.type = cudaMemLocationTypeDevice; location_.id = static_cast(raft::resource::get_device_id(res_)); if (!host_writeback_) { @@ -440,6 +462,84 @@ class batched_device_view_from_host { prefetch_next_batch(); } + ~batched_device_view_from_host() noexcept + { + raft::resource::sync_stream(res_); + + // if data is on host and for_write --> make sure to copy back last active + // if data is managed and evict --> evict last active + + // make sure to sync on prefetch stream & res + switch (mem_strategy_) { + case memory_strategy::managed_only: + if (!host_writeback_) { + uint32_t discard_pos = batch_id_ % num_buffers_; + size_t discard_size_rows = actual_batch_size_[discard_pos]; + if (batch_id_ > 0) { + 
discard_pos = (batch_id_ - 1) % num_buffers_; + discard_size_rows += batch_size_; + } + discard_managed_region(device_ptr[discard_pos], + discard_size_rows * host_view_.extent(1) * sizeof(T)); + writeback_stream_.synchronize(); + } + break; + case memory_strategy::copy_device: + if (host_writeback_) { + uint32_t writeback_pos_last = batch_id_ % num_buffers_; + if (batch_id_ > 0) { + uint32_t writeback_pos = (batch_id_ - 1) % num_buffers_; + uint64_t writeback_offset = (batch_id_ - 1) * batch_size_; + writeback_from_device_to_host(device_ptr[writeback_pos], writeback_offset, batch_size_); + } + { + uint64_t writeback_offset_last = batch_id_ * batch_size_; + writeback_from_device_to_host(device_ptr[writeback_pos_last], + writeback_offset_last, + actual_batch_size_[writeback_pos_last]); + } + writeback_stream_.synchronize(); + } + break; + case memory_strategy::device_only: break; + } + } + + /** + * Returns the next view of the batch + * + * This function will ensure the next batch is ready and will trigger the prefetch of the + * subsequent next batch. If writeback is enabled, the last active batch will be written back to + * the host. + * + * @return The next view of the batch + */ + raft::device_matrix_view next_view() + { + bool end_of_data = static_cast((batch_id_ + 1) * batch_size_) >= + static_cast(host_view_.extent(0)); + + // special case for empty host view or last batch surpassed + if (end_of_data) { + return raft::make_device_matrix_view(nullptr, 0, host_view_.extent(1)); + } + + // trigger prefetch of next batch (also increments batch_id_) + prefetch_next_batch(); + + uint32_t current_pos = batch_id_ % num_buffers_; + return raft::make_device_matrix_view( + device_ptr[current_pos], actual_batch_size_[current_pos], host_view_.extent(1)); + } + + private: + /** + * Prefetch the next batch + * + * This function will prefetch the next batch and will handle the writeback of the data. 
+ * + * @return True if the next batch exists, false otherwise + */ bool prefetch_next_batch() { batch_id_++; @@ -512,77 +612,6 @@ class batched_device_view_from_host { return next_batch_exists; } - ~batched_device_view_from_host() noexcept - { - prefetch_stream_.synchronize(); - writeback_stream_.synchronize(); - raft::resource::sync_stream(res_); - - // if data is on host and for_write --> make sure to copy back last active - // if data is managed and evict --> evict last active - - // make sure to sync on prefetch stream & res - switch (mem_strategy_) { - case memory_strategy::managed_only: - if (!host_writeback_) { - uint32_t discard_pos = batch_id_ % num_buffers_; - size_t discard_size_rows = actual_batch_size_[discard_pos]; - if (batch_id_ > 0) { - discard_pos = (batch_id_ - 1) % num_buffers_; - discard_size_rows += batch_size_; - } - discard_managed_region(device_ptr[discard_pos], - discard_size_rows * host_view_.extent(1) * sizeof(T)); - } - writeback_stream_.synchronize(); - break; - case memory_strategy::copy_device: - if (host_writeback_) { - uint32_t writeback_pos_last = batch_id_ % num_buffers_; - if (batch_id_ > 0) { - uint32_t writeback_pos = (batch_id_ - 1) % num_buffers_; - uint64_t writeback_offset = (batch_id_ - 1) * batch_size_; - writeback_from_device_to_host(device_ptr[writeback_pos], writeback_offset, batch_size_); - } - { - uint64_t writeback_offset_last = batch_id_ * batch_size_; - writeback_from_device_to_host(device_ptr[writeback_pos_last], - writeback_offset_last, - actual_batch_size_[writeback_pos_last]); - } - } - writeback_stream_.synchronize(); - break; - case memory_strategy::device_only: break; - } - } - - /** - * Returns the next view of the batch - * - * This function will ensure the next batch is ready and will trigger the prefetch of the - * subsequent next batch - * - * @return The next view of the batch - */ - raft::device_matrix_view next_view() - { - // special case for empty host view - if (host_view_.extent(0) == 0) { - 
return raft::make_device_matrix_view(nullptr, 0, host_view_.extent(1)); - } - - // trigger prefetch of next batch - bool next_batch_exists = prefetch_next_batch(); - - RAFT_EXPECTS(batch_id_ * batch_size_ < host_view_.extent(0), "Batch index out of bounds"); - - uint32_t current_pos = batch_id_ % num_buffers_; - return raft::make_device_matrix_view( - device_ptr[current_pos], actual_batch_size_[current_pos], host_view_.extent(1)); - } - - private: void advise_read_mostly(T* ptr, size_t size) { #if CUDA_VERSION >= 13000 @@ -627,9 +656,6 @@ class batched_device_view_from_host { { const size_t n_elem = num_rows * host_view_.extent(1); const size_t n_bytes = n_elem * sizeof(T); - RAFT_CUDA_TRY(cudaHostRegister(host_view_.data_handle() + src_row_offset * host_view_.extent(1), - n_bytes, - cudaHostRegisterDefault)); // use memcpy instead of raft::copy to avoid strange behavior with HMM/ATS memory RAFT_CUDA_TRY(cudaMemcpyAsync(dev_ptr, host_view_.data_handle() + src_row_offset * host_view_.extent(1), diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 35794adf9b..77fd18c7d3 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -173,6 +173,7 @@ ConfigureTest( ConfigureTest( NAME NEIGHBORS_ANN_CAGRA_HELPERS_TEST PATH neighbors/ann_cagra/test_optimize_uint32_t.cu + neighbors/ann_cagra/test_batched_device_view_from_host.cu GPUS 1 PERCENT 100 ) diff --git a/cpp/tests/neighbors/ann_cagra/test_batched_device_view_from_host.cu b/cpp/tests/neighbors/ann_cagra/test_batched_device_view_from_host.cu new file mode 100644 index 0000000000..1e1cc13093 --- /dev/null +++ b/cpp/tests/neighbors/ann_cagra/test_batched_device_view_from_host.cu @@ -0,0 +1,205 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "../../../src/neighbors/detail/cagra/utils.hpp" + +#include +#include +#include +#include + +namespace cuvs::neighbors::cagra { + +using IdxT = uint32_t; + +struct BatchConfig { + bool initialize; + bool host_writeback; +}; + +struct DimsConfig { + int64_t n_rows; + int64_t n_cols; + uint64_t batch_size; +}; + +class BatchedDeviceViewFromHostTest : public ::testing::Test { + protected: + void SetUp() override { raft::resource::sync_stream(res); } + + /** + * Run batched_device_view_from_host over host data, copy device views back, + * and verify against the input. + */ + template + void run_and_verify_batched(InputMatrixView input_view, + uint64_t batch_size, + bool host_writeback, + bool initialize) + { + int64_t n_rows = input_view.extent(0); + int64_t n_cols = input_view.extent(1); + + std::vector readback(n_rows * n_cols); + + int64_t total_processed = 0; + + { + cagra::detail::batched_device_view_from_host batched( + res, + raft::make_host_matrix_view(input_view.data_handle(), n_rows, n_cols), + batch_size, + host_writeback, + initialize); + while (true) { + auto dev_view = batched.next_view(); + if (dev_view.extent(0) == 0) break; + + if (initialize) { + raft::copy(readback.data() + total_processed * n_cols, + dev_view.data_handle(), + dev_view.extent(0) * dev_view.extent(1), + raft::resource::get_cuda_stream(res)); + } + if (host_writeback) { raft::matrix::fill(res, dev_view, IdxT(17)); } + total_processed += dev_view.extent(0); + } + } + raft::resource::sync_stream(res); + + EXPECT_EQ(total_processed, n_rows); + if (initialize) { + for (int64_t i = 0; i < n_rows * n_cols; ++i) { + EXPECT_EQ(readback[i], IdxT(13)) << "Mismatch (initialize) at index " << i; + } + } + if (host_writeback) { + auto readback_view = + 
raft::make_host_matrix_view(readback.data(), n_rows, n_cols); + raft::copy(res, readback_view, input_view); + raft::resource::sync_stream(res); + for (int64_t i = 0; i < n_rows * n_cols; ++i) { + EXPECT_EQ(readback[i], IdxT(17)) << "Mismatch (host_writeback) at index " << i; + } + } + } + + raft::resources res; +}; + +TEST_F(BatchedDeviceViewFromHostTest, EmptyView) +{ + auto host_empty = raft::make_host_matrix(0, 8); + auto host_view = host_empty.view(); + cagra::detail::batched_device_view_from_host batched( + res, host_view, /*batch_size=*/128, /*host_writeback=*/false, /*initialize=*/true); + + auto view = batched.next_view(); + EXPECT_EQ(view.extent(0), 0); + EXPECT_EQ(view.extent(1), 8); + EXPECT_EQ(view.data_handle(), nullptr); +} + +using BatchDimsParam = std::tuple; + +class BatchedDeviceViewFromHostParameterizedTest + : public BatchedDeviceViewFromHostTest, + public ::testing::WithParamInterface {}; + +TEST_P(BatchedDeviceViewFromHostParameterizedTest, VectorHostData) +{ + auto [batch_config, dims_config] = GetParam(); + auto [initialize, host_writeback] = batch_config; + auto [n_rows, n_cols, batch_size] = dims_config; + + std::vector host_data(n_rows * n_cols); + auto host_view = raft::make_host_matrix_view(host_data.data(), n_rows, n_cols); + + std::fill(host_view.data_handle(), host_view.data_handle() + n_rows * n_cols, IdxT(13)); + + run_and_verify_batched(host_view, batch_size, host_writeback, initialize); +} + +TEST_P(BatchedDeviceViewFromHostParameterizedTest, PinnedMemory) +{ + auto [batch_config, dims_config] = GetParam(); + auto [initialize, host_writeback] = batch_config; + auto [n_rows, n_cols, batch_size] = dims_config; + + auto host_matrix = raft::make_pinned_matrix(res, n_rows, n_cols); + auto host_view = host_matrix.view(); + + std::fill(host_view.data_handle(), host_view.data_handle() + n_rows * n_cols, IdxT(13)); + + run_and_verify_batched(host_view, batch_size, host_writeback, initialize); +} + 
+TEST_P(BatchedDeviceViewFromHostParameterizedTest, ManagedMemory) +{ + auto [batch_config, dims_config] = GetParam(); + auto [initialize, host_writeback] = batch_config; + auto [n_rows, n_cols, batch_size] = dims_config; + + auto host_matrix = raft::make_managed_matrix(res, n_rows, n_cols); + auto host_view = host_matrix.view(); + + std::fill(host_view.data_handle(), host_view.data_handle() + n_rows * n_cols, IdxT(13)); + + run_and_verify_batched(host_view, batch_size, host_writeback, initialize); +} + +TEST_P(BatchedDeviceViewFromHostParameterizedTest, DeviceMemory) +{ + auto [batch_config, dims_config] = GetParam(); + auto [initialize, host_writeback] = batch_config; + auto [n_rows, n_cols, batch_size] = dims_config; + + auto host_matrix = raft::make_device_matrix(res, n_rows, n_cols); + auto host_view = host_matrix.view(); + + raft::matrix::fill(res, host_view, IdxT(13)); + + run_and_verify_batched(host_view, batch_size, host_writeback, initialize); +} + +static const std::array kBatchConfigs = {{ + {/*initialize=*/true, /*host_writeback=*/false}, + {/*initialize=*/false, /*host_writeback=*/true}, + {/*initialize=*/true, /*host_writeback=*/true}, +}}; + +static const std::array kDimsConfigs = {{ + {/*n_rows=*/64, /*n_cols=*/32, /*batch_size=*/256}, // rows less than batch size, single batch + {/*n_rows=*/64, /*n_cols=*/32, /*batch_size=*/64}, // single batch + {/*n_rows=*/256, /*n_cols=*/32, /*batch_size=*/32}, // multiple batches + {/*n_rows=*/500, + /*n_cols=*/32, + /*batch_size=*/128}, // multiple batches, partial batch in the end +}}; + +INSTANTIATE_TEST_SUITE_P(BatchConfigs, + BatchedDeviceViewFromHostParameterizedTest, + ::testing::Combine(::testing::ValuesIn(kBatchConfigs), + ::testing::ValuesIn(kDimsConfigs))); + +} // namespace cuvs::neighbors::cagra From c412138a0dd6e3b81fa9bc4e10a1b546d71c5476 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 11 Mar 2026 00:04:52 +0000 Subject: [PATCH 080/119] switch to cooperative groups as __reduce_min_sync 
causes issues --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index b5e055820d..2444350253 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -29,11 +29,16 @@ #include #include +#include +#include + #include #include #include #include +namespace cg = cooperative_groups; + namespace cuvs::neighbors::cagra::detail::graph { // unnamed namespace to avoid multiple definition error @@ -196,6 +201,9 @@ __global__ void kern_fused_prune(const IdxT* const knn_graph, // [graph_chunk_ { extern __shared__ unsigned char smem_buf[]; + cg::thread_block block = cg::this_thread_block(); + cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block); + const uint32_t wid = threadIdx.x / raft::WarpSize; const uint32_t lane_id = threadIdx.x % raft::WarpSize; @@ -207,8 +215,7 @@ __global__ void kern_fused_prune(const IdxT* const knn_graph, // [graph_chunk_ uint64_t* const num_retain = stats; uint64_t* const num_full = stats + 1; - const unsigned warp_mask = 0xffffffff; - const uint32_t maxval16 = 0x0000ffff; + const uint32_t maxval16 = 0x0000ffff; const uint64_t nid_batch = blockIdx.x * num_warps + wid; const uint64_t nid = nid_batch + (batch_size * batch_id); @@ -255,11 +262,7 @@ __global__ void kern_fused_prune(const IdxT* const knn_graph, // [graph_chunk_ __syncwarp(); - num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 1); - num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 2); - num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 4); - num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 8); - num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, 16); + num_edges_no_detour = cg::reduce(warp, num_edges_no_detour, cg::plus()); 
num_edges_no_detour = min(num_edges_no_detour, output_graph_degree); if (lane_id == 0) { @@ -280,7 +283,7 @@ __global__ void kern_fused_prune(const IdxT* const knn_graph, // [graph_chunk_ } uint32_t local_min_with_tag = (local_min << 16) | ((uint32_t)local_idx); - uint32_t warp_min_with_tag = __reduce_min_sync(warp_mask, local_min_with_tag); + uint32_t warp_min_with_tag = cg::reduce(warp, local_min_with_tag, cg::less()); uint32_t warp_min_count = warp_min_with_tag >> 16; uint32_t warp_local_idx = warp_min_with_tag & 0xffff; @@ -294,7 +297,7 @@ __global__ void kern_fused_prune(const IdxT* const knn_graph, // [graph_chunk_ for (uint32_t k = lane_id; k < knn_graph_degree; k += raft::WarpSize) { if (smem_indices[k] == selected_node) { smem_num_detour[k] = maxval16; } } - __syncwarp(warp_mask); + __syncwarp(); if (lane_id == 0) { output_graph_ptr[nid_batch * output_graph_degree + i] = selected_node; } } @@ -312,7 +315,10 @@ __device__ unsigned int warp_pos_in_array(T val, const T* array, uint64_t num) break; } } - ret = __reduce_min_sync(0xffffffff, ret); + + cg::thread_block block = cg::this_thread_block(); + cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block); + ret = cg::reduce(warp, ret, cg::less()); return ret; } From ab01bab594e4337a9b6530a686e0d8642ce61866 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 13 Mar 2026 18:43:55 +0000 Subject: [PATCH 081/119] back to column wise reverse graph creation to boost closer connections --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 124 +++++++++++------- 1 file changed, 78 insertions(+), 46 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 88c13c139e..5d43da851b 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -171,24 +171,38 @@ __global__ void kern_sort(const DATA_T* const dataset, // [dataset_chunk_size, } } +template +__global__ void 
kern_make_rev_graph(const IdxT* const dest_nodes, // [graph_size] + IdxT* const rev_graph, // [size, degree] + uint32_t* const rev_graph_count, // [graph_size] + const uint32_t graph_size, + const uint32_t degree) +{ + const uint32_t tid = threadIdx.x + (blockDim.x * blockIdx.x); + const uint32_t tnum = blockDim.x * gridDim.x; + + for (uint32_t src_id = tid; src_id < graph_size; src_id += tnum) { + const IdxT dest_id = dest_nodes[src_id]; + if (dest_id >= graph_size) continue; + + const uint32_t pos = atomicAdd(rev_graph_count + dest_id, 1); + if (pos < degree) { rev_graph[pos + ((uint64_t)degree * dest_id)] = src_id; } + } +} + template -__global__ void kern_rev_graph_batched(const IdxT* const dest_nodes, // [batch_size, degree] - IdxT* const rev_graph, // [graph_size, degree] - uint32_t* const rev_graph_count, // [graph_size] - const uint32_t graph_size, - const uint32_t degree, - const uint32_t batch_size, - const uint32_t batch_id) +__global__ void kern_make_rev_graph_k(const IdxT* const output_graph, // [graph_size, degree] + IdxT* const rev_graph, // [graph_size, degree] + uint32_t* const rev_graph_count, // [graph_size] + const uint32_t graph_size, + const uint32_t degree, + uint64_t k) { const uint64_t tid = threadIdx.x + (blockDim.x * blockIdx.x); const uint64_t tnum = blockDim.x * gridDim.x; - const uint64_t block_batch_size = min(batch_size, graph_size - batch_id * batch_size); - - for (uint64_t idx = tid; idx < block_batch_size * degree; idx += tnum) { - const IdxT dest_id = dest_nodes[idx]; - const uint32_t src_id = idx / degree; - + for (uint64_t src_id = tid; src_id < graph_size; src_id += tnum) { + IdxT dest_id = output_graph[k + (degree * src_id)]; if (dest_id >= graph_size) continue; const uint32_t pos = atomicAdd(rev_graph_count + dest_id, 1); @@ -840,50 +854,67 @@ void make_reverse_graph_gpu(raft::resources const& res, uint64_t output_graph_degree) { raft::common::nvtx::range block_scope( - "cagra::graph::optimize/reverse"); + 
"cagra::graph::optimize/reverse2"); - auto default_ws_mr = raft::resource::get_workspace_resource(res); + auto d_rev_graph = + raft::make_device_vector_view(d_rev_graph_ptr, graph_size * output_graph_degree); + auto d_rev_graph_count = + raft::make_device_vector_view(d_rev_graph_count_ptr, graph_size); - raft::matrix::fill( - res, - raft::make_device_vector_view(d_rev_graph_ptr, graph_size * output_graph_degree), - IdxT(-1)); + // + // Make reverse graph + // + const double time_make_start = cur_time(); - raft::matrix::fill( - res, - raft::make_device_vector_view(d_rev_graph_count_ptr, graph_size), - uint32_t(0)); + raft::matrix::fill(res, d_rev_graph, IdxT(-1)); + raft::matrix::fill(res, d_rev_graph_count, uint32_t(0)); - const uint32_t batch_size = - std::min(static_cast(graph_size), static_cast(256 * 1024)); - const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size; + if (is_ptr_host_accessible(output_graph_ptr)) { + auto d_dest_nodes = + raft::make_device_mdarray(res, raft::make_extents(graph_size)); - batched_device_view_from_host d_output_graph( - res, - raft::make_host_matrix_view(output_graph_ptr, graph_size, output_graph_degree), - /*batch_size*/ batch_size, - /*host_writeback*/ false, - /*initialize*/ true); - - for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { + for (uint64_t k = 0; k < output_graph_degree; k++) { + RAFT_CUDA_TRY(cudaMemcpy2DAsync(d_dest_nodes.data_handle(), + sizeof(IdxT), + output_graph_ptr + k, + output_graph_degree * sizeof(IdxT), + 1 * sizeof(IdxT), + graph_size, + cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(res))); + + dim3 threads(256, 1, 1); + dim3 blocks(1024, 1, 1); + kern_make_rev_graph<<>>( + d_dest_nodes.data_handle(), + d_rev_graph.data_handle(), + d_rev_graph_count.data_handle(), + graph_size, + output_graph_degree); + RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u \r", k, output_graph_degree); + } + } else { + // output graph is fully device accessible, so we need no copy 
to device dim3 threads(256, 1, 1); dim3 blocks(1024, 1, 1); - auto output_view = d_output_graph.next_view(); - - kern_rev_graph_batched<<>>( - output_view.data_handle(), - d_rev_graph_ptr, - d_rev_graph_count_ptr, - static_cast(graph_size), - static_cast(output_graph_degree), - static_cast(batch_size), - static_cast(i_batch)); + for (uint64_t k = 0; k < output_graph_degree; k++) { + kern_make_rev_graph_k<<>>( + output_graph_ptr, + d_rev_graph.data_handle(), + d_rev_graph_count.data_handle(), + graph_size, + output_graph_degree, + k); + } } raft::resource::sync_stream(res); RAFT_LOG_DEBUG("\n"); + + const double time_make_end = cur_time(); + RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms", + (time_make_end - time_make_start) * 1000.0); } -} // namespace template void optimize(raft::resources const& res, InputMatrixView knn_graph, From 68f78839a5437d48f54d876b484efded20e8448d Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 13 Mar 2026 20:00:55 +0000 Subject: [PATCH 082/119] fix signness --- cpp/src/neighbors/detail/cagra/utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp index 44d87d2993..7dae487863 100644 --- a/cpp/src/neighbors/detail/cagra/utils.hpp +++ b/cpp/src/neighbors/detail/cagra/utils.hpp @@ -561,7 +561,7 @@ class batched_device_view_from_host { // if data is on device only this is almost a noop, just prepping the pointers - RAFT_EXPECTS(offset_ <= host_view_.extent(0), "Offset out of bounds"); + RAFT_EXPECTS(static_cast(offset_) <= host_view_.extent(0), "Offset out of bounds"); bool next_batch_exists = offset_ < static_cast(host_view_.extent(0)); From add206a7697aaf019543a43e39f763858992c5a2 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Fri, 13 Mar 2026 22:51:05 +0000 Subject: [PATCH 083/119] stupid me trusting cursor to fix this --- cpp/src/neighbors/detail/cagra/utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp index 7dae487863..79d1ed1cae 100644 --- a/cpp/src/neighbors/detail/cagra/utils.hpp +++ b/cpp/src/neighbors/detail/cagra/utils.hpp @@ -561,7 +561,7 @@ class batched_device_view_from_host { // if data is on device only this is almost a noop, just prepping the pointers - RAFT_EXPECTS(static_cast(offset_) <= host_view_.extent(0), "Offset out of bounds"); + RAFT_EXPECTS(static_cast(offset_) <= host_view_.extent(0), "Offset out of bounds"); bool next_batch_exists = offset_ < static_cast(host_view_.extent(0)); From ab21766205138c20603ca1abfe7832f20bb37cda Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Mon, 16 Mar 2026 08:23:42 -0700 Subject: [PATCH 084/119] leftover files --- .gitignore | 3 +++ cpp/src/neighbors/detail/cagra/cagra_build.cuh | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 317c28d997..0066d2b89a 100644 --- a/.gitignore +++ b/.gitignore @@ -88,5 +88,8 @@ ivf_pq_index /datasets/ /*.json +# clangd +*/.clangd + # java .classpath diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 58c48e4023..31797acb5b 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -2254,7 +2254,7 @@ auto iterative_build_graph( idx_opt->update_dataset( res, // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later - cuvs::neighbors::vpq_build( + cuvs::preprocessing::quantize::pq::vpq_build( res, *params.compression, dev_dataset)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_ms = std::chrono::duration_cast(end - start).count(); From 6f450cdd066dc71ec44da92304915fc182e154d5 Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Mon, 16 Mar 2026 09:44:03 -0700 Subject: [PATCH 085/119] Revert graph_core.cuh to merge base before merging PR 1830 Discarding local GPU reverse graph 
/ edge selection changes in graph_core.cuh to cleanly accept upstream PR 1830 optimizations. Made-with: Cursor --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 341 +++++------------- 1 file changed, 92 insertions(+), 249 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index b088c41e49..d94e279829 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -31,7 +31,6 @@ #include #include #include -#include #include namespace cuvs::neighbors::cagra::detail::graph { @@ -250,67 +249,6 @@ __global__ void kern_make_rev_graph(const IdxT* const dest_nodes, // [graph_ } } -template -__global__ void kern_extract_column(const IdxT* const d_matrix, - IdxT* const d_column, - const uint32_t n_rows, - const uint32_t n_cols, - const uint32_t col_idx) -{ - const uint32_t tid = threadIdx.x + (blockDim.x * blockIdx.x); - const uint32_t tnum = blockDim.x * gridDim.x; - for (uint32_t i = tid; i < n_rows; i += tnum) { - d_column[i] = d_matrix[col_idx + (static_cast(n_cols) * i)]; - } -} - -template -__global__ void kern_select_edges(const uint8_t* const d_detour_count, - const IdxT* const d_knn_graph, - IdxT* const d_output_graph, - const uint32_t graph_size, - const uint32_t knn_graph_degree, - const uint32_t output_graph_degree, - uint32_t* const d_invalid_count) -{ - const uint64_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; - if (i >= graph_size) return; - - const uint8_t* my_detour = d_detour_count + i * knn_graph_degree; - const IdxT* my_knn = d_knn_graph + i * knn_graph_degree; - IdxT* my_output = d_output_graph + i * output_graph_degree; - - uint32_t pk = 0; - uint32_t num_detour = 0; - for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) { - uint32_t next_num_detour = 0xFFFFFFFFu; - for (uint32_t k = 0; k < knn_graph_degree; k++) { - const uint32_t d = my_detour[k]; - if (d > num_detour) { next_num_detour = 
min(next_num_detour, d); } - if (d != num_detour) { continue; } - - const IdxT candidate = my_knn[k]; - bool dup = false; - for (uint32_t dk = 0; dk < pk; dk++) { - if (candidate == my_output[dk]) { - dup = true; - break; - } - } - if (!dup && candidate < static_cast(graph_size)) { - my_output[pk] = candidate; - pk++; - } - if (pk >= output_graph_degree) break; - } - if (pk >= output_graph_degree) break; - if (next_num_detour == 0xFFFFFFFFu) break; - num_detour = next_num_detour; - } - - if (pk != output_graph_degree) { atomicAdd(d_invalid_count, 1); } -} - template __device__ __host__ LabelT get_root_label(IdxT i, const LabelT* label) { @@ -1214,8 +1152,7 @@ void optimize( raft::mdspan, raft::row_major, g_accessor> knn_graph, raft::host_matrix_view new_graph, const bool guarantee_connectivity = true, - const bool use_gpu = true, - const IdxT* d_knn_graph_ptr = nullptr) + const bool use_gpu = true) { RAFT_LOG_DEBUG( "# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1)); @@ -1257,10 +1194,6 @@ void optimize( } } - // Device pruned graph: populated by GPU edge selection, reused by GPU reverse graph. - auto d_pruned_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(0, 0)); - { raft::common::nvtx::range block_scope( "cagra::graph::optimize/prune"); @@ -1324,18 +1257,11 @@ void optimize( RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r"); - // Use device knn_graph directly if provided; otherwise copy from host. 
- std::optional> d_input_graph_copy; - const IdxT* d_input_graph_handle; - if (d_knn_graph_ptr != nullptr) { - d_input_graph_handle = d_knn_graph_ptr; - } else { - d_input_graph_copy.emplace( - res, - raft::make_host_matrix_view( - knn_graph.data_handle(), graph_size, knn_graph_degree)); - d_input_graph_handle = d_input_graph_copy->data_handle(); - } + // Copy knn_graph over to device if necessary + device_matrix_view_from_host d_input_graph( + res, + raft::make_host_matrix_view( + knn_graph.data_handle(), graph_size, knn_graph_degree)); constexpr int MAX_DEGREE = 1024; if (knn_graph_degree > MAX_DEGREE) { @@ -1356,7 +1282,7 @@ void optimize( for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { kern_prune <<>>( - d_input_graph_handle, + d_input_graph.data_handle(), graph_size, knn_graph_degree, output_graph_degree, @@ -1373,6 +1299,8 @@ void optimize( raft::resource::sync_stream(res); RAFT_LOG_DEBUG("\n"); + raft::copy(res, detour_count.view(), raft::make_const_mdspan(d_detour_count.view())); + raft::copy(res, host_stats.view(), raft::make_const_mdspan(dev_stats.view())); num_keep = host_stats.data_handle()[0]; num_full = host_stats.data_handle()[1]; @@ -1386,45 +1314,6 @@ void optimize( (double)num_keep / graph_size, output_graph_degree, (double)num_full / graph_size * 100); - - // GPU edge selection: pick output_graph_degree edges per node with lowest detour counts. 
- d_pruned_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); - { - raft::common::nvtx::range select_scope( - "cagra::graph::optimize/prune/edge-selection-by-GPU"); - auto d_invalid_count = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(1)); - raft::matrix::fill(res, d_invalid_count.view(), uint32_t(0)); - - const uint32_t select_threads = 256; - const uint32_t select_blocks = (graph_size + select_threads - 1) / select_threads; - kern_select_edges - <<>>( - d_detour_count.data_handle(), - d_input_graph_handle, - d_pruned_graph.data_handle(), - graph_size, - knn_graph_degree, - output_graph_degree, - d_invalid_count.data_handle()); - raft::resource::sync_stream(res); - - auto h_invalid_count = raft::make_host_vector(1); - raft::copy(res, h_invalid_count.view(), raft::make_const_mdspan(d_invalid_count.view())); - raft::resource::sync_stream(res); - RAFT_EXPECTS( - h_invalid_count.data_handle()[0] == 0, - "Could not generate an intermediate CAGRA graph because the initial kNN graph " - "contains too many invalid or duplicated neighbor nodes. 
(%u nodes failed)", - h_invalid_count.data_handle()[0]); - - raft::copy(output_graph_ptr, - d_pruned_graph.data_handle(), - graph_size * output_graph_degree, - raft::resource::get_cuda_stream(res)); - raft::resource::sync_stream(res); - } } else { // Count 2-hop detours on CPU raft::common::nvtx::range block_scope( @@ -1436,66 +1325,66 @@ void optimize( const double time_2hop_count_end = cur_time(); RAFT_LOG_DEBUG("# Time for 2-hop detour counting on CPU: %.1lf sec", time_2hop_count_end - time_2hop_count_start); + } - // Create pruned kNN graph - bool invalid_neighbor_list = false; + // Create pruned kNN graph + bool invalid_neighbor_list = false; #pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable - // count of the neighbors while increasing the target detourable count from zero. - uint64_t pk = 0; - uint32_t num_detour = 0; - for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) { - uint32_t next_num_detour = std::numeric_limits::max(); - for (uint64_t k = 0; k < knn_graph_degree; k++) { - const auto num_detour_k = detour_count(i, k); - // Find the detourable count to check in the next iteration - if (num_detour_k > num_detour) { - next_num_detour = std::min(static_cast(num_detour_k), next_num_detour); - } + for (uint64_t i = 0; i < graph_size; i++) { + // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable + // count of the neighbors while increasing the target detourable count from zero. 
+ uint64_t pk = 0; + uint32_t num_detour = 0; + for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) { + uint32_t next_num_detour = std::numeric_limits::max(); + for (uint64_t k = 0; k < knn_graph_degree; k++) { + const auto num_detour_k = detour_count(i, k); + // Find the detourable count to check in the next iteration + if (num_detour_k > num_detour) { + next_num_detour = std::min(static_cast(num_detour_k), next_num_detour); + } - // Store the neighbor index if its detourable count is equal to `num_detour`. - if (num_detour_k != num_detour) { continue; } - - // Check duplication and append - const auto candidate_node = knn_graph(i, k); - bool dup = false; - for (uint32_t dk = 0; dk < pk; dk++) { - if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) { - dup = true; - break; - } - } - if (!dup && candidate_node < graph_size) { - output_graph_ptr[i * output_graph_degree + pk] = candidate_node; - pk += 1; + // Store the neighbor index if its detourable count is equal to `num_detour`. + if (num_detour_k != num_detour) { continue; } + + // Check duplication and append + const auto candidate_node = knn_graph(i, k); + bool dup = false; + for (uint32_t dk = 0; dk < pk; dk++) { + if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) { + dup = true; + break; } - if (pk >= output_graph_degree) break; } - if (pk >= output_graph_degree) break; - - if (next_num_detour == std::numeric_limits::max()) { - // There are no valid edges enough in the initial kNN graph. Break the loop here and - // catch the error at the next validation (pk != output_graph_degree). 
- break; + if (!dup && candidate_node < graph_size) { + output_graph_ptr[i * output_graph_degree + pk] = candidate_node; + pk += 1; } - num_detour = next_num_detour; + if (pk >= output_graph_degree) break; } - if (pk != output_graph_degree) { - RAFT_LOG_DEBUG( - "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for " - "node %lu in the rank-based node reranking process", - output_graph_degree, - i); - invalid_neighbor_list = true; + if (pk >= output_graph_degree) break; + + if (next_num_detour == std::numeric_limits::max()) { + // There are no valid edges enough in the initial kNN graph. Break the loop here and catch + // the error at the next validation (pk != output_graph_degree). + break; } + num_detour = next_num_detour; + } + if (pk != output_graph_degree) { + RAFT_LOG_DEBUG( + "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for " + "node %lu in the rank-based node reranking process", + output_graph_degree, + i); + invalid_neighbor_list = true; } - RAFT_EXPECTS( - !invalid_neighbor_list, - "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too " - "many invalid or duplicated neighbor nodes. This error can occur, for example, if too many " - "overflows occur during the norm computation between the dataset vectors."); } + RAFT_EXPECTS( + !invalid_neighbor_list, + "Could not generate an intermediate CAGRA graph because the initial kNN graph contains too " + "many invalid or duplicated neighbor nodes. 
This error can occur, for example, if too many " + "overflows occur during the norm computation between the dataset vectors."); const double time_prune_end = cur_time(); RAFT_LOG_DEBUG("# Pruning time: %.1lf ms", (time_prune_end - time_prune_start) * 1000.0); @@ -1512,94 +1401,48 @@ void optimize( // const double time_make_start = cur_time(); - if (d_pruned_graph.extent(0) > 0) { - // GPU path: d_pruned_graph is on device; extract columns on device to preserve - // column-priority ordering (earlier columns get priority in the reverse graph). - auto d_rev_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); - raft::matrix::fill(res, - raft::make_device_vector_view( - d_rev_graph.data_handle(), graph_size * output_graph_degree), - IdxT(-1)); - - auto d_rev_graph_count = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size)); - raft::matrix::fill(res, d_rev_graph_count.view(), uint32_t(0)); + device_matrix_view_from_host d_rev_graph(res, rev_graph.view()); + raft::matrix::fill(res, + raft::make_device_vector_view( + d_rev_graph.data_handle(), graph_size * output_graph_degree), + IdxT(-1)); - auto d_dest_nodes = - raft::make_device_mdarray(res, large_tmp_mr, raft::make_extents(graph_size)); + auto d_rev_graph_count = raft::make_device_mdarray( + res, large_tmp_mr, raft::make_extents(graph_size)); + raft::matrix::fill(res, d_rev_graph_count.view(), uint32_t(0)); - for (uint64_t k = 0; k < output_graph_degree; k++) { - dim3 ext_threads(256, 1, 1); - dim3 ext_blocks(std::min(static_cast((graph_size + 255) / 256), 65535u), 1, 1); - kern_extract_column - <<>>( - d_pruned_graph.data_handle(), - d_dest_nodes.data_handle(), - graph_size, - output_graph_degree, - k); - - dim3 threads(256, 1, 1); - dim3 blocks(1024, 1, 1); - kern_make_rev_graph<<>>( - d_dest_nodes.data_handle(), - d_rev_graph.data_handle(), - d_rev_graph_count.data_handle(), - graph_size, - output_graph_degree); + auto dest_nodes = 
raft::make_host_vector(graph_size); + auto d_dest_nodes = + raft::make_device_mdarray(res, large_tmp_mr, raft::make_extents(graph_size)); + + for (uint64_t k = 0; k < output_graph_degree; k++) { +#pragma omp parallel for + for (uint64_t i = 0; i < graph_size; i++) { + // dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)]; + dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)]; } raft::resource::sync_stream(res); - d_pruned_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(0, 0)); - - raft::copy(res, rev_graph.view(), raft::make_const_mdspan(d_rev_graph.view())); - raft::copy(res, rev_graph_count.view(), raft::make_const_mdspan(d_rev_graph_count.view())); - } else { - // CPU fallback: per-column H-to-D copy approach. - device_matrix_view_from_host d_rev_graph(res, rev_graph.view()); - raft::matrix::fill(res, - raft::make_device_vector_view( - d_rev_graph.data_handle(), graph_size * output_graph_degree), - IdxT(-1)); - - auto d_rev_graph_count = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size)); - raft::matrix::fill(res, d_rev_graph_count.view(), uint32_t(0)); - - auto dest_nodes = raft::make_host_vector(graph_size); - auto d_dest_nodes = - raft::make_device_mdarray(res, large_tmp_mr, raft::make_extents(graph_size)); - - for (uint64_t k = 0; k < output_graph_degree; k++) { -#pragma omp parallel for - for (uint64_t i = 0; i < graph_size; i++) { - dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)]; - } - raft::resource::sync_stream(res); + raft::copy(res, d_dest_nodes.view(), raft::make_const_mdspan(dest_nodes.view())); - raft::copy(res, d_dest_nodes.view(), raft::make_const_mdspan(dest_nodes.view())); - - dim3 threads(256, 1, 1); - dim3 blocks(1024, 1, 1); - kern_make_rev_graph<<>>( - d_dest_nodes.data_handle(), - d_rev_graph.data_handle(), - d_rev_graph_count.data_handle(), - graph_size, - output_graph_degree); - RAFT_LOG_DEBUG("# Making reverse graph on GPUs: 
%lu / %u \r", k, output_graph_degree); - } + dim3 threads(256, 1, 1); + dim3 blocks(1024, 1, 1); + kern_make_rev_graph<<>>( + d_dest_nodes.data_handle(), + d_rev_graph.data_handle(), + d_rev_graph_count.data_handle(), + graph_size, + output_graph_degree); + RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u \r", k, output_graph_degree); + } - raft::resource::sync_stream(res); - RAFT_LOG_DEBUG("\n"); + raft::resource::sync_stream(res); + RAFT_LOG_DEBUG("\n"); - if (d_rev_graph.allocated_memory()) { - raft::copy(res, rev_graph.view(), raft::make_const_mdspan(d_rev_graph.view())); - } - raft::copy(res, rev_graph_count.view(), raft::make_const_mdspan(d_rev_graph_count.view())); + if (d_rev_graph.allocated_memory()) { + raft::copy(res, rev_graph.view(), raft::make_const_mdspan(d_rev_graph.view())); } + raft::copy(res, rev_graph_count.view(), raft::make_const_mdspan(d_rev_graph_count.view())); const double time_make_end = cur_time(); RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms", From d2195fd5fb20209205a46ea17e3c35349450875f Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Mon, 16 Mar 2026 10:18:26 -0700 Subject: [PATCH 086/119] older api artefact --- .../neighbors/detail/cagra/cagra_build.cuh | 27 +++---------------- 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 5931bfa156..690a706619 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1903,8 +1903,7 @@ void optimize( raft::resources const& res, raft::mdspan, raft::row_major, g_accessor> knn_graph, raft::host_matrix_view new_graph, - const bool guarantee_connectivity = false, - const IdxT* d_knn_graph_ptr = nullptr) + const bool guarantee_connectivity = false) { using internal_IdxT = typename std::make_unsigned::type; @@ -1922,12 +1921,7 @@ void optimize( knn_graph.extent(1)); cagra::detail::graph::optimize( - res, - 
knn_graph_internal, - new_graph_internal, - guarantee_connectivity, - true, - reinterpret_cast(d_knn_graph_ptr)); + res, knn_graph_internal, new_graph_internal, guarantee_connectivity); } // RAII wrapper for allocating memory with Transparent HugePage @@ -2093,12 +2087,6 @@ void search_and_optimize(raft::resources const& res, { auto stream = raft::resource::get_cuda_stream(res); - // Accumulate search results on device to avoid D-to-H + H-to-D round-trip. - auto dev_knn_graph = - raft::make_device_matrix(res, curr_query_size, curr_topk); - - // Search in batches, accumulate results on both device and host. - // Host copy is needed by optimize Phase 3 (edge selection) which currently runs on CPU. cuvs::spatial::knn::detail::utils::batch_load_iterator query_batch( dev_query_view.data_handle(), curr_query_size, @@ -2121,28 +2109,19 @@ void search_and_optimize(raft::resources const& res, batch_dev_neighbors_view, batch_dev_distances_view); - // D-to-D: accumulate into device knn_graph - raft::copy(dev_knn_graph.data_handle() + batch.offset() * curr_topk, - batch_dev_neighbors_view.data_handle(), - batch.size() * curr_topk, - stream); - - // D-to-H: still needed for optimize Phase 3 (host edge selection) raft::copy(neighbors_view.data_handle() + batch.offset() * curr_topk, batch_dev_neighbors_view.data_handle(), batch.size() * curr_topk, stream); } - // Optimize graph, passing device knn_graph to skip H-to-D copy inside optimize Phase 2. auto next_graph_size = curr_query_size; cagra_graph = raft::make_host_matrix(0, 0); cagra_graph = raft::make_host_matrix(next_graph_size, next_graph_degree); optimize(res, neighbors_view, cagra_graph.view(), - flag_last ? params.guarantee_connectivity : 0, - dev_knn_graph.data_handle()); + flag_last ? 
params.guarantee_connectivity : false); } template Date: Tue, 17 Mar 2026 04:33:55 -0700 Subject: [PATCH 087/119] put all optimize() steps onto device, no more extra copies d->h; also no connectivity guarantee --- .../neighbors/detail/cagra/cagra_build.cuh | 147 ++++-------------- cpp/src/neighbors/detail/cagra/graph_core.cuh | 2 +- 2 files changed, 35 insertions(+), 114 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 690a706619..8c1fb2b0d7 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -2022,53 +2022,6 @@ void reconstruct_vpq_queries(raft::resources const& res, output.data_handle()); } -template -void search_to_device_graph(raft::resources const& res, - const cuvs::neighbors::cagra::search_params& search_params, - const index& idx, - raft::device_matrix_view dev_query_view, - raft::device_matrix_view dev_neighbors, - raft::device_matrix_view dev_distances, - raft::device_matrix_view dev_graph_output, - size_t curr_query_size, - size_t next_graph_degree, - size_t curr_topk, - uint64_t max_chunk_size) -{ - cuvs::spatial::knn::detail::utils::batch_load_iterator query_batch( - dev_query_view.data_handle(), - curr_query_size, - dev_query_view.extent(1), - max_chunk_size, - raft::resource::get_cuda_stream(res), - raft::resource::get_workspace_resource(res)); - - for (const auto& batch : query_batch) { - auto batch_dev_query_view = raft::make_device_matrix_view( - batch.data(), batch.size(), dev_query_view.extent(1)); - auto batch_dev_neighbors_view = raft::make_device_matrix_view( - dev_neighbors.data_handle(), batch.size(), curr_topk); - auto batch_dev_distances_view = raft::make_device_matrix_view( - dev_distances.data_handle(), batch.size(), curr_topk); - - cuvs::neighbors::cagra::search( - res, search_params, idx, batch_dev_query_view, batch_dev_neighbors_view, - batch_dev_distances_view); - - 
RAFT_CUDA_TRY(cudaMemcpy2DAsync( - dev_graph_output.data_handle() + batch.offset() * next_graph_degree, - next_graph_degree * sizeof(IdxT), - dev_neighbors.data_handle(), - curr_topk * sizeof(IdxT), - next_graph_degree * sizeof(IdxT), - batch.size(), - cudaMemcpyDeviceToDevice, - raft::resource::get_cuda_stream(res))); - } - - raft::resource::sync_stream(res); -} - template void search_and_optimize(raft::resources const& res, const cuvs::neighbors::cagra::search_params& search_params, @@ -2076,17 +2029,16 @@ void search_and_optimize(raft::resources const& res, raft::device_matrix_view dev_query_view, raft::device_matrix_view dev_neighbors, raft::device_matrix_view dev_distances, - raft::host_matrix_view neighbors_view, - raft::host_matrix& cagra_graph, + raft::device_matrix& dev_output_graph, size_t curr_query_size, size_t next_graph_degree, size_t curr_topk, - uint64_t max_chunk_size, - bool flag_last, - const index_params& params) + uint64_t max_chunk_size) { auto stream = raft::resource::get_cuda_stream(res); + auto dev_knn_graph = raft::make_device_matrix(res, curr_query_size, curr_topk); + cuvs::spatial::knn::detail::utils::batch_load_iterator query_batch( dev_query_view.data_handle(), curr_query_size, @@ -2109,19 +2061,16 @@ void search_and_optimize(raft::resources const& res, batch_dev_neighbors_view, batch_dev_distances_view); - raft::copy(neighbors_view.data_handle() + batch.offset() * curr_topk, + raft::copy(dev_knn_graph.data_handle() + batch.offset() * curr_topk, batch_dev_neighbors_view.data_handle(), batch.size() * curr_topk, stream); } - auto next_graph_size = curr_query_size; - cagra_graph = raft::make_host_matrix(0, 0); - cagra_graph = raft::make_host_matrix(next_graph_size, next_graph_degree); - optimize(res, - neighbors_view, - cagra_graph.view(), - flag_last ? 
params.guarantee_connectivity : false); + dev_output_graph = + raft::make_device_matrix(res, curr_query_size, next_graph_degree); + + graph::optimize(res, dev_knn_graph.view(), dev_output_graph.view(), false); } template (res, 0, 0); bool use_device_graph = false; @@ -2244,13 +2184,13 @@ auto iterative_build_graph( "# Freed original dataset from device (%.1f MiB); queries will use VPQ reconstruction", to_mib(final_graph_size * dataset_dim * sizeof(T))); } - bool do_skip = true; while (true) { auto start = std::chrono::high_resolution_clock::now(); auto curr_query_size = std::min(2 * curr_graph_size, final_graph_size); auto next_graph_degree = small_graph_degree; if (curr_graph_size == final_graph_size) { next_graph_degree = graph_degree; } + RAFT_LOG_INFO("Current graph size %lu: # current graph degree = %lu", (uint64_t)curr_graph_size, (uint64_t)next_graph_degree); // The search count (topk) is set to the next graph degree + 1, because // pruning is not used except in the last iteration. @@ -2262,9 +2202,6 @@ auto iterative_build_graph( curr_itopk_size = curr_topk + 32; } - do_skip = false;//params.skip_graph_optimization && !flag_last; - RAFT_LOG_INFO("# do_skip = %s", do_skip ? 
"true" : "false"); - cuvs::neighbors::cagra::search_params search_params; search_params.algo = cuvs::neighbors::cagra::search_algo::AUTO; search_params.max_queries = max_chunk_size; @@ -2311,46 +2248,22 @@ auto iterative_build_graph( : raft::make_device_matrix_view( dev_dataset.data_handle(), (int64_t)curr_query_size, dev_dataset.extent(1)); - if (do_skip) { - auto dev_graph_next = - raft::make_device_matrix(res, curr_query_size, next_graph_degree); - - search_to_device_graph(res, - search_params, - idx, - dev_query_view, - dev_neighbors.view(), - dev_distances.view(), - dev_graph_next.view(), - curr_query_size, - next_graph_degree, - curr_topk, - max_chunk_size); - - dev_graph = std::move(dev_graph_next); - use_device_graph = true; - } else { - auto neighbors_view = - raft::make_host_matrix_view(neighbors_ptr, curr_query_size, curr_topk); - - search_and_optimize(res, - search_params, - idx, - dev_query_view, - dev_neighbors.view(), - dev_distances.view(), - neighbors_view, - cagra_graph, - curr_query_size, - next_graph_degree, - curr_topk, - max_chunk_size, - flag_last, - params); - - dev_graph = raft::make_device_matrix(res, 0, 0); - use_device_graph = false; - } + auto dev_optimized_graph = raft::make_device_matrix(res, 0, 0); + + search_and_optimize(res, + search_params, + idx, + dev_query_view, + dev_neighbors.view(), + dev_distances.view(), + dev_optimized_graph, + curr_query_size, + next_graph_degree, + curr_topk, + max_chunk_size); + + dev_graph = std::move(dev_optimized_graph); + use_device_graph = true; auto end = std::chrono::high_resolution_clock::now(); auto elapsed_ms = std::chrono::duration_cast(end - start).count(); @@ -2362,6 +2275,14 @@ auto iterative_build_graph( curr_graph_size = next_graph_size; } + auto stream = raft::resource::get_cuda_stream(res); + cagra_graph = raft::make_host_matrix(dev_graph.extent(0), dev_graph.extent(1)); + raft::copy(cagra_graph.data_handle(), + dev_graph.data_handle(), + dev_graph.extent(0) * dev_graph.extent(1), + 
stream); + raft::resource::sync_stream(res); + return cagra_graph; } diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 5d43da851b..d5647dac00 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -1683,7 +1683,7 @@ void optimize(raft::resources const& res, // reverse graph creation will always use the GPU auto d_rev_graph = raft::make_device_mdarray( - res, large_tmp_mr, raft::make_extents(graph_size, output_graph_degree)); + res, default_ws_mr, raft::make_extents(graph_size, output_graph_degree)); // This should use the default workspace resource for random access / atomics auto d_rev_graph_count = raft::make_device_mdarray( From 1fc5acbc16294290cb6b637d9e712d723e1398e1 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Tue, 31 Mar 2026 15:55:29 +0900 Subject: [PATCH 088/119] Fix copyright --- cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh | 2 +- cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp | 2 +- cpp/tests/neighbors/ann_utils.cuh | 2 +- cpp/tests/neighbors/vpq_utils.cuh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 5de2478702..0cd3bd03a5 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp index 0f55b3efb2..06bcff2072 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ diff --git a/cpp/tests/neighbors/ann_utils.cuh b/cpp/tests/neighbors/ann_utils.cuh index a5bb7c5268..dc4a335e3c 100644 --- a/cpp/tests/neighbors/ann_utils.cuh +++ b/cpp/tests/neighbors/ann_utils.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ diff --git a/cpp/tests/neighbors/vpq_utils.cuh b/cpp/tests/neighbors/vpq_utils.cuh index 8ceb371413..35a14edc76 100644 --- a/cpp/tests/neighbors/vpq_utils.cuh +++ b/cpp/tests/neighbors/vpq_utils.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #include From 9fb763a2da2672e97dd8c2b85a8ec728e9236297 Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Wed, 8 Apr 2026 07:17:58 -0700 Subject: [PATCH 089/119] cmake fix --- cpp/cmake/modules/generate_jit_lto_kernels.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake/modules/generate_jit_lto_kernels.cmake b/cpp/cmake/modules/generate_jit_lto_kernels.cmake index e27f432b76..69356095c6 100644 --- a/cpp/cmake/modules/generate_jit_lto_kernels.cmake +++ b/cpp/cmake/modules/generate_jit_lto_kernels.cmake @@ -75,7 +75,7 @@ function(add_jit_lto_kernel kernel_target) OUTPUT "${_JIT_LTO_EMBEDDED_HEADER_FILE}" COMMAND "${bin_to_c}" --const --name embedded_fatbin --static $ > "${_JIT_LTO_EMBEDDED_HEADER_FILE}" - DEPENDS $ + DEPENDS ${kernel_target} $ ) endfunction() From c91011ff426a59a8a8c01ed89d826e176c1e9b1f Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Wed, 8 Apr 2026 08:56:17 -0700 Subject: [PATCH 090/119] Decoupled compression parameters used during iterative graph construction from the target compression --- .../src/cuvs/cuvs_ann_bench_param_parser.h | 8 ++++ cpp/include/cuvs/neighbors/cagra.hpp | 21 ++++++++- cpp/include/cuvs/neighbors/common.hpp | 12 +++++ .../neighbors/detail/cagra/cagra_build.cuh | 46 ++++++++++++++++--- 4 files changed, 79 insertions(+), 8 deletions(-) diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h index 2eaf3123a0..9b6acaaf10 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h @@ -295,6 +295,7 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index nlohmann::json ivf_pq_search_conf = collect_conf_with_prefix(conf, "ivf_pq_search_"); nlohmann::json nn_descent_conf = collect_conf_with_prefix(conf, "nn_descent_"); nlohmann::json ace_conf = collect_conf_with_prefix(conf, "ace_"); + nlohmann::json 
build_compression_conf = collect_conf_with_prefix(conf, "build_compression_"); // When graph_build_algo is not specified, leave graph_build_params as monostate so the // CAGRA build uses AUTO selection (NN_DESCENT or IVF_PQ based on dataset/heuristics). @@ -325,6 +326,13 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index } else if constexpr (std::is_same_v) { parse_build_param(nn_descent_conf, arg); + } else if constexpr (std::is_same_v< + U, cuvs::neighbors::graph_build_params::iterative_search_params>) { + if (!build_compression_conf.empty()) { + auto vpq_pams = arg.build_compression.value_or(cuvs::neighbors::vpq_params{}); + parse_build_param(build_compression_conf, vpq_pams); + arg.build_compression.emplace(vpq_pams); + } } }, params.graph_build_params); diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index 2c32a6c08e..0b0fb90d0e 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -31,7 +31,26 @@ #include namespace cuvs::neighbors::graph_build_params { -using iterative_search_params = cuvs::neighbors::search_params; + +/** + * Parameters for the iterative CAGRA graph build algorithm. + * + * The iterative builder repeatedly runs CAGRA search() and optimize() to + * refine the graph. When compression is used during graph construction, + * `build_compression` controls the VPQ parameters applied to the dataset + * *while building the graph*. This is independent of `index_params::compression`, + * which controls the compression of the dataset stored in the final index. + */ +struct iterative_search_params : cuvs::neighbors::search_params { + /** + * Optional VPQ compression parameters used during iterative graph construction. + * + * When set, the dataset is compressed with these parameters for the + * search-and-optimize loop. When std::nullopt (default), the builder + * falls back to `index_params::compression` (original behaviour). 
+ */ + std::optional build_compression = std::nullopt; +}; /** Specialized parameters for ACE (Augmented Core Extraction) graph build */ struct ace_params { diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp index c7111aaf4a..2c12f6ef34 100644 --- a/cpp/include/cuvs/neighbors/common.hpp +++ b/cpp/include/cuvs/neighbors/common.hpp @@ -94,6 +94,18 @@ struct vpq_params { * The max number of data points to use per VQ cluster during training. */ uint32_t max_train_points_per_vq_cluster = 1024; + + friend bool operator==(const vpq_params& a, const vpq_params& b) + { + return a.pq_bits == b.pq_bits && a.pq_dim == b.pq_dim && a.vq_n_centers == b.vq_n_centers && + a.kmeans_n_iters == b.kmeans_n_iters && + a.vq_kmeans_trainset_fraction == b.vq_kmeans_trainset_fraction && + a.pq_kmeans_trainset_fraction == b.pq_kmeans_trainset_fraction && + a.pq_kmeans_type == b.pq_kmeans_type && + a.max_train_points_per_pq_code == b.max_train_points_per_pq_code && + a.max_train_points_per_vq_cluster == b.max_train_points_per_vq_cluster; + } + friend bool operator!=(const vpq_params& a, const vpq_params& b) { return !(a == b); } }; /** @} */ // end group cagra_cpp_index_params diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index eaa4411c52..13e91868d5 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -2085,6 +2085,34 @@ auto iterative_build_graph( size_t intermediate_degree = params.intermediate_graph_degree; size_t graph_degree = params.graph_degree; + // Resolve the compression parameters for the build loop. + // `build_compression` (from iterative_search_params) takes priority; + // if unset, fall back to `params.compression` (original behaviour). + const auto& iter_params = + std::get(params.graph_build_params); + const auto& build_compression = + iter_params.build_compression.has_value() ? 
iter_params.build_compression : params.compression; + + if (build_compression.has_value()) { + const auto& bc = *build_compression; + RAFT_LOG_INFO( + "Build compression params: pq_bits=%u, pq_dim=%u, vq_n_centers=%u, kmeans_n_iters=%u, " + "vq_kmeans_trainset_fraction=%.4f, pq_kmeans_trainset_fraction=%.4f, " + "max_train_points_per_pq_code=%u, max_train_points_per_vq_cluster=%u%s", + bc.pq_bits, + bc.pq_dim, + bc.vq_n_centers, + bc.kmeans_n_iters, + bc.vq_kmeans_trainset_fraction, + bc.pq_kmeans_trainset_fraction, + bc.max_train_points_per_pq_code, + bc.max_train_points_per_vq_cluster, + iter_params.build_compression.has_value() ? " (from build_compression)" + : " (from compression)"); + } else { + RAFT_LOG_INFO("Build compression: disabled (uncompressed build)"); + } + auto cagra_graph = raft::make_host_matrix(0, 0); // Iteratively improve the accuracy of the graph by repeatedly running @@ -2164,7 +2192,7 @@ auto iterative_build_graph( // Generate the compressed index once if compression is enabled const uint64_t dataset_dim = dev_dataset.extent(1); std::optional> idx_opt; - if (params.compression.has_value()) { + if (build_compression.has_value()) { auto start = std::chrono::high_resolution_clock::now(); RAFT_EXPECTS(params.metric == cuvs::distance::DistanceType::L2Expanded, "VPQ compression is only supported with L2Expanded distance mertric"); @@ -2173,7 +2201,7 @@ auto iterative_build_graph( res, // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later cuvs::preprocessing::quantize::pq::vpq_build( - res, *params.compression, dev_dataset)); + res, *build_compression, dev_dataset)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_ms = std::chrono::duration_cast(end - start).count(); RAFT_LOG_INFO("# VPQ compression time: %.3lf sec", (double)elapsed_ms / 1000); @@ -2219,7 +2247,7 @@ auto iterative_build_graph( search_params.search_width = 1; // Create index and query views. 
- if (!params.compression.has_value()) { + if (!build_compression.has_value()) { auto dev_dataset_view = raft::make_device_matrix_view( dev_dataset.data_handle(), (int64_t)curr_graph_size, dev_dataset.extent(1)); if (use_device_graph) { @@ -2241,17 +2269,17 @@ auto iterative_build_graph( // When compression is enabled, reconstruct queries from VPQ codes instead of // reading from the (freed) original dataset. auto dev_reconstructed_queries = - params.compression.has_value() + build_compression.has_value() ? raft::make_device_matrix(res, curr_query_size, dataset_dim) : raft::make_device_matrix(res, 0, 0); - if (params.compression.has_value()) { + if (build_compression.has_value()) { auto* vpq_dset = dynamic_cast*>(&idx.data()); RAFT_EXPECTS(vpq_dset != nullptr, "Expected VPQ dataset in compressed index"); reconstruct_vpq_queries( res, *vpq_dset, 0, curr_query_size, dev_reconstructed_queries.view()); } - auto dev_query_view = params.compression.has_value() + auto dev_query_view = build_compression.has_value() ? raft::make_device_matrix_view( dev_reconstructed_queries.data_handle(), (int64_t)curr_query_size, dataset_dim) : raft::make_device_matrix_view( @@ -2284,6 +2312,9 @@ auto iterative_build_graph( curr_graph_size = next_graph_size; } + // TODO: when build_compression matches params.compression, the dataset is compressed twice + // (once for the build loop and once in build()'s shared tail). We could avoid this by returning + // the index directly (with its VPQ dataset and device-side graph) instead of just the host graph. 
auto stream = raft::resource::get_cuda_stream(res); cagra_graph = raft::make_host_matrix(dev_graph.extent(0), dev_graph.extent(1)); raft::copy(cagra_graph.data_handle(), @@ -2384,7 +2415,8 @@ index build( } if (nn_descent_params.graph_degree != intermediate_degree) { RAFT_LOG_WARN( - "Graph degree (%lu) for nn-descent needs to match cagra intermediate graph degree (%lu), " + "Graph degree (%lu) for nn-descent needs to match cagra intermediate graph degree " + "(%lu), " "aligning " "nn-descent graph_degree.", nn_descent_params.graph_degree, From fbba6b259e37bf3b3d47255cb94c13422863dddf Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 8 Apr 2026 18:28:38 +0900 Subject: [PATCH 091/119] Add enable_fp8 --- cpp/CMakeLists.txt | 6 +++--- .../neighbors/detail/cagra/compute_distance_vpq_inst.cu.in | 4 +++- .../neighbors/detail/cagra/compute_distance_vpq_matrix.json | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 72a4d65a5a..aa2b92d825 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -268,12 +268,12 @@ if(NOT BUILD_CPU_ONLY) INPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in" OUTPUT_FILE_FORMAT - "${CMAKE_CURRENT_BINARY_DIR}/src/neighbors/detail/cagra/compute_distance_vpq_inst_data_@data_abbrev@_index_@index_abbrev@_distance_@distance_abbrev@_codebook_@codebook_abbrev@_metric_@metric@_team_@team_size@_dim_@dim@_pq_bits_@pq_bits@_pq_len_@pq_len@.cu" + "${CMAKE_CURRENT_BINARY_DIR}/src/neighbors/detail/cagra/compute_distance_vpq_inst_data_@data_abbrev@_index_@index_abbrev@_distance_@distance_abbrev@_codebook_@codebook_abbrev@_metric_@metric@_team_@team_size@_dim_@dim@_pq_bits_@pq_bits@_pq_len_@pq_len@_fp8@enable_fp8@.cu" ) generate_string_matrix( cagra_compute_distance_vpq_selector_template_params ITEM_FORMAT - "\nvpq_descriptor_spec" + "\nvpq_descriptor_spec" GLUE "," MATRIX_JSON_FILE @@ -282,7 +282,7 @@ if(NOT BUILD_CPU_ONLY) generate_string_matrix( 
cagra_compute_distance_vpq_template_inst ITEM_FORMAT - "extern template struct vpq_descriptor_spec@semicolon@" + "extern template struct vpq_descriptor_spec@semicolon@" GLUE "\n" MATRIX_JSON_FILE diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in index c159da3229..676f25c9fd 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in @@ -13,6 +13,7 @@ constexpr uint32_t team_size = @team_size@; constexpr uint32_t dim = @dim@; constexpr uint32_t pq_bits = @pq_bits@; constexpr uint32_t pq_len = @pq_len@; +constexpr bool enable_fp8 = @enable_fp8@; using codebook_t = @codebook_type@; using data_t = @data_type@; using index_t = @index_type@; @@ -30,6 +31,7 @@ template struct vpq_descriptor_spec; + distance_t, + enable_fp8>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json index cf6e060d33..76c0f888eb 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json @@ -55,5 +55,6 @@ ], "pq_bits": ["8"], "pq_len": ["2", "4"], - "metric": ["L2Expanded"] + "metric": ["L2Expanded"], + "enable_fp8": ["true", "false"] } From 7ed0fe7e9a15ddc826940c146da8b93791188de1 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Thu, 9 Apr 2026 12:41:46 +0900 Subject: [PATCH 092/119] Fix smem_dtype validation --- cpp/src/neighbors/detail/cagra/cagra_search.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index f49e1a4d4c..7c6687efbb 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -152,7 +152,7 @@ void 
search_main(raft::resources const& res, // Dispatch search parameters based on the dataset kind. if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset != nullptr) { - if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::AUTO || + if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::AUTO && params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::F16) { RAFT_LOG_WARN("In this search mode, only AUTO or F16 are supported as the smem_dtype."); } From b7f52101a10ed5a1bbf97dab370aeb8e666c6282 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Thu, 9 Apr 2026 12:44:58 +0900 Subject: [PATCH 093/119] Fix params.smem_dtype set --- cpp/src/neighbors/detail/cagra/cagra_search.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index 7c6687efbb..1445676631 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -154,7 +154,8 @@ void search_main(raft::resources const& res, strided_dset != nullptr) { if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::AUTO && params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::F16) { - RAFT_LOG_WARN("In this search mode, only AUTO or F16 are supported as the smem_dtype."); + RAFT_LOG_WARN("In this search mode, smem_dtype supports only AUTO or F16. 
Set it to AUTO."); + params.smem_dtype = cuvs::neighbors::cagra::internal_dtype::AUTO; } // Search using a plain (strided) row-major dataset RAFT_EXPECTS(index.metric() != cuvs::distance::DistanceType::CosineExpanded || From beb0a47fffa4722b76f5d86ddabdad7ba716f864 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Thu, 9 Apr 2026 13:51:42 +0900 Subject: [PATCH 094/119] Fix CAGRA VPQ instance list --- .../cagra/compute_distance_vpq_matrix.json | 55 +++++++++++++++++-- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json index 76c0f888eb..06aea3f10d 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json @@ -34,17 +34,65 @@ } ], "_mxdim_team": [ + { + "dim": "64", + "team_size": "4", + "pq_len": "2" + }, + { + "dim": "128", + "team_size": "8", + "pq_len": "2" + }, + { + "dim": "256", + "team_size": "16", + "pq_len": "2" + }, + { + "dim": "512", + "team_size": "32", + "pq_len": "2" + }, + { + "dim": "64", + "team_size": "4", + "pq_len": "4" + }, + { + "dim": "128", + "team_size": "8", + "pq_len": "4" + }, + { + "dim": "256", + "team_size": "16", + "pq_len": "4" + }, + { + "dim": "512", + "team_size": "32", + "pq_len": "4" + }, { "dim": "128", - "team_size": "8" + "team_size": "4", + "pq_len": "8" }, { "dim": "256", - "team_size": "16" + "team_size": "8", + "pq_len": "8" }, { "dim": "512", - "team_size": "32" + "team_size": "16", + "pq_len": "8" + }, + { + "dim": "1024", + "team_size": "32", + "pq_len": "8" } ], "_codebook": [ @@ -54,7 +102,6 @@ } ], "pq_bits": ["8"], - "pq_len": ["2", "4"], "metric": ["L2Expanded"], "enable_fp8": ["true", "false"] } From d430c39a15eb9c0eda4b6a2d536b83c029d8fdfd Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Thu, 9 Apr 2026 00:50:47 -0700 Subject: [PATCH 095/119] fixed the In this search mode, only AUTO or F16 
are supported as the smem_dtype. warning bug --- cpp/src/neighbors/detail/cagra/cagra_build.cuh | 3 +-- cpp/src/neighbors/detail/cagra/cagra_search.cuh | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 13e91868d5..1866a0ab7d 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -2415,8 +2415,7 @@ index build( } if (nn_descent_params.graph_degree != intermediate_degree) { RAFT_LOG_WARN( - "Graph degree (%lu) for nn-descent needs to match cagra intermediate graph degree " - "(%lu), " + "Graph degree (%lu) for nn-descent needs to match cagra intermediate graph degree (%lu), " "aligning " "nn-descent graph_degree.", nn_descent_params.graph_degree, diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index 3917b3160b..63743a8197 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -154,7 +154,7 @@ void search_main(raft::resources const& res, // Dispatch search parameters based on the dataset kind. 
if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset != nullptr) { - if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::AUTO || + if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::AUTO && params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::F16) { RAFT_LOG_WARN("In this search mode, only AUTO or F16 are supported as the smem_dtype."); } From 471069b221adc284ec6f91c0ef9b77b85aaea5f4 Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Thu, 9 Apr 2026 01:04:28 -0700 Subject: [PATCH 096/119] Fix structured binding mismatch in calc_recall and add explicit return types Made-with: Cursor --- cpp/tests/neighbors/ann_utils.cuh | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/tests/neighbors/ann_utils.cuh b/cpp/tests/neighbors/ann_utils.cuh index dc4a335e3c..5fd010806e 100644 --- a/cpp/tests/neighbors/ann_utils.cuh +++ b/cpp/tests/neighbors/ann_utils.cuh @@ -128,10 +128,10 @@ struct idx_dist_pair { /** Calculate recall value using only neighbor indices */ template -auto calc_recall(const std::vector& expected_idx, - const std::vector& actual_idx, - size_t rows, - size_t cols) +std::tuple calc_recall(const std::vector& expected_idx, + const std::vector& actual_idx, + size_t rows, + size_t cols) { size_t match_count = 0; size_t total_count = static_cast(rows) * static_cast(cols); @@ -196,7 +196,7 @@ auto eval_recall(const std::vector& expected_idx, double min_recall, bool test_unique = true) -> testing::AssertionResult { - auto [actual_recall, index_based_actual_recall, match_count, total_count] = + auto [actual_recall, match_count, total_count] = calc_recall(expected_idx, actual_idx, rows, cols); double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps); RAFT_LOG_INFO("Recall = %f (%zu/%zu), the error is %2.1f%% %s the threshold (eps = %f).", @@ -220,13 +220,13 @@ auto eval_recall(const std::vector& expected_idx, /** Overload of calc_recall to 
account for distances */ template -auto calc_recall(const std::vector& expected_idx, - const std::vector& actual_idx, - const std::vector& expected_dist, - const std::vector& actual_dist, - size_t rows, - size_t cols, - double eps) +std::tuple calc_recall(const std::vector& expected_idx, + const std::vector& actual_idx, + const std::vector& expected_dist, + const std::vector& actual_dist, + size_t rows, + size_t cols, + double eps) { size_t match_count = 0; size_t index_match_count = 0; From 648ade4f55713af43928bb021bbba6ec96302f2b Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Thu, 9 Apr 2026 02:09:29 -0700 Subject: [PATCH 097/119] Search parameters used during iterative cagra graph construction are now also configurable --- .../src/cuvs/cuvs_ann_bench_param_parser.h | 62 +++++ cpp/include/cuvs/neighbors/cagra.hpp | 220 +++++++++--------- .../neighbors/detail/cagra/cagra_build.cuh | 12 +- 3 files changed, 184 insertions(+), 110 deletions(-) diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h index 9b6acaaf10..137732ca92 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h @@ -296,6 +296,7 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index nlohmann::json nn_descent_conf = collect_conf_with_prefix(conf, "nn_descent_"); nlohmann::json ace_conf = collect_conf_with_prefix(conf, "ace_"); nlohmann::json build_compression_conf = collect_conf_with_prefix(conf, "build_compression_"); + nlohmann::json build_search_conf = collect_conf_with_prefix(conf, "build_search_"); // When graph_build_algo is not specified, leave graph_build_params as monostate so the // CAGRA build uses AUTO selection (NN_DESCENT or IVF_PQ based on dataset/heuristics). 
@@ -333,6 +334,67 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index parse_build_param(build_compression_conf, vpq_pams); arg.build_compression.emplace(vpq_pams); } + if (build_search_conf.contains("width")) { + arg.search_width = build_search_conf.at("width"); + } + if (build_search_conf.contains("max_iterations")) { + arg.max_iterations = build_search_conf.at("max_iterations"); + } + if (build_search_conf.contains("min_iterations")) { + arg.min_iterations = build_search_conf.at("min_iterations"); + } + if (build_search_conf.contains("itopk")) { + arg.itopk_size = build_search_conf.at("itopk"); + } + if (build_search_conf.contains("max_queries")) { + arg.max_queries = build_search_conf.at("max_queries"); + } + if (build_search_conf.contains("team_size")) { + arg.team_size = build_search_conf.at("team_size"); + } + if (build_search_conf.contains("thread_block_size")) { + arg.thread_block_size = build_search_conf.at("thread_block_size"); + } + if (build_search_conf.contains("hashmap_min_bitlen")) { + arg.hashmap_min_bitlen = build_search_conf.at("hashmap_min_bitlen"); + } + if (build_search_conf.contains("hashmap_max_fill_rate")) { + arg.hashmap_max_fill_rate = build_search_conf.at("hashmap_max_fill_rate"); + } + if (build_search_conf.contains("num_random_samplings")) { + arg.num_random_samplings = build_search_conf.at("num_random_samplings"); + } + if (build_search_conf.contains("persistent")) { + arg.persistent = build_search_conf.at("persistent"); + } + if (build_search_conf.contains("persistent_lifetime")) { + arg.persistent_lifetime = build_search_conf.at("persistent_lifetime"); + } + if (build_search_conf.contains("persistent_device_usage")) { + arg.persistent_device_usage = build_search_conf.at("persistent_device_usage"); + } + if (build_search_conf.contains("algo")) { + std::string algo = build_search_conf.at("algo"); + if (algo == "single_cta") { + arg.algo = cuvs::neighbors::cagra::search_algo::SINGLE_CTA; + } else if (algo 
== "multi_cta") { + arg.algo = cuvs::neighbors::cagra::search_algo::MULTI_CTA; + } else if (algo == "multi_kernel") { + arg.algo = cuvs::neighbors::cagra::search_algo::MULTI_KERNEL; + } else if (algo == "auto") { + arg.algo = cuvs::neighbors::cagra::search_algo::AUTO; + } + } + if (build_search_conf.contains("hashmap_mode")) { + std::string mode = build_search_conf.at("hashmap_mode"); + if (mode == "hash") { + arg.hashmap_mode = cuvs::neighbors::cagra::hash_mode::HASH; + } else if (mode == "small") { + arg.hashmap_mode = cuvs::neighbors::cagra::hash_mode::SMALL; + } else if (mode == "auto") { + arg.hashmap_mode = cuvs::neighbors::cagra::hash_mode::AUTO; + } + } } }, params.graph_build_params); diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index 0b0fb90d0e..d7f8f54afc 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -30,18 +30,125 @@ #include #include +namespace cuvs::neighbors::cagra { + +/** + * @defgroup cagra_cpp_search_params CAGRA index search parameters + * @{ + */ + +enum class search_algo { + /** For large batch sizes. */ + SINGLE_CTA = 0, + /** For small batch sizes. */ + MULTI_CTA = 1, + MULTI_KERNEL = 2, + AUTO = 100 +}; + +enum class hash_mode { HASH = 0, SMALL = 1, AUTO = 100 }; + +enum class internal_dtype { F16 = 0, E5M2 = 1, AUTO = 100 }; + +struct search_params : cuvs::neighbors::search_params { + /** Maximum number of queries to search at the same time (batch size). Auto select when 0.*/ + size_t max_queries = 0; + + /** Number of intermediate search results retained during the search. + * + * This is the main knob to adjust trade off between accuracy and search speed. + * Higher values improve the search accuracy. + */ + size_t itopk_size = 64; + + /** Upper limit of search iterations. Auto select when 0.*/ + size_t max_iterations = 0; + + // In the following we list additional search parameters for fine tuning. 
+ // Reasonable default values are automatically chosen. + + /** Which search implementation to use. */ + search_algo algo = search_algo::AUTO; + + /** Number of threads used to calculate a single distance. 4, 8, 16, or 32. */ + size_t team_size = 0; + + /** Number of graph nodes to select as the starting point for the search in each iteration. aka + * search width?*/ + size_t search_width = 1; + /** Lower limit of search iterations. */ + size_t min_iterations = 0; + + /** Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0. */ + size_t thread_block_size = 0; + /** Hashmap type. Auto selection when AUTO. */ + hash_mode hashmap_mode = hash_mode::AUTO; + /** Lower limit of hashmap bit length. More than 8. */ + size_t hashmap_min_bitlen = 0; + /** Upper limit of hashmap fill rate. More than 0.1, less than 0.9.*/ + float hashmap_max_fill_rate = 0.5; + + /** Number of iterations of initial random seed node selection. 1 or more. */ + uint32_t num_random_samplings = 1; + /** Bit mask used for initial random seed node selection. */ + uint64_t rand_xor_mask = 0x128394; + + /** Whether to use the persistent version of the kernel (only SINGLE_CTA is supported a.t.m.) */ + bool persistent = false; + /** Persistent kernel: time in seconds before the kernel stops if no requests received. */ + float persistent_lifetime = 2; + /** + * Set the fraction of maximum grid size used by persistent kernel. + * Value 1.0 means the kernel grid size is maximum possible for the selected device. + * The value must be greater than 0.0 and not greater than 1.0. + * + * One may need to run other kernels alongside this persistent kernel. This parameter can + * be used to reduce the grid size of the persistent kernel to leave a few SMs idle. + * Note: running any other work on GPU alongside with the persistent kernel makes the setup + * fragile. 
+ * - Running another kernel in another thread usually works, but no progress guaranteed + * - Any CUDA allocations block the context (this issue may be obscured by using pools) + * - Memory copies to not-pinned host memory may block the context + * + * Even when we know there are no other kernels working at the same time, setting + * kDeviceUsage to 1.0 surprisingly sometimes hurts performance. Proceed with care. + * If you suspect this is an issue, you can reduce this number to ~0.9 without a significant + * impact on the throughput. + */ + float persistent_device_usage = 1.0; + + /** + * A parameter indicating the rate of nodes to be filtered-out, when filtering is used. + * The value must be equal to or greater than 0.0 and less than 1.0. Default value is + * negative, in which case the filtering rate is automatically calculated. + */ + float filtering_rate = -1.0; + + /** Data type of the query vector and codebook table on shared memory. Currently, only VPQ + * supports FP8. **/ + internal_dtype smem_dtype = internal_dtype::AUTO; +}; + +/** @} */ + +} // namespace cuvs::neighbors::cagra + namespace cuvs::neighbors::graph_build_params { /** * Parameters for the iterative CAGRA graph build algorithm. * - * The iterative builder repeatedly runs CAGRA search() and optimize() to - * refine the graph. When compression is used during graph construction, + * Inherits from cagra::search_params so that all search tuning knobs + * (search_width, max_iterations, itopk_size, etc.) are available for + * controlling the search-and-optimize loop during graph construction. + * The defaults are tuned for the build loop (e.g. search_width=1, + * max_iterations=8) and may differ from the regular search defaults. + * * `build_compression` controls the VPQ parameters applied to the dataset * *while building the graph*. This is independent of `index_params::compression`, * which controls the compression of the dataset stored in the final index. 
*/ -struct iterative_search_params : cuvs::neighbors::search_params { +struct iterative_search_params : cuvs::neighbors::cagra::search_params { /** * Optional VPQ compression parameters used during iterative graph construction. * @@ -50,6 +157,12 @@ struct iterative_search_params : cuvs::neighbors::search_params { * falls back to `index_params::compression` (original behaviour). */ std::optional build_compression = std::nullopt; + + iterative_search_params() + { + this->search_width = 1; + this->max_iterations = 8; + } }; /** Specialized parameters for ACE (Augmented Core Extraction) graph build */ @@ -277,107 +390,6 @@ struct index_params : cuvs::neighbors::index_params { cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded); }; -/** - * @} - */ - -/** - * @defgroup cagra_cpp_search_params CAGRA index search parameters - * @{ - */ - -enum class search_algo { - /** For large batch sizes. */ - SINGLE_CTA = 0, - /** For small batch sizes. */ - MULTI_CTA = 1, - MULTI_KERNEL = 2, - AUTO = 100 -}; - -enum class hash_mode { HASH = 0, SMALL = 1, AUTO = 100 }; - -enum class internal_dtype { F16 = 0, E5M2 = 1, AUTO = 100 }; - -struct search_params : cuvs::neighbors::search_params { - /** Maximum number of queries to search at the same time (batch size). Auto select when 0.*/ - size_t max_queries = 0; - - /** Number of intermediate search results retained during the search. - * - * This is the main knob to adjust trade off between accuracy and search speed. - * Higher values improve the search accuracy. - */ - size_t itopk_size = 64; - - /** Upper limit of search iterations. Auto select when 0.*/ - size_t max_iterations = 0; - - // In the following we list additional search parameters for fine tuning. - // Reasonable default values are automatically chosen. - - /** Which search implementation to use. */ - search_algo algo = search_algo::AUTO; - - /** Number of threads used to calculate a single distance. 4, 8, 16, or 32. 
*/ - size_t team_size = 0; - - /** Number of graph nodes to select as the starting point for the search in each iteration. aka - * search width?*/ - size_t search_width = 1; - /** Lower limit of search iterations. */ - size_t min_iterations = 0; - - /** Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0. */ - size_t thread_block_size = 0; - /** Hashmap type. Auto selection when AUTO. */ - hash_mode hashmap_mode = hash_mode::AUTO; - /** Lower limit of hashmap bit length. More than 8. */ - size_t hashmap_min_bitlen = 0; - /** Upper limit of hashmap fill rate. More than 0.1, less than 0.9.*/ - float hashmap_max_fill_rate = 0.5; - - /** Number of iterations of initial random seed node selection. 1 or more. */ - uint32_t num_random_samplings = 1; - /** Bit mask used for initial random seed node selection. */ - uint64_t rand_xor_mask = 0x128394; - - /** Whether to use the persistent version of the kernel (only SINGLE_CTA is supported a.t.m.) */ - bool persistent = false; - /** Persistent kernel: time in seconds before the kernel stops if no requests received. */ - float persistent_lifetime = 2; - /** - * Set the fraction of maximum grid size used by persistent kernel. - * Value 1.0 means the kernel grid size is maximum possible for the selected device. - * The value must be greater than 0.0 and not greater than 1.0. - * - * One may need to run other kernels alongside this persistent kernel. This parameter can - * be used to reduce the grid size of the persistent kernel to leave a few SMs idle. - * Note: running any other work on GPU alongside with the persistent kernel makes the setup - * fragile. 
- * - Running another kernel in another thread usually works, but no progress guaranteed - * - Any CUDA allocations block the context (this issue may be obscured by using pools) - * - Memory copies to not-pinned host memory may block the context - * - * Even when we know there are no other kernels working at the same time, setting - * kDeviceUsage to 1.0 surprisingly sometimes hurts performance. Proceed with care. - * If you suspect this is an issue, you can reduce this number to ~0.9 without a significant - * impact on the throughput. - */ - float persistent_device_usage = 1.0; - - /** - * A parameter indicating the rate of nodes to be filtered-out, when filtering is used. - * The value must be equal to or greater than 0.0 and less than 1.0. Default value is - * negative, in which case the filtering rate is automatically calculated. - */ - float filtering_rate = -1.0; - - /** Data type of the query vector and codebook table on shared memory. Currently, only VPQ - * supports FP8. **/ - internal_dtype smem_dtype = internal_dtype::AUTO; -}; - /** * @} */ diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 1866a0ab7d..03bbd45dae 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -2112,6 +2112,9 @@ auto iterative_build_graph( } else { RAFT_LOG_INFO("Build compression: disabled (uncompressed build)"); } + RAFT_LOG_INFO("Build search params: search_width=%zu, max_iterations=%zu", + iter_params.search_width, + iter_params.max_iterations); auto cagra_graph = raft::make_host_matrix(0, 0); @@ -2239,12 +2242,9 @@ auto iterative_build_graph( (uint64_t)curr_itopk_size, (uint64_t)curr_topk); - cuvs::neighbors::cagra::search_params search_params; - search_params.algo = cuvs::neighbors::cagra::search_algo::AUTO; - search_params.max_queries = max_chunk_size; - search_params.itopk_size = curr_itopk_size; - search_params.max_iterations = 8; - 
search_params.search_width = 1; + cuvs::neighbors::cagra::search_params search_params = iter_params; + search_params.max_queries = max_chunk_size; + search_params.itopk_size = curr_itopk_size; // Create index and query views. if (!build_compression.has_value()) { From d780bc7d9618b66c2e3d5b16d385db217ba7c3e7 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Thu, 9 Apr 2026 22:34:11 +0900 Subject: [PATCH 098/119] Remove unnecessary files --- .../detail/cagra/compute_distance.cu | 386 ++++++++++++++++++ 1 file changed, 386 insertions(+) create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance.cu diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu new file mode 100644 index 0000000000..4e1625fdd7 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu @@ -0,0 +1,386 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * NOTE: this file is generated by compute_distance_00_generate.py + * + * Make changes there and run in this directory: + * + * > python compute_distance_00_generate.py + * + */ + +#include "compute_distance-ext.cuh" + +namespace cuvs::neighbors::cagra::detail { + +using namespace cuvs::distance; + +template struct instance_selector< + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + 
vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + 
vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + vpq_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec, + standard_descriptor_spec>; + +} // namespace cuvs::neighbors::cagra::detail From 9d19f74f227ada9c48579586f8d8ab3cc94fd968 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Thu, 9 Apr 2026 22:34:54 +0900 Subject: [PATCH 099/119] Remove unnecessary files (2) --- .../detail/cagra/compute_distance.cu | 386 ------------------ 1 file changed, 386 deletions(-) delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance.cu diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu deleted file mode 100644 index 4e1625fdd7..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance.cu +++ /dev/null @@ -1,386 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance-ext.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; - -template struct instance_selector< - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - 
vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - standard_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - vpq_descriptor_spec, - standard_descriptor_spec, - 
standard_descriptor_spec, - standard_descriptor_spec>; - -} // namespace cuvs::neighbors::cagra::detail From c3a3cd968b5461cf8887ef4d91c914f13cb22f86 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Thu, 9 Apr 2026 22:45:56 +0900 Subject: [PATCH 100/119] Remove unnecessary files (3) --- ...t32_dim1024_t32_8pq_8subd_half_fp8false.cu | 31 ------------------- ...nt32_dim1024_t32_8pq_8subd_half_fp8true.cu | 31 ------------------- ...int32_dim128_t4_8pq_8subd_half_fp8false.cu | 31 ------------------- ...uint32_dim128_t4_8pq_8subd_half_fp8true.cu | 31 ------------------- ...int32_dim128_t8_8pq_2subd_half_fp8false.cu | 31 ------------------- ...uint32_dim128_t8_8pq_2subd_half_fp8true.cu | 31 ------------------- ...int32_dim128_t8_8pq_4subd_half_fp8false.cu | 31 ------------------- ...uint32_dim128_t8_8pq_4subd_half_fp8true.cu | 31 ------------------- ...nt32_dim256_t16_8pq_2subd_half_fp8false.cu | 31 ------------------- ...int32_dim256_t16_8pq_2subd_half_fp8true.cu | 31 ------------------- ...nt32_dim256_t16_8pq_4subd_half_fp8false.cu | 31 ------------------- ...int32_dim256_t16_8pq_4subd_half_fp8true.cu | 31 ------------------- ...int32_dim256_t8_8pq_8subd_half_fp8false.cu | 31 ------------------- ...uint32_dim256_t8_8pq_8subd_half_fp8true.cu | 31 ------------------- ...nt32_dim512_t16_8pq_8subd_half_fp8false.cu | 31 ------------------- ...int32_dim512_t16_8pq_8subd_half_fp8true.cu | 31 ------------------- ...nt32_dim512_t32_8pq_2subd_half_fp8false.cu | 31 ------------------- ...int32_dim512_t32_8pq_2subd_half_fp8true.cu | 31 ------------------- ...nt32_dim512_t32_8pq_4subd_half_fp8false.cu | 31 ------------------- ...int32_dim512_t32_8pq_4subd_half_fp8true.cu | 31 ------------------- ...uint32_dim64_t4_8pq_2subd_half_fp8false.cu | 31 ------------------- ..._uint32_dim64_t4_8pq_2subd_half_fp8true.cu | 31 ------------------- ...uint32_dim64_t4_8pq_4subd_half_fp8false.cu | 31 ------------------- ..._uint32_dim64_t4_8pq_4subd_half_fp8true.cu | 31 ------------------- 
...t32_dim1024_t32_8pq_8subd_half_fp8false.cu | 31 ------------------- ...nt32_dim1024_t32_8pq_8subd_half_fp8true.cu | 31 ------------------- ...int32_dim128_t4_8pq_8subd_half_fp8false.cu | 31 ------------------- ...int32_dim128_t8_8pq_4subd_half_fp8false.cu | 31 ------------------- ...int32_dim256_t8_8pq_8subd_half_fp8false.cu | 31 ------------------- ...uint32_dim256_t8_8pq_8subd_half_fp8true.cu | 31 ------------------- ...nt32_dim512_t16_8pq_8subd_half_fp8false.cu | 31 ------------------- ...int32_dim512_t16_8pq_8subd_half_fp8true.cu | 31 ------------------- ...uint32_dim64_t4_8pq_2subd_half_fp8false.cu | 31 ------------------- ..._uint32_dim64_t4_8pq_2subd_half_fp8true.cu | 31 ------------------- ...uint32_dim64_t4_8pq_4subd_half_fp8false.cu | 31 ------------------- ..._uint32_dim64_t4_8pq_4subd_half_fp8true.cu | 31 ------------------- ...t32_dim1024_t32_8pq_8subd_half_fp8false.cu | 31 ------------------- ...nt32_dim1024_t32_8pq_8subd_half_fp8true.cu | 31 ------------------- ...int32_dim128_t4_8pq_8subd_half_fp8false.cu | 31 ------------------- ...int32_dim128_t8_8pq_2subd_half_fp8false.cu | 31 ------------------- ...int32_dim128_t8_8pq_4subd_half_fp8false.cu | 31 ------------------- ...uint32_dim128_t8_8pq_4subd_half_fp8true.cu | 31 ------------------- ...nt32_dim256_t16_8pq_2subd_half_fp8false.cu | 31 ------------------- ...nt32_dim256_t16_8pq_4subd_half_fp8false.cu | 31 ------------------- ...int32_dim256_t8_8pq_8subd_half_fp8false.cu | 31 ------------------- ...uint32_dim256_t8_8pq_8subd_half_fp8true.cu | 31 ------------------- ...nt32_dim512_t16_8pq_8subd_half_fp8false.cu | 31 ------------------- ...int32_dim512_t16_8pq_8subd_half_fp8true.cu | 31 ------------------- ...nt32_dim512_t32_8pq_2subd_half_fp8false.cu | 31 ------------------- ...nt32_dim512_t32_8pq_4subd_half_fp8false.cu | 31 ------------------- ...uint32_dim64_t4_8pq_2subd_half_fp8false.cu | 31 ------------------- ..._uint32_dim64_t4_8pq_2subd_half_fp8true.cu | 31 ------------------- 
...uint32_dim64_t4_8pq_4subd_half_fp8false.cu | 31 ------------------- ..._uint32_dim64_t4_8pq_4subd_half_fp8true.cu | 31 ------------------- ...t32_dim1024_t32_8pq_8subd_half_fp8false.cu | 31 ------------------- ...nt32_dim1024_t32_8pq_8subd_half_fp8true.cu | 31 ------------------- ...int32_dim128_t4_8pq_8subd_half_fp8false.cu | 31 ------------------- ...int32_dim128_t8_8pq_2subd_half_fp8false.cu | 31 ------------------- ...int32_dim128_t8_8pq_4subd_half_fp8false.cu | 31 ------------------- ...uint32_dim128_t8_8pq_4subd_half_fp8true.cu | 31 ------------------- ...nt32_dim256_t16_8pq_2subd_half_fp8false.cu | 31 ------------------- ...nt32_dim256_t16_8pq_4subd_half_fp8false.cu | 31 ------------------- ...int32_dim256_t8_8pq_8subd_half_fp8false.cu | 31 ------------------- ...uint32_dim256_t8_8pq_8subd_half_fp8true.cu | 31 ------------------- ...nt32_dim512_t16_8pq_8subd_half_fp8false.cu | 31 ------------------- ...int32_dim512_t16_8pq_8subd_half_fp8true.cu | 31 ------------------- ...nt32_dim512_t32_8pq_2subd_half_fp8false.cu | 31 ------------------- ...nt32_dim512_t32_8pq_4subd_half_fp8false.cu | 31 ------------------- ...uint32_dim64_t4_8pq_2subd_half_fp8false.cu | 31 ------------------- ..._uint32_dim64_t4_8pq_2subd_half_fp8true.cu | 31 ------------------- ...uint32_dim64_t4_8pq_4subd_half_fp8false.cu | 31 ------------------- ..._uint32_dim64_t4_8pq_4subd_half_fp8true.cu | 31 ------------------- 72 files changed, 2232 deletions(-) delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu delete mode 
100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu delete mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu delete mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu delete mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu delete mode 100644 
cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu deleted file mode 100644 index 2070e7d8f2..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu deleted file mode 100644 index f97bd67591..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu deleted file mode 100644 index b039619c1c..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu deleted file mode 100644 index f280b96812..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t4_8pq_8subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu deleted file mode 100644 index fd9d5223da..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu deleted file mode 100644 index 5ffda49346..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu deleted file mode 100644 index 135144698c..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu deleted file mode 100644 index ec79a11832..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu deleted file mode 100644 index 458e056b0e..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu deleted file mode 100644 index c13492a8c7..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu deleted file mode 100644 index 461af81977..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu deleted file mode 100644 index 30083be1fa..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu deleted file mode 100644 index 972e309ce7..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu deleted file mode 100644 index 23f0fdb48c..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t8_8pq_8subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu deleted file mode 100644 index dbb86642ce..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu deleted file mode 100644 index 9aeb7dbf26..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t16_8pq_8subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu deleted file mode 100644 index 7c95cfb5eb..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu deleted file mode 100644 index efdec2d449..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu deleted file mode 100644 index 5788588ecd..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu deleted file mode 100644 index 4b8c9067e8..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu deleted file mode 100644 index fd4ccd8b25..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu deleted file mode 100644 index 148244bc70..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_2subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu deleted file mode 100644 index 072bc59621..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu deleted file mode 100644 index 012dbef416..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim64_t4_8pq_4subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu deleted file mode 100644 index 7763ce7d65..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu deleted file mode 100644 index 66a4c66046..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu deleted file mode 100644 index 279ee6f07f..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t4_8pq_8subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu deleted file mode 100644 index 587dc25379..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu deleted file mode 100644 index 628a18c8d1..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu deleted file mode 100644 index 8a485c8306..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t8_8pq_8subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu deleted file mode 100644 index 5482a736c0..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu deleted file mode 100644 index e0d191d086..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t16_8pq_8subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu deleted file mode 100644 index 785f75ed31..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu deleted file mode 100644 index 35ca051988..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_2subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu deleted file mode 100644 index 61af7cdc03..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu deleted file mode 100644 index 3683fe112b..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim64_t4_8pq_4subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu deleted file mode 100644 index 3f8378d722..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu deleted file mode 100644 index 3496957041..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu deleted file mode 100644 index 8e1ff80ddb..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu deleted file mode 100644 index ed5bf3fe56..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu deleted file mode 100644 index 4246762dbb..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu deleted file mode 100644 index 53e132c40d..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu deleted file mode 100644 index fc6496dc35..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu deleted file mode 100644 index 8298fa1460..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu deleted file mode 100644 index a1f1b5d019..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu deleted file mode 100644 index 9c5d18ca61..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu deleted file mode 100644 index 9d9ce9a74b..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu deleted file mode 100644 index 319803418d..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu deleted file mode 100644 index e9e5602c41..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu deleted file mode 100644 index 3372a83874..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu deleted file mode 100644 index ada26482fc..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu deleted file mode 100644 index 7abcf8596e..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu deleted file mode 100644 index 0ee39456de..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu deleted file mode 100644 index 47a7e2913b..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu deleted file mode 100644 index a1114b3c8c..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu deleted file mode 100644 index 920d6ff0d4..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim1024_t32_8pq_8subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu deleted file mode 100644 index 7c5d5f7a7e..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t4_8pq_8subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu deleted file mode 100644 index 3b36686271..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu deleted file mode 100644 index 50d0570acf..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu deleted file mode 100644 index ed67e72ee9..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu deleted file mode 100644 index f2bb24b6af..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu deleted file mode 100644 index 24a301551a..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu deleted file mode 100644 index 20a4bf007e..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu deleted file mode 100644 index 684930622e..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t8_8pq_8subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu deleted file mode 100644 index a66e1aef14..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu deleted file mode 100644 index 7ba74089f7..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t16_8pq_8subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu deleted file mode 100644 index 8b36b01c48..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu deleted file mode 100644 index 77ee8a588f..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu deleted file mode 100644 index bd91a70a63..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu deleted file mode 100644 index e3faa06b80..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_2subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu deleted file mode 100644 index 2426bcd2c9..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8false.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu deleted file mode 100644 index f0f99f2e73..0000000000 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim64_t4_8pq_4subd_half_fp8true.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * NOTE: this file is generated by compute_distance_00_generate.py - * - * Make changes there and run in this directory: - * - * > python compute_distance_00_generate.py - * - */ - -#include "compute_distance_vpq-impl.cuh" - -namespace cuvs::neighbors::cagra::detail { - -using namespace cuvs::distance; -template struct vpq_descriptor_spec; - -} // namespace cuvs::neighbors::cagra::detail From ec349597e081170d19925aaf8e5f5c208ec52c97 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Fri, 10 Apr 2026 15:09:26 +0900 Subject: [PATCH 101/119] Fix a compilation error --- cpp/tests/neighbors/ann_utils.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/neighbors/ann_utils.cuh b/cpp/tests/neighbors/ann_utils.cuh index dc4a335e3c..8c95d6aa81 100644 --- a/cpp/tests/neighbors/ann_utils.cuh +++ b/cpp/tests/neighbors/ann_utils.cuh @@ -196,7 +196,7 @@ auto eval_recall(const std::vector& expected_idx, double min_recall, bool test_unique = true) -> 
testing::AssertionResult { - auto [actual_recall, index_based_actual_recall, match_count, total_count] = + auto [actual_recall, match_count, total_count] = calc_recall(expected_idx, actual_idx, rows, cols); double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps); RAFT_LOG_INFO("Recall = %f (%zu/%zu), the error is %2.1f%% %s the threshold (eps = %f).", From 25398339b0e235d3fa252602b987a13167a83ae6 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Fri, 29 May 2026 17:50:22 +0900 Subject: [PATCH 102/119] Add pq_len=8 --- .../cagra/compute_distance_vpq_matrix.json | 53 ++++++++++++-- .../jit_lto_kernels/cagra_planner_base.hpp | 23 ++++-- .../compute_distance_matrix.json | 70 ++++++++++++++++++ .../setup_workspace_matrix.json | 70 ++++++++++++++++++ cpp/tests/neighbors/ann_cagra.cuh | 46 +++++++++++- cpp/tests/neighbors/vpq_utils.cuh | 73 +++++++++++++++++++ 6 files changed, 321 insertions(+), 14 deletions(-) create mode 100644 cpp/tests/neighbors/vpq_utils.cuh diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json index cf6e060d33..c6e2ae319c 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json @@ -36,15 +36,53 @@ "_mxdim_team": [ { "dim": "128", - "team_size": "8" + "team_size": "8", + "pq_len": "2" }, { "dim": "256", - "team_size": "16" + "team_size": "16", + "pq_len": "2" }, { "dim": "512", - "team_size": "32" + "team_size": "32", + "pq_len": "2" + }, + { + "dim": "128", + "team_size": "8", + "pq_len": "4" + }, + { + "dim": "256", + "team_size": "16", + "pq_len": "4" + }, + { + "dim": "512", + "team_size": "32", + "pq_len": "4" + }, + { + "dim": "128", + "team_size": "4", + "pq_len": "8" + }, + { + "dim": "256", + "team_size": "8", + "pq_len": "8" + }, + { + "dim": "512", + "team_size": "16", + "pq_len": "8" + }, + { + "dim": "1024", + "team_size": "32", + 
"pq_len": "8" } ], "_codebook": [ @@ -53,7 +91,10 @@ "codebook_abbrev": "h" } ], - "pq_bits": ["8"], - "pq_len": ["2", "4"], - "metric": ["L2Expanded"] + "pq_bits": [ + "8" + ], + "metric": [ + "L2Expanded" + ] } diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp index 317ca1a1b6..b9e7891723 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp @@ -64,8 +64,8 @@ struct CagraPlannerBase : AlgorithmPlanner { uint32_t dataset_block_dim, uint32_t pq_len) { - if (pq_len != 2 && pq_len != 4) { - RAFT_FAIL("CAGRA JIT VPQ setup_workspace expects pq_len in {2,4} (matrix uses pq_bits=8)"); + if (pq_len != 2 && pq_len != 4 && pq_len != 8) { + RAFT_FAIL("CAGRA JIT VPQ setup_workspace expects pq_len in {2,4,8} (matrix uses pq_bits=8)"); } auto add = [&]() { this->add_static_fragment() { if (pq_len == 2) { add.template operator()(); - } else { + } else if (pq_len == 4) { add.template operator()(); + } else { + add.template operator()(); } }); } @@ -120,8 +122,8 @@ struct CagraPlannerBase : AlgorithmPlanner { uint32_t dataset_block_dim, uint32_t pq_len) { - if (pq_len != 2 && pq_len != 4) { - RAFT_FAIL("CAGRA JIT VPQ compute_distance expects pq_len in {2,4} (matrix uses pq_bits=8)"); + if (pq_len != 2 && pq_len != 4 && pq_len != 8) { + RAFT_FAIL("CAGRA JIT VPQ compute_distance expects pq_len in {2,4,8} (matrix uses pq_bits=8)"); } auto add = [&]() { this->add_static_fragment() { if (pq_len == 2) { add.template operator()(); - } else { + } else if (pq_len == 4) { add.template operator()(); + } else { + add.template operator()(); } }); } @@ -219,6 +223,12 @@ struct CagraPlannerBase : AlgorithmPlanner { static void dispatch_cagra_team_dim(uint32_t team_size, uint32_t dataset_block_dim, Lambda&& l) { switch (team_size) { + case 4: + switch (dataset_block_dim) { + case 128: 
std::forward(l).template operator()<4u, 128u>(); return; + default: break; + } + break; case 8: switch (dataset_block_dim) { case 128: std::forward(l).template operator()<8u, 128u>(); return; @@ -240,6 +250,7 @@ struct CagraPlannerBase : AlgorithmPlanner { case 128: std::forward(l).template operator()<32u, 128u>(); return; case 256: std::forward(l).template operator()<32u, 256u>(); return; case 512: std::forward(l).template operator()<32u, 512u>(); return; + case 1024: std::forward(l).template operator()<32u, 1024u>(); return; default: break; } break; diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json index 82b8dbdf4e..2e64ee2ce1 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json @@ -150,5 +150,75 @@ "codebook_abbrev": "half" } ] + }, + { + "_data": [ + { + "data_type": "float", + "data_abbrev": "f" + }, + { + "data_type": "__half", + "data_abbrev": "h" + }, + { + "data_type": "uint8_t", + "data_abbrev": "u8" + }, + { + "data_type": "int8_t", + "data_abbrev": "i8" + } + ], + "_query": [ + { + "query_type": "half", + "query_abbrev": "h" + } + ], + "_index": [ + { + "index_type": "uint32_t", + "index_abbrev": "u32" + } + ], + "_distance": [ + { + "distance_type": "float", + "distance_abbrev": "f" + } + ], + "_pq": [ + { + "pq_len": "8", + "pq_bits": "8", + "pq_prefix": "_vpq", + "pq_suffix": "_8pq_8subd" + } + ], + "_codebook": [ + { + "codebook_type": "half", + "codebook_abbrev": "half" + } + ], + "_mxdim_team": [ + { + "dataset_block_dim": "128", + "team_size": "4" + }, + { + "dataset_block_dim": "256", + "team_size": "8" + }, + { + "dataset_block_dim": "512", + "team_size": "16" + }, + { + "dataset_block_dim": "1024", + "team_size": "32" + } + ] } ] diff --git 
a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json index 83aa8764bc..64c82ce13a 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json @@ -150,5 +150,75 @@ "codebook_abbrev": "half" } ] + }, + { + "_data": [ + { + "data_type": "float", + "data_abbrev": "f" + }, + { + "data_type": "__half", + "data_abbrev": "h" + }, + { + "data_type": "uint8_t", + "data_abbrev": "u8" + }, + { + "data_type": "int8_t", + "data_abbrev": "i8" + } + ], + "_query": [ + { + "query_type": "half", + "query_abbrev": "h" + } + ], + "_index": [ + { + "index_type": "uint32_t", + "index_abbrev": "u32" + } + ], + "_distance": [ + { + "distance_type": "float", + "distance_abbrev": "f" + } + ], + "_pq": [ + { + "pq_len": "8", + "pq_bits": "8", + "pq_prefix": "_vpq", + "pq_suffix": "_8pq_8subd" + } + ], + "_codebook": [ + { + "codebook_type": "half", + "codebook_abbrev": "half" + } + ], + "_mxdim_team": [ + { + "dataset_block_dim": "128", + "team_size": "4" + }, + { + "dataset_block_dim": "256", + "team_size": "8" + }, + { + "dataset_block_dim": "512", + "team_size": "16" + }, + { + "dataset_block_dim": "1024", + "team_size": "32" + } + ] } ] diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh index a6704f892a..826b8d1a3a 100644 --- a/cpp/tests/neighbors/ann_cagra.cuh +++ b/cpp/tests/neighbors/ann_cagra.cuh @@ -6,6 +6,7 @@ #include "../test_utils.cuh" #include "ann_utils.cuh" +#include "vpq_utils.cuh" #include #include "naive_knn.cuh" @@ -461,6 +462,46 @@ class AnnCagraTest : public ::testing::TestWithParam { raft::update_host(indices_Cagra.data(), indices_dev.data(), queries_size, stream_); raft::resource::sync_stream(handle_); + + reference_recall = 1; + if (ps.compression.has_value()) { + auto decoded_dataset = + raft::make_device_matrix(handle_, 
ps.n_rows, ps.dim); + cuvs::neighbors::decode_vpq_dataset( + decoded_dataset.view(), + dynamic_cast&>(index.data()), + raft::resource::get_cuda_stream(handle_)); + auto indices_out_view = raft::make_device_matrix_view( + indices_dev.data(), ps.n_queries, ps.k); + auto dists_out_view = raft::make_device_matrix_view( + distances_dev.data(), ps.n_queries, ps.k); + + cuvs::neighbors::naive_knn(handle_, + dists_out_view.data_handle(), + indices_out_view.data_handle(), + search_queries.data(), + decoded_dataset.data_handle(), + ps.n_queries, + ps.n_rows, + ps.dim, + ps.k, + ps.metric); + std::vector indices_vpq_dataset(queries_size); + std::vector distances_vpq_dataset(queries_size); + raft::update_host( + distances_vpq_dataset.data(), dists_out_view.data_handle(), queries_size, stream_); + raft::update_host( + indices_vpq_dataset.data(), indices_out_view.data_handle(), queries_size, stream_); + + reference_recall = std::get<1>(calc_recall(indices_naive, + indices_vpq_dataset, + distances_naive, + distances_vpq_dataset, + ps.n_queries, + ps.k, + 0)); + printf("reference_recall = %e\n", reference_recall); + } } // for (int i = 0; i < min(ps.n_queries, 10); i++) { @@ -470,7 +511,7 @@ class AnnCagraTest : public ::testing::TestWithParam { // print_vector("T", distances_naive.data() + i * ps.k, ps.k, std::cout); // print_vector("C", distances_Cagra.data() + i * ps.k, ps.k, std::cout); // } - double min_recall = ps.min_recall; + double min_recall = ps.min_recall * reference_recall; EXPECT_TRUE(eval_neighbours(indices_naive, indices_Cagra, distances_naive, @@ -519,6 +560,7 @@ class AnnCagraTest : public ::testing::TestWithParam { AnnCagraInputs ps; rmm::device_uvector database; rmm::device_uvector search_queries; + double reference_recall; }; template @@ -1652,7 +1694,7 @@ inline std::vector generate_inputs() {cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL, cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_LOGICAL}); // don't demand high recall // without refinement - 
for (uint32_t pq_len : {2}) { // for now, only pq_len = 2 is supported, more options coming soon + for (uint32_t pq_len : {2, 4, 8}) { for (uint32_t vq_n_centers : {100, 1000}) { for (auto input : inputs2) { vpq_params ps{}; diff --git a/cpp/tests/neighbors/vpq_utils.cuh b/cpp/tests/neighbors/vpq_utils.cuh new file mode 100644 index 0000000000..613b5fe1d5 --- /dev/null +++ b/cpp/tests/neighbors/vpq_utils.cuh @@ -0,0 +1,73 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include +#include + +#include +#include + +namespace cuvs::neighbors { +template +__global__ void decode_vpq_dataset_kernel(data_t* const decoded_dataset_ptr, + const uint32_t ldd, + const math_t* const vq_codebook_ptr, + const uint32_t ldv, + const math_t* const pq_codebook_ptr, + const uint32_t pq_subspace_dim, + const uint32_t pq_table_size, + const uint32_t dataset_dim, + const size_t dataset_size, + const uint8_t* const data_ptr, + const uint32_t ldi) +{ + constexpr uint32_t warp_size = 32; + const size_t batch_id = (blockIdx.x * blockDim.x + threadIdx.x) / warp_size; + if (batch_id >= dataset_size) { return; } + + const auto local_data_ptr = data_ptr + ldi * batch_id; + const auto vq_code = *reinterpret_cast(local_data_ptr); + const auto pq_code_ptr = local_data_ptr + sizeof(uint32_t); + const auto vq_vec_ptr = vq_codebook_ptr + vq_code * ldv; + auto local_dst_ptr = decoded_dataset_ptr + batch_id * ldd; + + const auto lane_id = threadIdx.x % warp_size; + for (uint32_t i = lane_id; i < dataset_dim; i += warp_size) { + const auto pq_code = pq_code_ptr[i / pq_subspace_dim]; + const auto pq_v = pq_codebook_ptr[pq_code * pq_subspace_dim + (i % pq_subspace_dim)]; + + local_dst_ptr[i] = static_cast(vq_vec_ptr[i]) + static_cast(pq_v); + } +} + +template +void decode_vpq_dataset(raft::device_matrix_view decoded_dataset, + const cuvs::neighbors::vpq_dataset& vpq_dataset, + cudaStream_t cuda_stream) +{ 
+ const auto dataset_size = decoded_dataset.extent(0); + RAFT_EXPECTS(vpq_dataset.data.extent(0) == dataset_size, "Dataset sizes mismatch"); + + constexpr uint32_t block_size = 256; + constexpr uint32_t warp_size = 32; + constexpr int64_t vecs_per_cta = block_size / warp_size; + const auto grid_size = raft::div_rounding_up_safe(decoded_dataset.extent(0), vecs_per_cta); + + decode_vpq_dataset_kernel + <<>>(decoded_dataset.data_handle(), + decoded_dataset.stride(0), + vpq_dataset.vq_code_book.data_handle(), + vpq_dataset.vq_code_book.stride(0), + vpq_dataset.pq_code_book.data_handle(), + vpq_dataset.pq_len(), + 1u << vpq_dataset.pq_bits(), + vpq_dataset.dim(), + dataset_size, + vpq_dataset.data.data_handle(), + vpq_dataset.data.stride(0)); +} +} // namespace cuvs::neighbors From 19d5a0a788119c40f5e2041db50a90f947dd4c3b Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Thu, 4 Jun 2026 12:01:19 +0900 Subject: [PATCH 103/119] Update cagra-q test --- cpp/tests/neighbors/ann_utils.cuh | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/cpp/tests/neighbors/ann_utils.cuh b/cpp/tests/neighbors/ann_utils.cuh index cbc95d7bb7..64732a8a3a 100644 --- a/cpp/tests/neighbors/ann_utils.cuh +++ b/cpp/tests/neighbors/ann_utils.cuh @@ -227,8 +227,9 @@ auto calc_recall(const std::vector& expected_idx, size_t cols, double eps) { - size_t match_count = 0; - size_t total_count = static_cast(rows) * static_cast(cols); + size_t match_count = 0; + size_t index_match_count = 0; + size_t total_count = static_cast(rows) * static_cast(cols); for (size_t i = 0; i < rows; ++i) { for (size_t k = 0; k < cols; ++k) { size_t idx_k = i * cols + k; // row major assumption! 
@@ -247,8 +248,28 @@ auto calc_recall(const std::vector& expected_idx, } } } - return std::make_tuple( - static_cast(match_count) / static_cast(total_count), match_count, total_count); + + // Index based recall + for (size_t i = 0; i < rows; ++i) { + for (size_t k = 0; k < cols; ++k) { + size_t idx_k = i * cols + k; // row major assumption! + auto act_idx = actual_idx[idx_k]; + for (size_t j = 0; j < cols; ++j) { + size_t idx = i * cols + j; // row major assumption! + auto exp_idx = expected_idx[idx]; + + if (act_idx == exp_idx) { + index_match_count++; + break; + } + } + } + } + + return std::make_tuple(static_cast(match_count) / static_cast(total_count), + static_cast(index_match_count) / static_cast(total_count), + match_count, + total_count); } /** same as eval_recall, but in case indices do not match, @@ -265,7 +286,7 @@ auto eval_neighbours(const std::vector& expected_idx, bool test_unique = true, size_t max_duplicates = 0) -> testing::AssertionResult { - auto [actual_recall, match_count, total_count] = + auto [actual_recall, index_based_actual_recall, match_count, total_count] = calc_recall(expected_idx, actual_idx, expected_dist, actual_dist, rows, cols, eps); double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps); From 09deae59dfa21902bd23a83f671c405c6f9fc759 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Thu, 4 Jun 2026 12:05:17 +0900 Subject: [PATCH 104/119] Update the compute distance kernel --- .../cagra/compute_distance_vpq-impl.cuh | 17 ++++- .../detail/cagra/compute_distance_vpq.hpp | 8 ++- .../jit_lto_kernels/compute_distance_impl.cuh | 46 +++++++++---- .../jit_lto_kernels/setup_workspace_impl.cuh | 68 +++++++++++-------- 4 files changed, 93 insertions(+), 46 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index 6992ae979a..d0f12a20fd 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ 
b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -14,6 +14,14 @@ namespace cuvs::neighbors::cagra::detail { +template +struct vpq_smem_value_config { + using smem_val_pack_t = half2; + using smem_val_t = half; + using smem_val_pack_uint_t = uint32_t; + static constexpr uint32_t num_packed_elements = 2; +}; + template ; + static constexpr std::uint32_t kSMemCodeBookSizeInBytes = - (1 << PQ_BITS) * PQ_LEN * utils::size_of(); + (1 << PQ_BITS) * PQ_LEN * utils::size_of() / + smem_val_config::num_packed_elements; _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(const std::uint8_t* encoded_dataset_ptr, std::uint32_t encoded_dataset_dim, @@ -108,7 +119,9 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t(dim, DatasetBlockDim) * sizeof(QUERY_T); + raft::round_up_safe(dim, DatasetBlockDim) * + utils::size_of() / + smem_val_config::num_packed_elements; } private: diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp index 2b69a1cef4..299916c6c7 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -69,6 +69,12 @@ struct vpq_descriptor_spec : public instance_spec { // Match codebook params if (dataset.pq_bits() != PqBits) { return -1.0; } if (dataset.pq_len() != PqLen) { return -1.0; } + // Keep auto-selection on the tuned VPQ diagonal while allowing explicit team_size requests to + // use the expanded team_size / dataset_block_dim grid. + constexpr std::uint32_t auto_dataset_block_dim_per_team = PqLen == 8 ? 32 : 16; + if (params.team_size == 0 && DatasetBlockDim != TeamSize * auto_dataset_block_dim_per_team) { + return -1.0; + } // Otherwise, favor the closest dataset dimensionality. 
constexpr std::uint32_t preferred_load_elmes_per_thread = 16; /*magic number that is good based on experiments.*/ diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh index 92a014bd2f..08f44c171e 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh @@ -100,10 +100,16 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker_impl( constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim; constexpr auto PQ_BITS = DescriptorT::kPqBits; constexpr auto PQ_LEN = DescriptorT::kPqLen; + using PQ_CODEBOOK_LOAD_T = uint32_t; + + using smem_val_config = vpq_smem_value_config; + using smem_val_pack_t = typename smem_val_config::smem_val_pack_t; + using smem_val_pack_uint_t = typename smem_val_config::smem_val_pack_uint_t; + constexpr uint32_t num_packed_elements = smem_val_config::num_packed_elements; const uint32_t query_ptr = pq_codebook_ptr + DescriptorT::kSMemCodeBookSizeInBytes; static_assert(PQ_BITS == 8, "Only pq_bits == 8 is supported at the moment."); - constexpr uint32_t vlen = 4; // **** DO NOT CHANGE **** + constexpr uint32_t vlen = utils::size_of() / utils::size_of(); constexpr uint32_t nelem = raft::div_rounding_up_unsafe(DatasetBlockDim / PQ_LEN, TeamSize * vlen); @@ -115,12 +121,17 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker_impl( DISTANCE_T norm = 0; for (uint32_t elem_offset = 0; elem_offset * PQ_LEN < dim; elem_offset += DatasetBlockDim / PQ_LEN) { - uint32_t pq_codes[nelem]; + PQ_CODEBOOK_LOAD_T pq_codes[nelem]; #pragma unroll for (std::uint32_t e = 0; e < nelem; e++) { const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen; if (k >= n_subspace) break; - device::ldg_cg(pq_codes[e], reinterpret_cast(dataset_ptr + 4 + k)); + if constexpr (std::is_same_v) { + 
device::ldg_cg(pq_codes[e], + reinterpret_cast(dataset_ptr + 4 + k)); + } else { + pq_codes[e] = *reinterpret_cast(dataset_ptr + 4 + k); + } } // if constexpr (PQ_LEN % 2 == 0) { @@ -135,23 +146,30 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker_impl( if (d >= dim) break; device::ldg_ca(vq_vals[m], vq_code_book_ptr + d); } - std::uint32_t pq_code = pq_codes[e]; + PQ_CODEBOOK_LOAD_T pq_code = pq_codes[e]; #pragma unroll for (std::uint32_t v = 0; v < vlen; v++) { if (PQ_LEN * (v + k) >= dim) break; #pragma unroll - for (std::uint32_t m = 0; m < PQ_LEN / 2; m++) { - constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN); - const std::uint32_t d1 = m + (PQ_LEN / 2) * v; - const std::uint32_t d = - d1 * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId; - half2 q2, c2; - device::lds(q2, query_ptr + sizeof(half2) * d); + for (std::uint32_t m = 0; m < PQ_LEN / num_packed_elements; m++) { + constexpr uint32_t vq_val_pack_num_elements = 2; + constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN); + const std::uint32_t vq_half2_index = + m * (num_packed_elements / vq_val_pack_num_elements) + (PQ_LEN / 2) * v; + + static_assert(num_packed_elements == 2, + "CAGRA JIT VPQ currently stores pq_len=8 in half2 shared-memory packs"); + const uint32_t query_val_index = + vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId; + + smem_val_pack_t q2, c2; + device::lds(q2, query_ptr + sizeof(smem_val_pack_t) * query_val_index); device::lds(c2, pq_codebook_ptr + - sizeof(CODE_BOOK_T) * ((1 << PQ_BITS) * 2 * m + (2 * (pq_code & 0xff)))); - auto dist = q2 - c2 - reinterpret_cast(vq_vals)[d1]; - dist = dist * dist; + sizeof(smem_val_pack_uint_t) * ((1 << PQ_BITS) * m + (pq_code & 0xff))); + auto dist = + q2 - c2 - reinterpret_cast(vq_vals)[vq_half2_index]; + dist = dist * dist; norm += static_cast(dist.x + dist.y); } pq_code >>= 8; diff --git 
a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh index 8cdd7febd5..ed83c181fe 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh @@ -79,12 +79,16 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq_impl( const typename DescriptorT::DATA_T* queries_ptr, uint32_t query_id) -> const DescriptorT* { - using QUERY_T = typename DescriptorT::QUERY_T; - using CODE_BOOK_T = typename DescriptorT::CODE_BOOK_T; - using word_type = uint32_t; - constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; - constexpr auto PQ_BITS = DescriptorT::kPqBits; - constexpr auto PQ_LEN = DescriptorT::kPqLen; + using QUERY_T = typename DescriptorT::QUERY_T; + using word_type = uint32_t; + constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; + constexpr auto PQ_BITS = DescriptorT::kPqBits; + constexpr auto PQ_LEN = DescriptorT::kPqLen; + using smem_val_config = vpq_smem_value_config; + using smem_val_t = typename smem_val_config::smem_val_t; + using smem_val_pack_t = typename smem_val_config::smem_val_pack_t; + using smem_val_pack_uint_t = typename smem_val_config::smem_val_pack_uint_t; + constexpr auto num_packed_elements = smem_val_config::num_packed_elements; auto* r = reinterpret_cast(smem_ptr); @@ -105,18 +109,22 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq_impl( } __syncthreads(); - for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) { - half2 buf2; - buf2.x = r->pq_code_book_ptr()[i]; - buf2.y = r->pq_code_book_ptr()[i + 1]; - - constexpr auto num_elements_per_bank = 4 / utils::size_of(); - constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank; - const auto j = i / num_elements_per_bank; - const auto smem_index = - (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS); - - 
device::sts(codebook_buf + smem_index * sizeof(half2), buf2); + for (unsigned i = threadIdx.x * num_packed_elements; i < (1 << PQ_BITS) * PQ_LEN; + i += blockDim.x * num_packed_elements) { + constexpr auto num_elements_per_bank = + num_packed_elements / (utils::size_of() / utils::size_of()); + + if constexpr (PQ_LEN >= num_elements_per_bank) { + constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank; + const auto j = i / num_elements_per_bank; + const auto smem_index = + (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS); + + smem_val_pack_t buf; + buf.x = r->pq_code_book_ptr()[i]; + buf.y = r->pq_code_book_ptr()[i + 1]; + device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_t), buf); + } } } @@ -125,19 +133,21 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq_impl( constexpr cuvs::spatial::knn::detail::utils::mapping mapping{}; auto smem_query_ptr = - reinterpret_cast(reinterpret_cast(smem_ptr) + sizeof(DescriptorT) + - DescriptorT::kSMemCodeBookSizeInBytes); - for (unsigned i = threadIdx.x * 2; i < dim; i += blockDim.x * 2) { - half2 buf2{0, 0}; - if (i < dim) { buf2.x = mapping(queries_ptr[i]); } - if (i + 1 < dim) { buf2.y = mapping(queries_ptr[i + 1]); } - if constexpr ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) { + reinterpret_cast(reinterpret_cast(smem_ptr) + sizeof(DescriptorT) + + DescriptorT::kSMemCodeBookSizeInBytes); + for (unsigned i = threadIdx.x * num_packed_elements; i < dim; + i += blockDim.x * num_packed_elements) { + smem_val_pack_t buf{0, 0}; + if (i < dim) { buf.x = mapping(queries_ptr[i]); } + if (i + 1 < dim) { buf.y = mapping(queries_ptr[i + 1]); } + if constexpr ((PQ_BITS == 8) && (PQ_LEN % num_packed_elements == 0)) { constexpr uint32_t vlen = 4; // **** DO NOT CHANGE **** - constexpr auto kStride = vlen * PQ_LEN / 2; - reinterpret_cast(smem_query_ptr)[transpose(i / 2)] = - buf2; + constexpr auto kStride = vlen * PQ_LEN / num_packed_elements; + reinterpret_cast( + smem_query_ptr)[transpose( + 
i / num_packed_elements)] = buf; } else { - (reinterpret_cast(smem_query_ptr + i))[0] = buf2; + (reinterpret_cast(smem_query_ptr + i))[0] = buf; } } From 5fa53216cf33996f772ce3e10de4e5a545be3528 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Thu, 4 Jun 2026 18:10:24 +0900 Subject: [PATCH 105/119] Add FP8 support --- .../neighbors/detail/cagra/cagra_search.cuh | 5 + .../detail/cagra/compute_distance.hpp | 7 +- .../cagra/compute_distance_vpq-impl.cuh | 49 +++- .../detail/cagra/compute_distance_vpq.hpp | 10 +- .../cagra/compute_distance_vpq_inst.cu.in | 4 +- .../cagra/compute_distance_vpq_matrix.json | 4 + .../detail/cagra/device_memory_ops.hpp | 15 + cpp/src/neighbors/detail/cagra/factory.cuh | 12 +- .../cagra_jit_launcher_factory.hpp | 36 ++- .../jit_lto_kernels/cagra_planner_base.hpp | 274 ++++++++++++++---- .../jit_lto_kernels/compute_distance_impl.cuh | 69 +++-- .../compute_distance_kernel.cu.in | 7 +- .../compute_distance_matrix.json | 26 ++ .../jit_lto_kernels/setup_workspace_impl.cuh | 45 ++- .../setup_workspace_kernel.cu.in | 4 +- .../setup_workspace_matrix.json | 26 ++ .../neighbors/detail/cagra/packed_type.hpp | 49 ++++ 17 files changed, 520 insertions(+), 122 deletions(-) create mode 100644 cpp/src/neighbors/detail/cagra/packed_type.hpp diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index bca8d3314d..f199cf7882 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -153,6 +153,11 @@ void search_main(raft::resources const& res, // Dispatch search parameters based on the dataset kind. if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset != nullptr) { + if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::AUTO && + params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::F16) { + RAFT_LOG_WARN("In this search mode, smem_dtype supports only AUTO or F16. 
Set it to AUTO."); + params.smem_dtype = cuvs::neighbors::cagra::internal_dtype::AUTO; + } // Search using a plain (strided) row-major dataset RAFT_EXPECTS(index.metric() != cuvs::distance::DistanceType::CosineExpanded || index.dataset_norms().has_value(), diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index 75b56860bb..7f921ce948 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -207,6 +207,7 @@ struct dataset_descriptor_host { bool is_vpq = false; uint32_t pq_bits = 0; uint32_t pq_len = 0; + bool enable_fp8 = false; // Codebook type is determined by DataT for VPQ (always half for now) struct state { @@ -258,7 +259,8 @@ struct dataset_descriptor_host { uint32_t dataset_block_dim_val, bool is_vpq_val = false, uint32_t pq_bits_val = 0, - uint32_t pq_len_val = 0) + uint32_t pq_len_val = 0, + bool enable_fp8_val = false) : value_{std::make_shared(init, sizeof(DescriptorImpl))}, smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()}, team_size{dd_host.team_size()}, @@ -266,7 +268,8 @@ struct dataset_descriptor_host { dataset_block_dim{dataset_block_dim_val}, is_vpq{is_vpq_val}, pq_bits{pq_bits_val}, - pq_len{pq_len_val} + pq_len{pq_len_val}, + enable_fp8{enable_fp8_val} { } diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index d0f12a20fd..ea994c450a 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -6,6 +6,7 @@ #pragma once #include "compute_distance_vpq.hpp" +#include "packed_type.hpp" #include #include @@ -14,14 +15,27 @@ namespace cuvs::neighbors::cagra::detail { -template -struct vpq_smem_value_config { +template +struct vpq_smem_value_config; + +template +struct vpq_smem_value_config> { using smem_val_pack_t = half2; using smem_val_t = 
half; using smem_val_pack_uint_t = uint32_t; static constexpr uint32_t num_packed_elements = 2; }; +template +struct vpq_smem_value_config> { + using smem_val_pack_t = device::fp8xN; + using smem_val_t = typename smem_val_pack_t::unit_t; + using smem_val_pack_uint_t = typename smem_val_pack_t::uint_t; + static constexpr uint32_t num_packed_elements = smem_val_pack_t::num_elements; +}; + template + typename QueryT, + bool EnableFP8> struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t { using base_type = dataset_descriptor_base_t; using CODE_BOOK_T = CodebookT; @@ -46,6 +61,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t, "Only CODE_BOOK_T = `half` is supported now"); @@ -88,7 +104,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t; + using smem_val_config = vpq_smem_value_config; static constexpr std::uint32_t kSMemCodeBookSizeInBytes = (1 << PQ_BITS) * PQ_LEN * utils::size_of() / @@ -135,7 +151,8 @@ template + typename DistanceT, + bool EnableFP8> RAFT_KERNEL __launch_bounds__(1, 1) vpq_dataset_descriptor_init_kernel(dataset_descriptor_base_t* out, const std::uint8_t* encoded_dataset_ptr, @@ -153,7 +170,8 @@ RAFT_KERNEL __launch_bounds__(1, 1) DataT, IndexT, DistanceT, - half>; + half, + EnableFP8>; new (out) desc_type( encoded_dataset_ptr, encoded_dataset_dim, vq_code_book_ptr, pq_code_book_ptr, size, dim); } @@ -166,7 +184,8 @@ template + typename DistanceT, + bool EnableFP8> dataset_descriptor_host vpq_descriptor_spec::init_(const cagra::search_params& params, + DistanceT, + EnableFP8>::init_(const cagra::search_params& params, const std::uint8_t* encoded_dataset_ptr, uint32_t encoded_dataset_dim, const CodebookT* vq_code_book_ptr, @@ -192,7 +212,8 @@ vpq_descriptor_spec; + half, + EnableFP8>; return host_type{ desc_type{ @@ -207,7 +228,8 @@ vpq_descriptor_spec<<<1, 1, 0, stream>>>(dev_ptr, + DistanceT, + EnableFP8><<<1, 1, 0, stream>>>(dev_ptr, encoded_dataset_ptr, encoded_dataset_dim, 
vq_code_book_ptr, @@ -218,9 +240,10 @@ vpq_descriptor_spec +#include #include @@ -21,7 +22,8 @@ template + typename DistanceT, + bool EnableFP8> struct vpq_descriptor_spec : public instance_spec { using base_type = instance_spec; using typename base_type::data_type; @@ -63,12 +65,18 @@ struct vpq_descriptor_spec : public instance_spec { const DatasetT& dataset, cuvs::distance::DistanceType metric) -> double { + const auto fp8_natively_supported = raft::getComputeCapability().first >= 9; + const auto use_fp8 = + params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::E5M2 || + (params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::AUTO && fp8_natively_supported); + // If explicit team_size is specified and doesn't match the instance, discard it if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; } if (cuvs::distance::DistanceType::L2Expanded != metric) { return -1.0; } // Match codebook params if (dataset.pq_bits() != PqBits) { return -1.0; } if (dataset.pq_len() != PqLen) { return -1.0; } + if (use_fp8 != EnableFP8) { return -1.0; } // Keep auto-selection on the tuned VPQ diagonal while allowing explicit team_size requests to // use the expanded team_size / dataset_block_dim grid. constexpr std::uint32_t auto_dataset_block_dim_per_team = PqLen == 8 ? 
32 : 16; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in index c159da3229..676f25c9fd 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in @@ -13,6 +13,7 @@ constexpr uint32_t team_size = @team_size@; constexpr uint32_t dim = @dim@; constexpr uint32_t pq_bits = @pq_bits@; constexpr uint32_t pq_len = @pq_len@; +constexpr bool enable_fp8 = @enable_fp8@; using codebook_t = @codebook_type@; using data_t = @data_type@; using index_t = @index_type@; @@ -30,6 +31,7 @@ template struct vpq_descriptor_spec; + distance_t, + enable_fp8>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json index c6e2ae319c..7dac07c2a4 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json @@ -96,5 +96,9 @@ ], "metric": [ "L2Expanded" + ], + "enable_fp8": [ + "true", + "false" ] } diff --git a/cpp/src/neighbors/detail/cagra/device_memory_ops.hpp b/cpp/src/neighbors/detail/cagra/device_memory_ops.hpp index cc164994ea..1bcf6f8fbd 100644 --- a/cpp/src/neighbors/detail/cagra/device_memory_ops.hpp +++ b/cpp/src/neighbors/detail/cagra/device_memory_ops.hpp @@ -54,6 +54,11 @@ RAFT_DEVICE_INLINE_FUNCTION void lds(uint32_t& x, uint32_t addr) asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "r"(addr)); } +RAFT_DEVICE_INLINE_FUNCTION void lds(uint64_t& x, uint32_t addr) +{ + asm volatile("ld.shared.u64 {%0}, [%1];" : "=l"(x) : "r"(addr)); +} + RAFT_DEVICE_INLINE_FUNCTION void lds(uint32_t& x, const uint32_t* addr) { lds(x, uint32_t(__cvta_generic_to_shared(addr))); @@ -71,6 +76,16 @@ RAFT_DEVICE_INLINE_FUNCTION void lds(uint4& x, const uint4* addr) lds(x, 
uint32_t(__cvta_generic_to_shared(addr))); } +RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const uint32_t& x) +{ + asm volatile("st.shared.u32 [%0], %1;" : : "r"(addr), "r"(reinterpret_cast(x))); +} + +RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const uint64_t& x) +{ + asm volatile("st.shared.u64 [%0], %1;" : : "r"(addr), "l"(reinterpret_cast(x))); +} + RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const half2& x) { asm volatile("st.shared.v2.u16 [%0], {%1, %2};" diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh index 26cd13bab8..a1e2f6be9c 100644 --- a/cpp/src/neighbors/detail/cagra/factory.cuh +++ b/cpp/src/neighbors/detail/cagra/factory.cuh @@ -87,6 +87,7 @@ struct key { uint32_t extra_val; // this one has different meanings for different descriptor types uint32_t team_size; uint32_t metric; + uint32_t smem_dtype; }; template @@ -100,7 +101,8 @@ auto make_key(const cagra::search_params& params, dataset.dim(), dataset.stride(), uint32_t(params.team_size), - uint32_t(metric)}; + uint32_t(metric), + uint32_t(params.smem_dtype)}; } template @@ -114,20 +116,22 @@ auto make_key(const cagra::search_params& params, dataset.dim(), uint32_t(reinterpret_cast(dataset.pq_code_book.data_handle()) >> 6), uint32_t(params.team_size), - uint32_t(metric)}; + uint32_t(metric), + uint32_t(params.smem_dtype)}; } inline auto operator==(const key& a, const key& b) -> bool { return a.data_ptr == b.data_ptr && a.n_rows == b.n_rows && a.dim == b.dim && - a.extra_val == b.extra_val && a.team_size == b.team_size && a.metric == b.metric; + a.extra_val == b.extra_val && a.team_size == b.team_size && a.metric == b.metric && + a.smem_dtype == b.smem_dtype; } struct key_hash { inline auto operator()(const key& x) const noexcept -> std::size_t { return size_t{x.data_ptr} + size_t{x.n_rows} * size_t{x.dim} * size_t{x.extra_val} + - (size_t{x.team_size} ^ size_t{x.metric}); + (size_t{x.team_size} ^ size_t{x.metric}) + 
size_t{x.smem_dtype}; } }; diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_jit_launcher_factory.hpp b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_jit_launcher_factory.hpp index 60d17c5128..60e965796c 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_jit_launcher_factory.hpp +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_jit_launcher_factory.hpp @@ -57,10 +57,14 @@ std::shared_ptr build_single_cta_launcher( persistent); if constexpr (std::is_same_v) { - planner.add_setup_workspace_device_function( - dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len); - planner.add_compute_distance_device_function( - dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len); + planner.add_setup_workspace_device_function(dataset_desc.team_size, + dataset_desc.dataset_block_dim, + dataset_desc.pq_len, + dataset_desc.enable_fp8); + planner.add_compute_distance_device_function(dataset_desc.team_size, + dataset_desc.dataset_block_dim, + dataset_desc.pq_len, + dataset_desc.enable_fp8); } else { planner.add_setup_workspace_device_function(dataset_desc.team_size, dataset_desc.dataset_block_dim); @@ -102,10 +106,14 @@ std::shared_ptr build_multi_cta_launcher( dataset_desc.pq_len); if constexpr (std::is_same_v) { - planner.add_setup_workspace_device_function( - dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len); - planner.add_compute_distance_device_function( - dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len); + planner.add_setup_workspace_device_function(dataset_desc.team_size, + dataset_desc.dataset_block_dim, + dataset_desc.pq_len, + dataset_desc.enable_fp8); + planner.add_compute_distance_device_function(dataset_desc.team_size, + dataset_desc.dataset_block_dim, + dataset_desc.pq_len, + dataset_desc.enable_fp8); } else { planner.add_setup_workspace_device_function(dataset_desc.team_size, dataset_desc.dataset_block_dim); @@ -147,10 +155,14 
@@ std::shared_ptr build_multi_kernel_launcher( dataset_desc.pq_bits, dataset_desc.pq_len); if constexpr (std::is_same_v) { - planner.add_setup_workspace_device_function( - dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len); - planner.add_compute_distance_device_function( - dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len); + planner.add_setup_workspace_device_function(dataset_desc.team_size, + dataset_desc.dataset_block_dim, + dataset_desc.pq_len, + dataset_desc.enable_fp8); + planner.add_compute_distance_device_function(dataset_desc.team_size, + dataset_desc.dataset_block_dim, + dataset_desc.pq_len, + dataset_desc.enable_fp8); } else { planner.add_setup_workspace_device_function(dataset_desc.team_size, dataset_desc.dataset_block_dim); diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp index b9e7891723..14ef271c2a 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp @@ -50,11 +50,13 @@ struct CagraPlannerBase : AlgorithmPlanner { TeamSz, Dim, PqBitsV, - PqLenV>>(); + PqLenV, + tag_smem_f16>>(); }; - dispatch_cagra_team_dim(team_size, dataset_block_dim, [&add]() { - add.template operator()(); - }); + dispatch_cagra_standard_team_dim( + team_size, dataset_block_dim, [&add]() { + add.template operator()(); + }); } /// VPQ (`tag_codebook_half`): JIT matrix fixes `pq_bits=8`; only `pq_len` is selected at runtime. 
@@ -62,32 +64,39 @@ struct CagraPlannerBase : AlgorithmPlanner { std::enable_if_t, int> = 0> void add_setup_workspace_device_function(uint32_t team_size, uint32_t dataset_block_dim, - uint32_t pq_len) + uint32_t pq_len, + bool enable_fp8) { if (pq_len != 2 && pq_len != 4 && pq_len != 8) { RAFT_FAIL("CAGRA JIT VPQ setup_workspace expects pq_len in {2,4,8} (matrix uses pq_bits=8)"); } - auto add = [&]() { - this->add_static_fragment>(); + auto add = + [&]() { + this->add_static_fragment>(); + }; + auto dispatch_smem = [&]() { + dispatch_cagra_vpq_team_dim( + team_size, + dataset_block_dim, + pq_len, + [&add]() { + add.template operator()(); + }); }; - dispatch_cagra_team_dim( - team_size, dataset_block_dim, [&add, pq_len]() { - if (pq_len == 2) { - add.template operator()(); - } else if (pq_len == 4) { - add.template operator()(); - } else { - add.template operator()(); - } - }); + if (enable_fp8) { + dispatch_smem.template operator()(); + } else { + dispatch_smem.template operator()(); + } } /// Registers dist_op + normalization + `compute_distance` for standard layout. @@ -108,11 +117,13 @@ struct CagraPlannerBase : AlgorithmPlanner { TeamSz, Dim, PqBitsV, - PqLenV>>(); + PqLenV, + tag_smem_f16>>(); }; - dispatch_cagra_team_dim(team_size, dataset_block_dim, [&add]() { - add.template operator()(); - }); + dispatch_cagra_standard_team_dim( + team_size, dataset_block_dim, [&add]() { + add.template operator()(); + }); } /// VPQ: only the `compute_distance` fragment (no standard dist_op / normalization in this path). 
@@ -120,35 +131,179 @@ struct CagraPlannerBase : AlgorithmPlanner { std::enable_if_t, int> = 0> void add_compute_distance_device_function(uint32_t team_size, uint32_t dataset_block_dim, - uint32_t pq_len) + uint32_t pq_len, + bool enable_fp8) { if (pq_len != 2 && pq_len != 4 && pq_len != 8) { RAFT_FAIL("CAGRA JIT VPQ compute_distance expects pq_len in {2,4,8} (matrix uses pq_bits=8)"); } - auto add = [&]() { - this->add_static_fragment>(); + auto add = + [&]() { + this->add_static_fragment>(); + }; + auto dispatch_smem = [&]() { + dispatch_cagra_vpq_team_dim( + team_size, + dataset_block_dim, + pq_len, + [&add]() { + add.template operator()(); + }); }; - dispatch_cagra_team_dim( - team_size, dataset_block_dim, [&add, pq_len]() { - if (pq_len == 2) { - add.template operator()(); - } else if (pq_len == 4) { - add.template operator()(); - } else { - add.template operator()(); - } - }); + if (enable_fp8) { + dispatch_smem.template operator()(); + } else { + dispatch_smem.template operator()(); + } } private: + template + static void dispatch_cagra_standard_team_dim(uint32_t team_size, + uint32_t dataset_block_dim, + Lambda&& l) + { + switch (team_size) { + case 8: + switch (dataset_block_dim) { + case 128: std::forward(l).template operator()<8u, 128u>(); return; + case 256: std::forward(l).template operator()<8u, 256u>(); return; + case 512: std::forward(l).template operator()<8u, 512u>(); return; + default: break; + } + break; + case 16: + switch (dataset_block_dim) { + case 128: std::forward(l).template operator()<16u, 128u>(); return; + case 256: std::forward(l).template operator()<16u, 256u>(); return; + case 512: std::forward(l).template operator()<16u, 512u>(); return; + default: break; + } + break; + case 32: + switch (dataset_block_dim) { + case 128: std::forward(l).template operator()<32u, 128u>(); return; + case 256: std::forward(l).template operator()<32u, 256u>(); return; + case 512: std::forward(l).template operator()<32u, 512u>(); return; + default: 
break; + } + break; + default: break; + } + RAFT_FAIL("Unsupported standard team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u", + static_cast(team_size), + static_cast(dataset_block_dim)); + } + + template + static void dispatch_cagra_vpq_pq2_4_team_dim(uint32_t team_size, + uint32_t dataset_block_dim, + Lambda&& l) + { + switch (team_size) { + case 8: + switch (dataset_block_dim) { + case 128: std::forward(l).template operator()<8u, 128u, 8u, PqLenV>(); return; + case 256: std::forward(l).template operator()<8u, 256u, 8u, PqLenV>(); return; + case 512: std::forward(l).template operator()<8u, 512u, 8u, PqLenV>(); return; + default: break; + } + break; + case 16: + switch (dataset_block_dim) { + case 128: std::forward(l).template operator()<16u, 128u, 8u, PqLenV>(); return; + case 256: std::forward(l).template operator()<16u, 256u, 8u, PqLenV>(); return; + case 512: std::forward(l).template operator()<16u, 512u, 8u, PqLenV>(); return; + default: break; + } + break; + case 32: + switch (dataset_block_dim) { + case 128: std::forward(l).template operator()<32u, 128u, 8u, PqLenV>(); return; + case 256: std::forward(l).template operator()<32u, 256u, 8u, PqLenV>(); return; + case 512: std::forward(l).template operator()<32u, 512u, 8u, PqLenV>(); return; + default: break; + } + break; + default: break; + } + RAFT_FAIL( + "Unsupported VPQ pq_len=%u team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u", + static_cast(PqLenV), + static_cast(team_size), + static_cast(dataset_block_dim)); + } + + template + static void dispatch_cagra_vpq_pq8_team_dim(uint32_t team_size, + uint32_t dataset_block_dim, + Lambda&& l) + { + switch (team_size) { + case 4: + switch (dataset_block_dim) { + case 128: std::forward(l).template operator()<4u, 128u, 8u, 8u>(); return; + default: break; + } + break; + case 8: + switch (dataset_block_dim) { + case 256: std::forward(l).template operator()<8u, 256u, 8u, 8u>(); return; + default: break; + } + break; + case 16: + switch 
(dataset_block_dim) { + case 512: std::forward(l).template operator()<16u, 512u, 8u, 8u>(); return; + default: break; + } + break; + case 32: + switch (dataset_block_dim) { + case 1024: std::forward(l).template operator()<32u, 1024u, 8u, 8u>(); return; + default: break; + } + break; + default: break; + } + RAFT_FAIL( + "Unsupported VPQ pq_len=8 team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u", + static_cast(team_size), + static_cast(dataset_block_dim)); + } + + template + static void dispatch_cagra_vpq_team_dim(uint32_t team_size, + uint32_t dataset_block_dim, + uint32_t pq_len, + Lambda&& l) + { + switch (pq_len) { + case 2: + dispatch_cagra_vpq_pq2_4_team_dim<2u>( + team_size, dataset_block_dim, std::forward(l)); + return; + case 4: + dispatch_cagra_vpq_pq2_4_team_dim<4u>( + team_size, dataset_block_dim, std::forward(l)); + return; + case 8: + dispatch_cagra_vpq_pq8_team_dim(team_size, dataset_block_dim, std::forward(l)); + return; + default: break; + } + RAFT_FAIL("CAGRA JIT VPQ expects pq_len in {2,4,8}; got %u", static_cast(pq_len)); + } + void add_dist_op_device_function(cuvs::distance::DistanceType metric) { // dist_op_matrix.json pairs tag_metric_hamming with uint8 query (tag_u8) only; L2/IP/L1 use @@ -193,15 +348,16 @@ struct CagraPlannerBase : AlgorithmPlanner { uint32_t dataset_block_dim) { auto go = [&]() { - dispatch_cagra_team_dim(team_size, dataset_block_dim, [&]() { - this->add_static_fragment>(); - }); + dispatch_cagra_standard_team_dim( + team_size, dataset_block_dim, [&]() { + this->add_static_fragment>(); + }); }; // tag_u8 is only used for BitwiseHamming query layout; cosine norm fragments are built for // float query tag. 
Use if constexpr so we do not instantiate tag_norm_cosine with tag_u8 diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh index 08f44c171e..59b43bea64 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh @@ -100,9 +100,10 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker_impl( constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim; constexpr auto PQ_BITS = DescriptorT::kPqBits; constexpr auto PQ_LEN = DescriptorT::kPqLen; + constexpr auto EnableFP8 = DescriptorT::kEnableFP8; using PQ_CODEBOOK_LOAD_T = uint32_t; - using smem_val_config = vpq_smem_value_config; + using smem_val_config = vpq_smem_value_config; using smem_val_pack_t = typename smem_val_config::smem_val_pack_t; using smem_val_pack_uint_t = typename smem_val_config::smem_val_pack_uint_t; constexpr uint32_t num_packed_elements = smem_val_config::num_packed_elements; @@ -154,23 +155,55 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker_impl( for (std::uint32_t m = 0; m < PQ_LEN / num_packed_elements; m++) { constexpr uint32_t vq_val_pack_num_elements = 2; constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN); - const std::uint32_t vq_half2_index = + std::uint32_t vq_half2_index = m * (num_packed_elements / vq_val_pack_num_elements) + (PQ_LEN / 2) * v; - static_assert(num_packed_elements == 2, - "CAGRA JIT VPQ currently stores pq_len=8 in half2 shared-memory packs"); - const uint32_t query_val_index = - vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId; + uint32_t query_val_index; + if constexpr (num_packed_elements == 2) { + query_val_index = + vq_half2_index * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId; + } else if constexpr (PQ_LEN == num_packed_elements) { + 
query_val_index = elem_offset + v * (DatasetBlockDim / (num_packed_elements * vlen)) + + e * TeamSize + laneId; + } else { + const uint32_t query_vec_element_id = + (elem_offset + e * vlen * TeamSize + v + laneId * vlen) * PQ_LEN / + num_packed_elements; + constexpr auto kStride = vlen * PQ_LEN / num_packed_elements; + query_val_index = + transpose(query_vec_element_id); + } - smem_val_pack_t q2, c2; - device::lds(q2, query_ptr + sizeof(smem_val_pack_t) * query_val_index); - device::lds(c2, - pq_codebook_ptr + - sizeof(smem_val_pack_uint_t) * ((1 << PQ_BITS) * m + (pq_code & 0xff))); - auto dist = - q2 - c2 - reinterpret_cast(vq_vals)[vq_half2_index]; - dist = dist * dist; - norm += static_cast(dist.x + dist.y); + if constexpr (num_packed_elements == 2) { + smem_val_pack_t q2, c2; + device::lds(q2, query_ptr + sizeof(smem_val_pack_t) * query_val_index); + device::lds(c2, + pq_codebook_ptr + + sizeof(smem_val_pack_uint_t) * ((1 << PQ_BITS) * m + (pq_code & 0xff))); + auto dist = + q2 - c2 - reinterpret_cast(vq_vals)[vq_half2_index]; + dist = dist * dist; + norm += static_cast(dist.x + dist.y); + } else if constexpr (num_packed_elements == 4 || num_packed_elements == 8) { + smem_val_pack_t q_vec, c_vec; + device::lds(q_vec.as_uint(), + query_ptr + sizeof(smem_val_pack_uint_t) * query_val_index); + device::lds(c_vec.as_uint(), + pq_codebook_ptr + + sizeof(smem_val_pack_uint_t) * ((1 << PQ_BITS) * m + (pq_code & 0xff))); + + half2 q2, c2; +#pragma unroll + for (uint32_t bi = 0; bi < num_packed_elements / 2; bi++) { + q2 = q_vec.as_half2(bi); + c2 = c_vec.as_half2(bi); + auto dist = + q2 - c2 - reinterpret_cast(vq_vals)[vq_half2_index]; + dist = dist * dist; + norm += static_cast(dist.x + dist.y); + vq_half2_index += 1; + } + } } pq_code >>= 8; } @@ -237,7 +270,8 @@ template + typename QueryT, + bool EnableFP8> __device__ DistanceT compute_distance_impl( const typename dataset_descriptor_base_t::args_t args, IndexT dataset_index) @@ -256,7 +290,8 @@ __device__ DistanceT 
compute_distance_impl( DataT, IndexT, DistanceT, - QueryT>; + QueryT, + EnableFP8>; return compute_distance_vpq_impl(args, dataset_index); } else { static_assert(sizeof(TeamSize) == 0, diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in index 13cd022918..130cbf502f 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in @@ -11,6 +11,7 @@ constexpr uint32_t k_team_size = @team_size@u; constexpr uint32_t k_dataset_block_dim = @dataset_block_dim@u; constexpr uint32_t k_pq_bits = @pq_bits@u; constexpr uint32_t k_pq_len = @pq_len@u; +constexpr bool k_enable_fp8 = @enable_fp8@; using data_t = @data_type@; using index_t = @index_type@; @@ -38,7 +39,8 @@ __device__ distance_t compute_distance(const args_t data_t, index_t, distance_t, - query_t>(args, dataset_index) + query_t, + k_enable_fp8>(args, dataset_index) : distance_t{}; return device::team_sum(per_thread, team_size_bits); } @@ -55,7 +57,8 @@ compute_distance_per_thread(const args_t args, inde data_t, index_t, distance_t, - query_t>(args, dataset_index); + query_t, + k_enable_fp8>(args, dataset_index); } } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json index 2e64ee2ce1..f1ce0daaab 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json @@ -81,6 +81,12 @@ "codebook_type": "void", "codebook_abbrev": "none" } + ], + "_smem": [ + { + "enable_fp8": "false", + "smem_abbrev": "f16" + } ] }, { @@ -149,6 +155,16 @@ "codebook_type": "half", "codebook_abbrev": "half" } + ], + "_smem": [ + { + "enable_fp8": "false", + 
"smem_abbrev": "f16" + }, + { + "enable_fp8": "true", + "smem_abbrev": "e5m2" + } ] }, { @@ -219,6 +235,16 @@ "dataset_block_dim": "1024", "team_size": "32" } + ], + "_smem": [ + { + "enable_fp8": "false", + "smem_abbrev": "f16" + }, + { + "enable_fp8": "true", + "smem_abbrev": "e5m2" + } ] } ] diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh index ed83c181fe..220c76ac96 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh @@ -84,7 +84,8 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq_impl( constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; constexpr auto PQ_BITS = DescriptorT::kPqBits; constexpr auto PQ_LEN = DescriptorT::kPqLen; - using smem_val_config = vpq_smem_value_config; + constexpr auto EnableFP8 = DescriptorT::kEnableFP8; + using smem_val_config = vpq_smem_value_config; using smem_val_t = typename smem_val_config::smem_val_t; using smem_val_pack_t = typename smem_val_config::smem_val_pack_t; using smem_val_pack_uint_t = typename smem_val_config::smem_val_pack_uint_t; @@ -120,10 +121,20 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq_impl( const auto smem_index = (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS); - smem_val_pack_t buf; - buf.x = r->pq_code_book_ptr()[i]; - buf.y = r->pq_code_book_ptr()[i + 1]; - device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_t), buf); + if constexpr (num_packed_elements == 2) { + smem_val_pack_t buf; + buf.x = r->pq_code_book_ptr()[i]; + buf.y = r->pq_code_book_ptr()[i + 1]; + device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_t), buf); + } else if constexpr (num_packed_elements == 4 || num_packed_elements == 8) { + smem_val_pack_t buf; +#pragma unroll + for (uint32_t k = 0; k < num_packed_elements; k++) { + buf.data.x1[k] = + 
static_cast(static_cast(r->pq_code_book_ptr()[i + k])); + } + device::sts(codebook_buf + smem_index * sizeof(smem_val_pack_uint_t), buf.as_uint()); + } } } } @@ -137,9 +148,21 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq_impl( DescriptorT::kSMemCodeBookSizeInBytes); for (unsigned i = threadIdx.x * num_packed_elements; i < dim; i += blockDim.x * num_packed_elements) { - smem_val_pack_t buf{0, 0}; - if (i < dim) { buf.x = mapping(queries_ptr[i]); } - if (i + 1 < dim) { buf.y = mapping(queries_ptr[i + 1]); } + smem_val_pack_t buf; + if constexpr (num_packed_elements == 2) { + buf.x = 0; + buf.y = 0; + if (i < dim) { buf.x = mapping(queries_ptr[i]); } + if (i + 1 < dim) { buf.y = mapping(queries_ptr[i + 1]); } + } else if constexpr (num_packed_elements == 4 || num_packed_elements == 8) { +#pragma unroll + for (uint32_t k = 0; k < num_packed_elements; k++) { + buf.data.x1[k] = static_cast(0.0f); + if (i + k < dim) { + buf.data.x1[k] = static_cast(static_cast(mapping(queries_ptr[i + k]))); + } + } + } if constexpr ((PQ_BITS == 8) && (PQ_LEN % num_packed_elements == 0)) { constexpr uint32_t vlen = 4; // **** DO NOT CHANGE **** constexpr auto kStride = vlen * PQ_LEN / num_packed_elements; @@ -162,7 +185,8 @@ template + typename QueryT, + bool EnableFP8> __device__ const dataset_descriptor_base_t* setup_workspace_impl( const dataset_descriptor_base_t* desc_ptr, void* smem, @@ -186,7 +210,8 @@ __device__ const dataset_descriptor_base_t* setup_work DataT, IndexT, DistanceT, - QueryT>; + QueryT, + EnableFP8>; const desc_t* desc = static_cast(desc_ptr); const desc_t* result = setup_workspace_vpq_impl(desc, smem, queries, query_id); diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in index fa17705250..2177212e36 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in +++ 
b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in @@ -12,6 +12,7 @@ constexpr uint32_t k_team_size = @team_size@u; constexpr uint32_t k_dataset_block_dim = @dataset_block_dim@u; constexpr uint32_t k_pq_bits = @pq_bits@u; constexpr uint32_t k_pq_len = @pq_len@u; +constexpr bool k_enable_fp8 = @enable_fp8@; using data_t = @data_type@; using index_t = @index_type@; @@ -39,7 +40,8 @@ setup_workspace( data_t, index_t, distance_t, - query_t>(desc, smem, queries, query_id); + query_t, + k_enable_fp8>(desc, smem, queries, query_id); } } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json index 64c82ce13a..7ee92494e6 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json @@ -81,6 +81,12 @@ "codebook_type": "void", "codebook_abbrev": "none" } + ], + "_smem": [ + { + "enable_fp8": "false", + "smem_abbrev": "f16" + } ] }, { @@ -149,6 +155,16 @@ "codebook_type": "half", "codebook_abbrev": "half" } + ], + "_smem": [ + { + "enable_fp8": "false", + "smem_abbrev": "f16" + }, + { + "enable_fp8": "true", + "smem_abbrev": "e5m2" + } ] }, { @@ -219,6 +235,16 @@ "dataset_block_dim": "1024", "team_size": "32" } + ], + "_smem": [ + { + "enable_fp8": "false", + "smem_abbrev": "f16" + }, + { + "enable_fp8": "true", + "smem_abbrev": "e5m2" + } ] } ] diff --git a/cpp/src/neighbors/detail/cagra/packed_type.hpp b/cpp/src/neighbors/detail/cagra/packed_type.hpp new file mode 100644 index 0000000000..f52edc126b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/packed_type.hpp @@ -0,0 +1,49 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once +#include +#include + +#include +#include + +namespace cuvs::neighbors::cagra::detail::device { +template +struct uintN_t {}; +template <> +struct uintN_t<32> { + using type = uint32_t; +}; +template <> +struct uintN_t<64> { + using type = uint64_t; +}; + +template +struct fp8xN {}; + +template +struct fp8xN { + using uint_t = typename uintN_t<8 * NumPacked>::type; + using unit_t = __nv_fp8_e5m2; + using x2_t = __nv_fp8x2_storage_t; + static constexpr uint32_t num_elements = NumPacked; + + union { + unit_t x1[num_elements]; + x2_t x2[num_elements / 2]; + uint_t u; + } data; + + HDI fp8xN() { data.u = 0; } + + HDI uint_t& as_uint() { return data.u; } + HDI uint_t as_uint() const { return data.u; } + HDI half2 as_half2(const uint32_t i) const + { + return __nv_cvt_fp8x2_to_halfraw2(data.x2[i], __NV_E5M2); + } +}; +} // namespace cuvs::neighbors::cagra::detail::device From c323fa17eb8c603bb45a47e7c36c0fe3a6cc9d32 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Thu, 4 Jun 2026 18:40:30 +0900 Subject: [PATCH 106/119] Update EnableFP8 --- cpp/CMakeLists.txt | 14 ++++---- .../detail/jit_lto/cagra/cagra_fragments.hpp | 8 +++-- cpp/include/cuvs/neighbors/cagra.hpp | 6 ++++ .../detail/cagra/compute_distance.hpp | 17 ++++----- .../cagra/compute_distance_vpq-impl.cuh | 35 ++++++++++--------- .../detail/cagra/compute_distance_vpq.hpp | 12 ++++--- .../cagra/compute_distance_vpq_inst.cu.in | 4 +-- .../cagra/compute_distance_vpq_matrix.json | 12 +++++-- .../cagra_jit_launcher_factory.hpp | 12 +++---- .../jit_lto_kernels/cagra_planner_base.hpp | 32 ++++++++++------- .../jit_lto_kernels/compute_distance_impl.cuh | 8 ++--- .../compute_distance_kernel.cu.in | 6 ++-- .../compute_distance_matrix.json | 10 +++--- .../jit_lto_kernels/setup_workspace_impl.cuh | 8 ++--- .../setup_workspace_kernel.cu.in | 4 +-- .../setup_workspace_matrix.json | 10 +++--- cpp/tests/neighbors/ann_cagra.cuh | 22 +++++++++++- 17 files changed, 135 
insertions(+), 85 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7f9f88695c..a49df49812 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -268,12 +268,12 @@ if(NOT BUILD_CPU_ONLY) INPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in" OUTPUT_FILE_FORMAT - "${CMAKE_CURRENT_BINARY_DIR}/src/neighbors/detail/cagra/compute_distance_vpq_inst_data_@data_abbrev@_index_@index_abbrev@_distance_@distance_abbrev@_codebook_@codebook_abbrev@_metric_@metric@_team_@team_size@_dim_@dim@_pq_bits_@pq_bits@_pq_len_@pq_len@.cu" + "${CMAKE_CURRENT_BINARY_DIR}/src/neighbors/detail/cagra/compute_distance_vpq_inst_data_@data_abbrev@_index_@index_abbrev@_distance_@distance_abbrev@_codebook_@codebook_abbrev@_metric_@metric@_team_@team_size@_dim_@dim@_pq_bits_@pq_bits@_pq_len_@pq_len@_smem_@smem_abbrev@.cu" ) generate_string_matrix( cagra_compute_distance_vpq_selector_template_params ITEM_FORMAT - "\nvpq_descriptor_spec" + "\nvpq_descriptor_spec" GLUE "," MATRIX_JSON_FILE @@ -282,7 +282,7 @@ if(NOT BUILD_CPU_ONLY) generate_string_matrix( cagra_compute_distance_vpq_template_inst ITEM_FORMAT - "extern template struct vpq_descriptor_spec@semicolon@" + "extern template struct vpq_descriptor_spec@semicolon@" GLUE "\n" MATRIX_JSON_FILE @@ -688,13 +688,13 @@ if(NOT BUILD_CPU_ONLY) generate_jit_lto_kernels( jit_lto_files NAME_FORMAT - "cagra_setup_workspace@pq_prefix@_team_size_@team_size@_dataset_block_dim_@dataset_block_dim@_@pq_bits@pq_@pq_len@subd_data_@data_abbrev@_query_@query_abbrev@" + "cagra_setup_workspace@pq_prefix@_team_size_@team_size@_dataset_block_dim_@dataset_block_dim@_@pq_bits@pq_@pq_len@subd_data_@data_abbrev@_query_@query_abbrev@_smem_@smem_abbrev@" MATRIX_JSON_FILE "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json" KERNEL_INPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in" FRAGMENT_TAG_FORMAT 
- "${cagra_ns}::fragment_tag_setup_workspace<${neighbors_ns}::tag_@data_abbrev@, ${neighbors_ns}::tag_index_@index_abbrev@, ${cagra_ns}::tag_dist_@distance_abbrev@, ${neighbors_ns}::tag_@query_abbrev@, ${cagra_ns}::tag_codebook_@codebook_abbrev@, @team_size@, @dataset_block_dim@, @pq_bits@, @pq_len@>" + "${cagra_ns}::fragment_tag_setup_workspace<${neighbors_ns}::tag_@data_abbrev@, ${neighbors_ns}::tag_index_@index_abbrev@, ${cagra_ns}::tag_dist_@distance_abbrev@, ${neighbors_ns}::tag_@query_abbrev@, ${cagra_ns}::tag_codebook_@codebook_abbrev@, @team_size@, @dataset_block_dim@, @pq_bits@, @pq_len@, ${cagra_ns}::tag_smem_@smem_abbrev@>" FRAGMENT_TAG_HEADER_FILES "" "" OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/cagra/setup_workspace" @@ -704,13 +704,13 @@ if(NOT BUILD_CPU_ONLY) generate_jit_lto_kernels( jit_lto_files NAME_FORMAT - "cagra_compute_distance@pq_prefix@_team_size_@team_size@_dataset_block_dim_@dataset_block_dim@_@pq_bits@pq_@pq_len@subd_data_@data_abbrev@_query_@query_abbrev@" + "cagra_compute_distance@pq_prefix@_team_size_@team_size@_dataset_block_dim_@dataset_block_dim@_@pq_bits@pq_@pq_len@subd_data_@data_abbrev@_query_@query_abbrev@_smem_@smem_abbrev@" MATRIX_JSON_FILE "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json" KERNEL_INPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in" FRAGMENT_TAG_FORMAT - "${cagra_ns}::fragment_tag_compute_distance<${neighbors_ns}::tag_@data_abbrev@, ${neighbors_ns}::tag_index_@index_abbrev@, ${cagra_ns}::tag_dist_@distance_abbrev@, ${neighbors_ns}::tag_@query_abbrev@, ${cagra_ns}::tag_codebook_@codebook_abbrev@, @team_size@, @dataset_block_dim@, @pq_bits@, @pq_len@>" + "${cagra_ns}::fragment_tag_compute_distance<${neighbors_ns}::tag_@data_abbrev@, ${neighbors_ns}::tag_index_@index_abbrev@, ${cagra_ns}::tag_dist_@distance_abbrev@, ${neighbors_ns}::tag_@query_abbrev@, 
${cagra_ns}::tag_codebook_@codebook_abbrev@, @team_size@, @dataset_block_dim@, @pq_bits@, @pq_len@, ${cagra_ns}::tag_smem_@smem_abbrev@>" FRAGMENT_TAG_HEADER_FILES "" "" OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/cagra/compute_distance" diff --git a/cpp/include/cuvs/detail/jit_lto/cagra/cagra_fragments.hpp b/cpp/include/cuvs/detail/jit_lto/cagra/cagra_fragments.hpp index 0b42d79379..67cdd38783 100644 --- a/cpp/include/cuvs/detail/jit_lto/cagra/cagra_fragments.hpp +++ b/cpp/include/cuvs/detail/jit_lto/cagra/cagra_fragments.hpp @@ -16,6 +16,8 @@ struct tag_metric_cosine {}; struct tag_metric_hamming {}; struct tag_codebook_none {}; struct tag_codebook_half {}; +struct tag_smem_f16 {}; +struct tag_smem_e5m2 {}; struct tag_metric_l1 {}; struct tag_norm_noop {}; struct tag_norm_cosine {}; @@ -33,7 +35,8 @@ template + uint32_t PqLen, + typename SmemTag> struct fragment_tag_setup_workspace {}; template + uint32_t PqLen, + typename SmemTag> struct fragment_tag_compute_distance {}; template diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index 8edbcab8fa..d2a55ce406 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -276,6 +276,8 @@ enum class search_algo { enum class hash_mode { HASH = 0, SMALL = 1, AUTO = 100 }; +enum class internal_dtype { F16 = 0, E5M2 = 1, AUTO = 100 }; + struct search_params : cuvs::neighbors::search_params { /** Maximum number of queries to search at the same time (batch size). Auto select when 0.*/ size_t max_queries = 0; @@ -349,6 +351,10 @@ struct search_params : cuvs::neighbors::search_params { * negative, in which case the filtering rate is automatically calculated. */ float filtering_rate = -1.0; + + /** Data type of the query vector and codebook table on shared memory. Currently, only VPQ + * supports FP8. 
**/ + internal_dtype smem_dtype = internal_dtype::AUTO; }; /** diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index 7f921ce948..45997a62a3 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -202,12 +202,12 @@ struct dataset_descriptor_host { uint32_t team_size = 0; // JIT LTO metadata - stored when descriptor is created - cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded; - uint32_t dataset_block_dim = 0; - bool is_vpq = false; - uint32_t pq_bits = 0; - uint32_t pq_len = 0; - bool enable_fp8 = false; + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded; + uint32_t dataset_block_dim = 0; + bool is_vpq = false; + uint32_t pq_bits = 0; + uint32_t pq_len = 0; + cuvs::neighbors::cagra::internal_dtype smem_dtype = cuvs::neighbors::cagra::internal_dtype::F16; // Codebook type is determined by DataT for VPQ (always half for now) struct state { @@ -260,7 +260,8 @@ struct dataset_descriptor_host { bool is_vpq_val = false, uint32_t pq_bits_val = 0, uint32_t pq_len_val = 0, - bool enable_fp8_val = false) + cuvs::neighbors::cagra::internal_dtype smem_dtype_val = + cuvs::neighbors::cagra::internal_dtype::F16) : value_{std::make_shared(init, sizeof(DescriptorImpl))}, smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()}, team_size{dd_host.team_size()}, @@ -269,7 +270,7 @@ struct dataset_descriptor_host { is_vpq{is_vpq_val}, pq_bits{pq_bits_val}, pq_len{pq_len_val}, - enable_fp8{enable_fp8_val} + smem_dtype{smem_dtype_val} { } diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh index ea994c450a..f73f901f95 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh @@ -15,21 +15,24 @@ namespace 
cuvs::neighbors::cagra::detail { -template +template struct vpq_smem_value_config; -template -struct vpq_smem_value_config> { +template +struct vpq_smem_value_config< + PQ_LEN, + SmemDType, + std::enable_if_t> { using smem_val_pack_t = half2; using smem_val_t = half; using smem_val_pack_uint_t = uint32_t; static constexpr uint32_t num_packed_elements = 2; }; -template +template struct vpq_smem_value_config> { + cuvs::neighbors::cagra::internal_dtype::E5M2, + std::enable_if_t> { using smem_val_pack_t = device::fp8xN; using smem_val_t = typename smem_val_pack_t::unit_t; using smem_val_pack_uint_t = typename smem_val_pack_t::uint_t; @@ -45,7 +48,7 @@ template + cuvs::neighbors::cagra::internal_dtype SmemDType> struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t { using base_type = dataset_descriptor_base_t; using CODE_BOOK_T = CodebookT; @@ -61,7 +64,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t, "Only CODE_BOOK_T = `half` is supported now"); @@ -104,7 +107,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t; + using smem_val_config = vpq_smem_value_config; static constexpr std::uint32_t kSMemCodeBookSizeInBytes = (1 << PQ_BITS) * PQ_LEN * utils::size_of() / @@ -152,7 +155,7 @@ template + cuvs::neighbors::cagra::internal_dtype SmemDType> RAFT_KERNEL __launch_bounds__(1, 1) vpq_dataset_descriptor_init_kernel(dataset_descriptor_base_t* out, const std::uint8_t* encoded_dataset_ptr, @@ -171,7 +174,7 @@ RAFT_KERNEL __launch_bounds__(1, 1) IndexT, DistanceT, half, - EnableFP8>; + SmemDType>; new (out) desc_type( encoded_dataset_ptr, encoded_dataset_dim, vq_code_book_ptr, pq_code_book_ptr, size, dim); } @@ -185,7 +188,7 @@ template + cuvs::neighbors::cagra::internal_dtype SmemDType> dataset_descriptor_host vpq_descriptor_spec::init_(const cagra::search_params& params, + SmemDType>::init_(const cagra::search_params& params, const std::uint8_t* encoded_dataset_ptr, uint32_t encoded_dataset_dim, const 
CodebookT* vq_code_book_ptr, @@ -213,7 +216,7 @@ vpq_descriptor_spec; + SmemDType>; return host_type{ desc_type{ @@ -229,7 +232,7 @@ vpq_descriptor_spec<<<1, 1, 0, stream>>>(dev_ptr, + SmemDType><<<1, 1, 0, stream>>>(dev_ptr, encoded_dataset_ptr, encoded_dataset_dim, vq_code_book_ptr, @@ -243,7 +246,7 @@ vpq_descriptor_spec + cuvs::neighbors::cagra::internal_dtype SmemDType> struct vpq_descriptor_spec : public instance_spec { using base_type = instance_spec; using typename base_type::data_type; @@ -66,9 +66,11 @@ struct vpq_descriptor_spec : public instance_spec { cuvs::distance::DistanceType metric) -> double { const auto fp8_natively_supported = raft::getComputeCapability().first >= 9; - const auto use_fp8 = - params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::E5M2 || - (params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::AUTO && fp8_natively_supported); + const auto selected_smem_dtype = + params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::AUTO + ? (fp8_natively_supported ? cuvs::neighbors::cagra::internal_dtype::E5M2 + : cuvs::neighbors::cagra::internal_dtype::F16) + : params.smem_dtype; // If explicit team_size is specified and doesn't match the instance, discard it if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; } @@ -76,7 +78,7 @@ struct vpq_descriptor_spec : public instance_spec { // Match codebook params if (dataset.pq_bits() != PqBits) { return -1.0; } if (dataset.pq_len() != PqLen) { return -1.0; } - if (use_fp8 != EnableFP8) { return -1.0; } + if (selected_smem_dtype != SmemDType) { return -1.0; } // Keep auto-selection on the tuned VPQ diagonal while allowing explicit team_size requests to // use the expanded team_size / dataset_block_dim grid. constexpr std::uint32_t auto_dataset_block_dim_per_team = PqLen == 8 ? 
32 : 16; diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in index 676f25c9fd..25d4732a34 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_inst.cu.in @@ -13,7 +13,7 @@ constexpr uint32_t team_size = @team_size@; constexpr uint32_t dim = @dim@; constexpr uint32_t pq_bits = @pq_bits@; constexpr uint32_t pq_len = @pq_len@; -constexpr bool enable_fp8 = @enable_fp8@; +constexpr auto smem_dtype = @smem_dtype@; using codebook_t = @codebook_type@; using data_t = @data_type@; using index_t = @index_type@; @@ -32,6 +32,6 @@ template struct vpq_descriptor_spec; + smem_dtype>; } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json index 7dac07c2a4..1241b2346c 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_matrix.json @@ -97,8 +97,14 @@ "metric": [ "L2Expanded" ], - "enable_fp8": [ - "true", - "false" + "_smem": [ + { + "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::E5M2", + "smem_abbrev": "e5m2" + }, + { + "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::F16", + "smem_abbrev": "f16" + } ] } diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_jit_launcher_factory.hpp b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_jit_launcher_factory.hpp index 60e965796c..973a5a1176 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_jit_launcher_factory.hpp +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_jit_launcher_factory.hpp @@ -60,11 +60,11 @@ std::shared_ptr build_single_cta_launcher( planner.add_setup_workspace_device_function(dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len, - dataset_desc.enable_fp8); + 
dataset_desc.smem_dtype); planner.add_compute_distance_device_function(dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len, - dataset_desc.enable_fp8); + dataset_desc.smem_dtype); } else { planner.add_setup_workspace_device_function(dataset_desc.team_size, dataset_desc.dataset_block_dim); @@ -109,11 +109,11 @@ std::shared_ptr build_multi_cta_launcher( planner.add_setup_workspace_device_function(dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len, - dataset_desc.enable_fp8); + dataset_desc.smem_dtype); planner.add_compute_distance_device_function(dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len, - dataset_desc.enable_fp8); + dataset_desc.smem_dtype); } else { planner.add_setup_workspace_device_function(dataset_desc.team_size, dataset_desc.dataset_block_dim); @@ -158,11 +158,11 @@ std::shared_ptr build_multi_kernel_launcher( planner.add_setup_workspace_device_function(dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len, - dataset_desc.enable_fp8); + dataset_desc.smem_dtype); planner.add_compute_distance_device_function(dataset_desc.team_size, dataset_desc.dataset_block_dim, dataset_desc.pq_len, - dataset_desc.enable_fp8); + dataset_desc.smem_dtype); } else { planner.add_setup_workspace_device_function(dataset_desc.team_size, dataset_desc.dataset_block_dim); diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp index 14ef271c2a..cce18a0216 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp @@ -65,7 +65,7 @@ struct CagraPlannerBase : AlgorithmPlanner { void add_setup_workspace_device_function(uint32_t team_size, uint32_t dataset_block_dim, uint32_t pq_len, - bool enable_fp8) + cuvs::neighbors::cagra::internal_dtype smem_dtype) { if (pq_len != 2 && pq_len != 4 && 
pq_len != 8) { RAFT_FAIL("CAGRA JIT VPQ setup_workspace expects pq_len in {2,4,8} (matrix uses pq_bits=8)"); @@ -92,11 +92,7 @@ struct CagraPlannerBase : AlgorithmPlanner { add.template operator()(); }); }; - if (enable_fp8) { - dispatch_smem.template operator()(); - } else { - dispatch_smem.template operator()(); - } + dispatch_cagra_smem_dtype(smem_dtype, dispatch_smem); } /// Registers dist_op + normalization + `compute_distance` for standard layout. @@ -132,7 +128,7 @@ struct CagraPlannerBase : AlgorithmPlanner { void add_compute_distance_device_function(uint32_t team_size, uint32_t dataset_block_dim, uint32_t pq_len, - bool enable_fp8) + cuvs::neighbors::cagra::internal_dtype smem_dtype) { if (pq_len != 2 && pq_len != 4 && pq_len != 8) { RAFT_FAIL("CAGRA JIT VPQ compute_distance expects pq_len in {2,4,8} (matrix uses pq_bits=8)"); @@ -159,14 +155,26 @@ struct CagraPlannerBase : AlgorithmPlanner { add.template operator()(); }); }; - if (enable_fp8) { - dispatch_smem.template operator()(); - } else { - dispatch_smem.template operator()(); - } + dispatch_cagra_smem_dtype(smem_dtype, dispatch_smem); } private: + template + static void dispatch_cagra_smem_dtype(cuvs::neighbors::cagra::internal_dtype smem_dtype, + Lambda&& l) + { + switch (smem_dtype) { + case cuvs::neighbors::cagra::internal_dtype::F16: + std::forward(l).template operator()(); + return; + case cuvs::neighbors::cagra::internal_dtype::E5M2: + std::forward(l).template operator()(); + return; + default: break; + } + RAFT_FAIL("Unsupported CAGRA JIT smem_dtype: %u", static_cast(smem_dtype)); + } + template static void dispatch_cagra_standard_team_dim(uint32_t team_size, uint32_t dataset_block_dim, diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh index 59b43bea64..ea817110e7 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh +++ 
b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh @@ -100,10 +100,10 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker_impl( constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim; constexpr auto PQ_BITS = DescriptorT::kPqBits; constexpr auto PQ_LEN = DescriptorT::kPqLen; - constexpr auto EnableFP8 = DescriptorT::kEnableFP8; + constexpr auto SmemDType = DescriptorT::kSmemDType; using PQ_CODEBOOK_LOAD_T = uint32_t; - using smem_val_config = vpq_smem_value_config; + using smem_val_config = vpq_smem_value_config; using smem_val_pack_t = typename smem_val_config::smem_val_pack_t; using smem_val_pack_uint_t = typename smem_val_config::smem_val_pack_uint_t; constexpr uint32_t num_packed_elements = smem_val_config::num_packed_elements; @@ -271,7 +271,7 @@ template + cuvs::neighbors::cagra::internal_dtype SmemDType> __device__ DistanceT compute_distance_impl( const typename dataset_descriptor_base_t::args_t args, IndexT dataset_index) @@ -291,7 +291,7 @@ __device__ DistanceT compute_distance_impl( IndexT, DistanceT, QueryT, - EnableFP8>; + SmemDType>; return compute_distance_vpq_impl(args, dataset_index); } else { static_assert(sizeof(TeamSize) == 0, diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in index 130cbf502f..1856781391 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_kernel.cu.in @@ -11,7 +11,7 @@ constexpr uint32_t k_team_size = @team_size@u; constexpr uint32_t k_dataset_block_dim = @dataset_block_dim@u; constexpr uint32_t k_pq_bits = @pq_bits@u; constexpr uint32_t k_pq_len = @pq_len@u; -constexpr bool k_enable_fp8 = @enable_fp8@; +constexpr auto k_smem_dtype = @smem_dtype@; using data_t = @data_type@; using index_t = @index_type@; @@ -40,7 +40,7 @@ __device__ distance_t 
compute_distance(const args_t index_t, distance_t, query_t, - k_enable_fp8>(args, dataset_index) + k_smem_dtype>(args, dataset_index) : distance_t{}; return device::team_sum(per_thread, team_size_bits); } @@ -58,7 +58,7 @@ compute_distance_per_thread(const args_t args, inde index_t, distance_t, query_t, - k_enable_fp8>(args, dataset_index); + k_smem_dtype>(args, dataset_index); } } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json index f1ce0daaab..4d260c5507 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_matrix.json @@ -84,7 +84,7 @@ ], "_smem": [ { - "enable_fp8": "false", + "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::F16", "smem_abbrev": "f16" } ] @@ -158,11 +158,11 @@ ], "_smem": [ { - "enable_fp8": "false", + "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::F16", "smem_abbrev": "f16" }, { - "enable_fp8": "true", + "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::E5M2", "smem_abbrev": "e5m2" } ] @@ -238,11 +238,11 @@ ], "_smem": [ { - "enable_fp8": "false", + "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::F16", "smem_abbrev": "f16" }, { - "enable_fp8": "true", + "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::E5M2", "smem_abbrev": "e5m2" } ] diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh index 220c76ac96..494e0973fe 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_impl.cuh @@ -84,8 +84,8 @@ _RAFT_DEVICE __noinline__ auto setup_workspace_vpq_impl( constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim; constexpr auto PQ_BITS = 
DescriptorT::kPqBits; constexpr auto PQ_LEN = DescriptorT::kPqLen; - constexpr auto EnableFP8 = DescriptorT::kEnableFP8; - using smem_val_config = vpq_smem_value_config; + constexpr auto SmemDType = DescriptorT::kSmemDType; + using smem_val_config = vpq_smem_value_config; using smem_val_t = typename smem_val_config::smem_val_t; using smem_val_pack_t = typename smem_val_config::smem_val_pack_t; using smem_val_pack_uint_t = typename smem_val_config::smem_val_pack_uint_t; @@ -186,7 +186,7 @@ template + cuvs::neighbors::cagra::internal_dtype SmemDType> __device__ const dataset_descriptor_base_t* setup_workspace_impl( const dataset_descriptor_base_t* desc_ptr, void* smem, @@ -211,7 +211,7 @@ __device__ const dataset_descriptor_base_t* setup_work IndexT, DistanceT, QueryT, - EnableFP8>; + SmemDType>; const desc_t* desc = static_cast(desc_ptr); const desc_t* result = setup_workspace_vpq_impl(desc, smem, queries, query_id); diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in index 2177212e36..6a54c9f956 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_kernel.cu.in @@ -12,7 +12,7 @@ constexpr uint32_t k_team_size = @team_size@u; constexpr uint32_t k_dataset_block_dim = @dataset_block_dim@u; constexpr uint32_t k_pq_bits = @pq_bits@u; constexpr uint32_t k_pq_len = @pq_len@u; -constexpr bool k_enable_fp8 = @enable_fp8@; +constexpr auto k_smem_dtype = @smem_dtype@; using data_t = @data_type@; using index_t = @index_type@; @@ -41,7 +41,7 @@ setup_workspace( index_t, distance_t, query_t, - k_enable_fp8>(desc, smem, queries, query_id); + k_smem_dtype>(desc, smem, queries, query_id); } } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json 
b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json index 7ee92494e6..567fe3e5a1 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/setup_workspace_matrix.json @@ -84,7 +84,7 @@ ], "_smem": [ { - "enable_fp8": "false", + "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::F16", "smem_abbrev": "f16" } ] @@ -158,11 +158,11 @@ ], "_smem": [ { - "enable_fp8": "false", + "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::F16", "smem_abbrev": "f16" }, { - "enable_fp8": "true", + "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::E5M2", "smem_abbrev": "e5m2" } ] @@ -238,11 +238,11 @@ ], "_smem": [ { - "enable_fp8": "false", + "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::F16", "smem_abbrev": "f16" }, { - "enable_fp8": "true", + "smem_dtype": "cuvs::neighbors::cagra::internal_dtype::E5M2", "smem_abbrev": "e5m2" } ] diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh index 826b8d1a3a..c8322d29fd 100644 --- a/cpp/tests/neighbors/ann_cagra.cuh +++ b/cpp/tests/neighbors/ann_cagra.cuh @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -276,6 +277,7 @@ struct AnnCagraInputs { std::optional non_owning_memory_buffer_flag = std::nullopt; cuvs::neighbors::MergeStrategy merge_strategy = cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL; + cuvs::neighbors::cagra::internal_dtype smem_dtype = cuvs::neighbors::cagra::internal_dtype::AUTO; }; inline ::std::ostream& operator<<(::std::ostream& os, const AnnCagraInputs& p) @@ -299,6 +301,14 @@ inline ::std::ostream& operator<<(::std::ostream& os, const AnnCagraInputs& p) {search_algo::AUTO, "auto"} // }; std::vector build_algo = {"IVF_PQ", "NN_DESCENT", "ITERATIVE_CAGRA_SEARCH", "AUTO"}; + const auto smem_dtype_str = [](cuvs::neighbors::cagra::internal_dtype dtype) { + switch (dtype) { + case cuvs::neighbors::cagra::internal_dtype::F16: return 
"F16"; + case cuvs::neighbors::cagra::internal_dtype::E5M2: return "E5M2"; + case cuvs::neighbors::cagra::internal_dtype::AUTO: return "AUTO"; + } + return "Unknown"; + }; std::vector merge_strategy = {"PHYSICAL", "LOGICAL"}; os << "{n_queries=" << p.n_queries << ", dataset shape=" << p.n_rows << "x" << p.dim << ", k=" << p.k << ", " << algo_name[p.algo] << ", max_queries=" << p.max_queries @@ -312,7 +322,7 @@ inline ::std::ostream& operator<<(::std::ostream& os, const AnnCagraInputs& p) if (p.compression.has_value()) { auto vpq = p.compression.value(); os << ", pq_bits=" << vpq.pq_bits << ", pq_dim=" << vpq.pq_dim - << ", vq_n_centers=" << vpq.vq_n_centers; + << ", vq_n_centers=" << vpq.vq_n_centers << ", smem_dtype=" << smem_dtype_str(p.smem_dtype); } os << '}' << std::endl; return os; @@ -346,6 +356,10 @@ class AnnCagraTest : public ::testing::TestWithParam { if (ps.metric == cuvs::distance::DistanceType::L1 && ps.build_algo != graph_build_algo::ITERATIVE_CAGRA_SEARCH) GTEST_SKIP(); + if (ps.smem_dtype == cuvs::neighbors::cagra::internal_dtype::E5M2 && + raft::getComputeCapability().first < 9) { + GTEST_SKIP() << "CAGRA VPQ E5M2 smem dtype requires native FP8 support on SM90+"; + } if (ps.metric == cuvs::distance::DistanceType::CosineExpanded) { if (ps.compression.has_value()) { GTEST_SKIP(); } if (ps.build_algo == graph_build_algo::ITERATIVE_CAGRA_SEARCH || ps.dim == 1) { @@ -415,6 +429,7 @@ class AnnCagraTest : public ::testing::TestWithParam { search_params.algo = ps.algo; search_params.max_queries = ps.max_queries; search_params.team_size = ps.team_size; + search_params.smem_dtype = ps.smem_dtype; auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.n_rows, ps.dim); @@ -1701,7 +1716,12 @@ inline std::vector generate_inputs() ps.pq_dim = input.dim / pq_len; ps.vq_n_centers = vq_n_centers; input.compression.emplace(ps); + input.smem_dtype = cuvs::neighbors::cagra::internal_dtype::AUTO; inputs.push_back(input); + if (pq_len >= 4 
&& vq_n_centers == 100) { + input.smem_dtype = cuvs::neighbors::cagra::internal_dtype::E5M2; + inputs.push_back(input); + } } } } From a577563fccababf13e5eba40d44240038b9e6d46 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Fri, 5 Jun 2026 00:55:48 +0900 Subject: [PATCH 107/119] Update vpq test --- cpp/tests/neighbors/ann_cagra.cuh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh index c8322d29fd..5c0da34dd0 100644 --- a/cpp/tests/neighbors/ann_cagra.cuh +++ b/cpp/tests/neighbors/ann_cagra.cuh @@ -1711,15 +1711,15 @@ inline std::vector generate_inputs() // without refinement for (uint32_t pq_len : {2, 4, 8}) { for (uint32_t vq_n_centers : {100, 1000}) { - for (auto input : inputs2) { - vpq_params ps{}; - ps.pq_dim = input.dim / pq_len; - ps.vq_n_centers = vq_n_centers; - input.compression.emplace(ps); - input.smem_dtype = cuvs::neighbors::cagra::internal_dtype::AUTO; - inputs.push_back(input); - if (pq_len >= 4 && vq_n_centers == 100) { - input.smem_dtype = cuvs::neighbors::cagra::internal_dtype::E5M2; + for (auto internal_smem_dtype : {cuvs::neighbors::cagra::internal_dtype::E5M2, + cuvs::neighbors::cagra::internal_dtype::F16, + cuvs::neighbors::cagra::internal_dtype::AUTO}) { + for (auto input : inputs2) { + vpq_params ps{}; + ps.pq_dim = input.dim / pq_len; + ps.vq_n_centers = vq_n_centers; + input.compression.emplace(ps); + input.smem_dtype = internal_smem_dtype; inputs.push_back(input); } } From d05f5524eee4868dd62713e53e18e4afeaa0fce3 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Fri, 5 Jun 2026 14:55:22 +0900 Subject: [PATCH 108/119] Remove internal_dtype::AUTO --- cpp/include/cuvs/neighbors/cagra.hpp | 4 ++-- cpp/src/neighbors/detail/cagra/cagra_search.cuh | 7 +++---- .../neighbors/detail/cagra/compute_distance_vpq.hpp | 10 +--------- cpp/tests/neighbors/ann_cagra.cuh | 6 ++---- 4 files changed, 8 insertions(+), 19 deletions(-) diff --git 
a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index d2a55ce406..9a906687e3 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -276,7 +276,7 @@ enum class search_algo { enum class hash_mode { HASH = 0, SMALL = 1, AUTO = 100 }; -enum class internal_dtype { F16 = 0, E5M2 = 1, AUTO = 100 }; +enum class internal_dtype { F16 = 0, E5M2 = 1 }; struct search_params : cuvs::neighbors::search_params { /** Maximum number of queries to search at the same time (batch size). Auto select when 0.*/ @@ -354,7 +354,7 @@ struct search_params : cuvs::neighbors::search_params { /** Data type of the query vector and codebook table on shared memory. Currently, only VPQ * supports FP8. **/ - internal_dtype smem_dtype = internal_dtype::AUTO; + internal_dtype smem_dtype = internal_dtype::F16; }; /** diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index f199cf7882..6a64ad7a85 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -153,10 +153,9 @@ void search_main(raft::resources const& res, // Dispatch search parameters based on the dataset kind. if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset != nullptr) { - if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::AUTO && - params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::F16) { - RAFT_LOG_WARN("In this search mode, smem_dtype supports only AUTO or F16. Set it to AUTO."); - params.smem_dtype = cuvs::neighbors::cagra::internal_dtype::AUTO; + if (params.smem_dtype != cuvs::neighbors::cagra::internal_dtype::F16) { + RAFT_LOG_WARN("In this search mode, smem_dtype supports only F16. 
Set it to F16."); + params.smem_dtype = cuvs::neighbors::cagra::internal_dtype::F16; } // Search using a plain (strided) row-major dataset RAFT_EXPECTS(index.metric() != cuvs::distance::DistanceType::CosineExpanded || diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp index b32a0f17c6..83954491bf 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp @@ -8,7 +8,6 @@ #include "compute_distance.hpp" #include -#include #include @@ -65,20 +64,13 @@ struct vpq_descriptor_spec : public instance_spec { const DatasetT& dataset, cuvs::distance::DistanceType metric) -> double { - const auto fp8_natively_supported = raft::getComputeCapability().first >= 9; - const auto selected_smem_dtype = - params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::AUTO - ? (fp8_natively_supported ? cuvs::neighbors::cagra::internal_dtype::E5M2 - : cuvs::neighbors::cagra::internal_dtype::F16) - : params.smem_dtype; - // If explicit team_size is specified and doesn't match the instance, discard it if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; } if (cuvs::distance::DistanceType::L2Expanded != metric) { return -1.0; } // Match codebook params if (dataset.pq_bits() != PqBits) { return -1.0; } if (dataset.pq_len() != PqLen) { return -1.0; } - if (selected_smem_dtype != SmemDType) { return -1.0; } + if (params.smem_dtype != SmemDType) { return -1.0; } // Keep auto-selection on the tuned VPQ diagonal while allowing explicit team_size requests to // use the expanded team_size / dataset_block_dim grid. constexpr std::uint32_t auto_dataset_block_dim_per_team = PqLen == 8 ? 
32 : 16; diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh index 5c0da34dd0..5f83d2eb8d 100644 --- a/cpp/tests/neighbors/ann_cagra.cuh +++ b/cpp/tests/neighbors/ann_cagra.cuh @@ -277,7 +277,7 @@ struct AnnCagraInputs { std::optional non_owning_memory_buffer_flag = std::nullopt; cuvs::neighbors::MergeStrategy merge_strategy = cuvs::neighbors::MergeStrategy::MERGE_STRATEGY_PHYSICAL; - cuvs::neighbors::cagra::internal_dtype smem_dtype = cuvs::neighbors::cagra::internal_dtype::AUTO; + cuvs::neighbors::cagra::internal_dtype smem_dtype = cuvs::neighbors::cagra::internal_dtype::F16; }; inline ::std::ostream& operator<<(::std::ostream& os, const AnnCagraInputs& p) @@ -305,7 +305,6 @@ inline ::std::ostream& operator<<(::std::ostream& os, const AnnCagraInputs& p) switch (dtype) { case cuvs::neighbors::cagra::internal_dtype::F16: return "F16"; case cuvs::neighbors::cagra::internal_dtype::E5M2: return "E5M2"; - case cuvs::neighbors::cagra::internal_dtype::AUTO: return "AUTO"; } return "Unknown"; }; @@ -1712,8 +1711,7 @@ inline std::vector generate_inputs() for (uint32_t pq_len : {2, 4, 8}) { for (uint32_t vq_n_centers : {100, 1000}) { for (auto internal_smem_dtype : {cuvs::neighbors::cagra::internal_dtype::E5M2, - cuvs::neighbors::cagra::internal_dtype::F16, - cuvs::neighbors::cagra::internal_dtype::AUTO}) { + cuvs::neighbors::cagra::internal_dtype::F16}) { for (auto input : inputs2) { vpq_params ps{}; ps.pq_dim = input.dim / pq_len; From 902073970e238dbede59697b2992bf066a8b75a7 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Fri, 5 Jun 2026 15:15:36 +0900 Subject: [PATCH 109/119] Update fp8xN to used SW emulated FP8 when FP8 is not natively supported --- .../neighbors/detail/cagra/packed_type.hpp | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/packed_type.hpp b/cpp/src/neighbors/detail/cagra/packed_type.hpp index f52edc126b..4e67fd50e6 100644 --- 
a/cpp/src/neighbors/detail/cagra/packed_type.hpp +++ b/cpp/src/neighbors/detail/cagra/packed_type.hpp @@ -3,6 +3,8 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once +#include "../../ivf_pq/ivf_pq_fp_8bit.cuh" + #include #include @@ -26,24 +28,39 @@ struct fp8xN {}; template struct fp8xN { - using uint_t = typename uintN_t<8 * NumPacked>::type; - using unit_t = __nv_fp8_e5m2; - using x2_t = __nv_fp8x2_storage_t; + using uint_t = typename uintN_t<8 * NumPacked>::type; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + using unit_t = __nv_fp8_e5m2; + using x2_t = __nv_fp8x2_storage_t; +#else + using unit_t = cuvs::neighbors::ivf_pq::detail::fp_8bit<5u, true>; +#endif static constexpr uint32_t num_elements = NumPacked; - union { + union storage_t { unit_t x1[num_elements]; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 x2_t x2[num_elements / 2]; +#endif uint_t u; + + HDI storage_t() : u{0} {} } data; - HDI fp8xN() { data.u = 0; } + HDI fp8xN() = default; HDI uint_t& as_uint() { return data.u; } HDI uint_t as_uint() const { return data.u; } HDI half2 as_half2(const uint32_t i) const { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 return __nv_cvt_fp8x2_to_halfraw2(data.x2[i], __NV_E5M2); +#else + half2 r; + r.x = static_cast(data.x1[2 * i]); + r.y = static_cast(data.x1[2 * i + 1]); + return r; +#endif } }; } // namespace cuvs::neighbors::cagra::detail::device From 627ee0dc1026b5d267c665e144e12ebb58eec6e7 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Fri, 5 Jun 2026 15:26:36 +0900 Subject: [PATCH 110/119] Fix VPQ test --- cpp/tests/neighbors/ann_cagra.cuh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh index 5f83d2eb8d..46b427e241 100644 --- a/cpp/tests/neighbors/ann_cagra.cuh +++ b/cpp/tests/neighbors/ann_cagra.cuh @@ -355,10 +355,6 @@ class AnnCagraTest : public ::testing::TestWithParam { if (ps.metric == cuvs::distance::DistanceType::L1 && ps.build_algo != 
graph_build_algo::ITERATIVE_CAGRA_SEARCH) GTEST_SKIP(); - if (ps.smem_dtype == cuvs::neighbors::cagra::internal_dtype::E5M2 && - raft::getComputeCapability().first < 9) { - GTEST_SKIP() << "CAGRA VPQ E5M2 smem dtype requires native FP8 support on SM90+"; - } if (ps.metric == cuvs::distance::DistanceType::CosineExpanded) { if (ps.compression.has_value()) { GTEST_SKIP(); } if (ps.build_algo == graph_build_algo::ITERATIVE_CAGRA_SEARCH || ps.dim == 1) { From e7e4205c21ca87d9ebe83e810a8c001e31f6d26f Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Fri, 5 Jun 2026 15:46:35 +0900 Subject: [PATCH 111/119] Fix compilation error --- cpp/src/neighbors/detail/cagra/packed_type.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/neighbors/detail/cagra/packed_type.hpp b/cpp/src/neighbors/detail/cagra/packed_type.hpp index 4e67fd50e6..cfcc68be09 100644 --- a/cpp/src/neighbors/detail/cagra/packed_type.hpp +++ b/cpp/src/neighbors/detail/cagra/packed_type.hpp @@ -47,7 +47,7 @@ struct fp8xN { HDI storage_t() : u{0} {} } data; - HDI fp8xN() = default; + HDI fp8xN() : data{} {} HDI uint_t& as_uint() { return data.u; } HDI uint_t as_uint() const { return data.u; } From 1032ffb2f4a8ee0cc938c6c11bec5260d758dbaf Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Mon, 8 Jun 2026 13:34:46 +0900 Subject: [PATCH 112/119] Update VPQ test to use VpqMathT --- cpp/tests/neighbors/ann_cagra.cuh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh index 46b427e241..38f77a4346 100644 --- a/cpp/tests/neighbors/ann_cagra.cuh +++ b/cpp/tests/neighbors/ann_cagra.cuh @@ -477,9 +477,11 @@ class AnnCagraTest : public ::testing::TestWithParam { if (ps.compression.has_value()) { auto decoded_dataset = raft::make_device_matrix(handle_, ps.n_rows, ps.dim); - cuvs::neighbors::decode_vpq_dataset( + + using VpqMathT = half; + cuvs::neighbors::decode_vpq_dataset( decoded_dataset.view(), - 
dynamic_cast&>(index.data()), + dynamic_cast&>(index.data()), raft::resource::get_cuda_stream(handle_)); auto indices_out_view = raft::make_device_matrix_view( indices_dev.data(), ps.n_queries, ps.k); From 02e372639a21db8f75f2798067db139631333157 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Mon, 8 Jun 2026 15:14:05 +0900 Subject: [PATCH 113/119] Add pq_bits assert --- cpp/tests/neighbors/vpq_utils.cuh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/tests/neighbors/vpq_utils.cuh b/cpp/tests/neighbors/vpq_utils.cuh index 613b5fe1d5..44b4d188ee 100644 --- a/cpp/tests/neighbors/vpq_utils.cuh +++ b/cpp/tests/neighbors/vpq_utils.cuh @@ -51,6 +51,9 @@ void decode_vpq_dataset(raft::device_matrix_view decoded_datase { const auto dataset_size = decoded_dataset.extent(0); RAFT_EXPECTS(vpq_dataset.data.extent(0) == dataset_size, "Dataset sizes mismatch"); + RAFT_EXPECTS(vpq_dataset.pq_bits() == 8, + "decode_vpq_dataset currently only supports pq_bits == 8 (got %u)", + vpq_dataset.pq_bits()); constexpr uint32_t block_size = 256; constexpr uint32_t warp_size = 32; From c608bd16a0beb25683b428d05cecd6b9dda027d7 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 10 Jun 2026 00:18:42 +0900 Subject: [PATCH 114/119] Remove SW emulated FP8 --- .../neighbors/detail/cagra/cagra_search.cuh | 6 +++++ .../detail/cagra/compute_distance_vpq.hpp | 13 ++++++++- .../neighbors/detail/cagra/packed_type.hpp | 27 ++++--------------- cpp/tests/neighbors/ann_cagra.cuh | 1 - 4 files changed, 23 insertions(+), 24 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index 6a64ad7a85..a5925b16d2 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -184,6 +184,12 @@ void search_main(raft::resources const& res, RAFT_FAIL("FP32 VPQ dataset support is coming soon"); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); vpq_dset != nullptr) { + if (params.smem_dtype 
== cuvs::neighbors::cagra::internal_dtype::E5M2 && + raft::getComputeCapability().first < 9) { + RAFT_LOG_WARN( + "CAGRA VPQ E5M2 smem_dtype requires native FP8 support on SM90+. Falling back to F16."); + params.smem_dtype = cuvs::neighbors::cagra::internal_dtype::F16; + } auto desc = dataset_descriptor_init_with_cache( res, params, *vpq_dset, index.metric(), nullptr); search_main_core( diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp index 83954491bf..6781eb6abc 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp @@ -8,11 +8,22 @@ #include "compute_distance.hpp" #include +#include #include namespace cuvs::neighbors::cagra::detail { +inline auto select_supported_vpq_smem_dtype(const cagra::search_params& params) + -> cuvs::neighbors::cagra::internal_dtype +{ + if (params.smem_dtype == cuvs::neighbors::cagra::internal_dtype::E5M2 && + raft::getComputeCapability().first < 9) { + return cuvs::neighbors::cagra::internal_dtype::F16; + } + return params.smem_dtype; +} + template { // Match codebook params if (dataset.pq_bits() != PqBits) { return -1.0; } if (dataset.pq_len() != PqLen) { return -1.0; } - if (params.smem_dtype != SmemDType) { return -1.0; } + if (select_supported_vpq_smem_dtype(params) != SmemDType) { return -1.0; } // Keep auto-selection on the tuned VPQ diagonal while allowing explicit team_size requests to // use the expanded team_size / dataset_block_dim grid. constexpr std::uint32_t auto_dataset_block_dim_per_team = PqLen == 8 ? 
32 : 16; diff --git a/cpp/src/neighbors/detail/cagra/packed_type.hpp b/cpp/src/neighbors/detail/cagra/packed_type.hpp index cfcc68be09..f52edc126b 100644 --- a/cpp/src/neighbors/detail/cagra/packed_type.hpp +++ b/cpp/src/neighbors/detail/cagra/packed_type.hpp @@ -3,8 +3,6 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include "../../ivf_pq/ivf_pq_fp_8bit.cuh" - #include #include @@ -28,39 +26,24 @@ struct fp8xN {}; template struct fp8xN { - using uint_t = typename uintN_t<8 * NumPacked>::type; -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 - using unit_t = __nv_fp8_e5m2; - using x2_t = __nv_fp8x2_storage_t; -#else - using unit_t = cuvs::neighbors::ivf_pq::detail::fp_8bit<5u, true>; -#endif + using uint_t = typename uintN_t<8 * NumPacked>::type; + using unit_t = __nv_fp8_e5m2; + using x2_t = __nv_fp8x2_storage_t; static constexpr uint32_t num_elements = NumPacked; - union storage_t { + union { unit_t x1[num_elements]; -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 x2_t x2[num_elements / 2]; -#endif uint_t u; - - HDI storage_t() : u{0} {} } data; - HDI fp8xN() : data{} {} + HDI fp8xN() { data.u = 0; } HDI uint_t& as_uint() { return data.u; } HDI uint_t as_uint() const { return data.u; } HDI half2 as_half2(const uint32_t i) const { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 return __nv_cvt_fp8x2_to_halfraw2(data.x2[i], __NV_E5M2); -#else - half2 r; - r.x = static_cast(data.x1[2 * i]); - r.y = static_cast(data.x1[2 * i + 1]); - return r; -#endif } }; } // namespace cuvs::neighbors::cagra::detail::device diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh index 38f77a4346..93e933cc94 100644 --- a/cpp/tests/neighbors/ann_cagra.cuh +++ b/cpp/tests/neighbors/ann_cagra.cuh @@ -27,7 +27,6 @@ #include #include #include -#include #include #include From f706baae2cb9e559460e531e84ddff17b0bbbff4 Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 10 Jun 2026 16:57:58 +0900 Subject: [PATCH 115/119] Update dispatch funcs 
--- .../jit_lto_kernels/cagra_planner_base.hpp | 204 +++++++----------- 1 file changed, 81 insertions(+), 123 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp index cce18a0216..0666ef815a 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp @@ -175,120 +175,6 @@ struct CagraPlannerBase : AlgorithmPlanner { RAFT_FAIL("Unsupported CAGRA JIT smem_dtype: %u", static_cast(smem_dtype)); } - template - static void dispatch_cagra_standard_team_dim(uint32_t team_size, - uint32_t dataset_block_dim, - Lambda&& l) - { - switch (team_size) { - case 8: - switch (dataset_block_dim) { - case 128: std::forward(l).template operator()<8u, 128u>(); return; - case 256: std::forward(l).template operator()<8u, 256u>(); return; - case 512: std::forward(l).template operator()<8u, 512u>(); return; - default: break; - } - break; - case 16: - switch (dataset_block_dim) { - case 128: std::forward(l).template operator()<16u, 128u>(); return; - case 256: std::forward(l).template operator()<16u, 256u>(); return; - case 512: std::forward(l).template operator()<16u, 512u>(); return; - default: break; - } - break; - case 32: - switch (dataset_block_dim) { - case 128: std::forward(l).template operator()<32u, 128u>(); return; - case 256: std::forward(l).template operator()<32u, 256u>(); return; - case 512: std::forward(l).template operator()<32u, 512u>(); return; - default: break; - } - break; - default: break; - } - RAFT_FAIL("Unsupported standard team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u", - static_cast(team_size), - static_cast(dataset_block_dim)); - } - - template - static void dispatch_cagra_vpq_pq2_4_team_dim(uint32_t team_size, - uint32_t dataset_block_dim, - Lambda&& l) - { - switch (team_size) { - case 8: - switch (dataset_block_dim) { - case 128: 
std::forward(l).template operator()<8u, 128u, 8u, PqLenV>(); return; - case 256: std::forward(l).template operator()<8u, 256u, 8u, PqLenV>(); return; - case 512: std::forward(l).template operator()<8u, 512u, 8u, PqLenV>(); return; - default: break; - } - break; - case 16: - switch (dataset_block_dim) { - case 128: std::forward(l).template operator()<16u, 128u, 8u, PqLenV>(); return; - case 256: std::forward(l).template operator()<16u, 256u, 8u, PqLenV>(); return; - case 512: std::forward(l).template operator()<16u, 512u, 8u, PqLenV>(); return; - default: break; - } - break; - case 32: - switch (dataset_block_dim) { - case 128: std::forward(l).template operator()<32u, 128u, 8u, PqLenV>(); return; - case 256: std::forward(l).template operator()<32u, 256u, 8u, PqLenV>(); return; - case 512: std::forward(l).template operator()<32u, 512u, 8u, PqLenV>(); return; - default: break; - } - break; - default: break; - } - RAFT_FAIL( - "Unsupported VPQ pq_len=%u team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u", - static_cast(PqLenV), - static_cast(team_size), - static_cast(dataset_block_dim)); - } - - template - static void dispatch_cagra_vpq_pq8_team_dim(uint32_t team_size, - uint32_t dataset_block_dim, - Lambda&& l) - { - switch (team_size) { - case 4: - switch (dataset_block_dim) { - case 128: std::forward(l).template operator()<4u, 128u, 8u, 8u>(); return; - default: break; - } - break; - case 8: - switch (dataset_block_dim) { - case 256: std::forward(l).template operator()<8u, 256u, 8u, 8u>(); return; - default: break; - } - break; - case 16: - switch (dataset_block_dim) { - case 512: std::forward(l).template operator()<16u, 512u, 8u, 8u>(); return; - default: break; - } - break; - case 32: - switch (dataset_block_dim) { - case 1024: std::forward(l).template operator()<32u, 1024u, 8u, 8u>(); return; - default: break; - } - break; - default: break; - } - RAFT_FAIL( - "Unsupported VPQ pq_len=8 team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u", - 
static_cast(team_size), - static_cast(dataset_block_dim)); - } - template static void dispatch_cagra_vpq_team_dim(uint32_t team_size, uint32_t dataset_block_dim, @@ -384,15 +270,11 @@ struct CagraPlannerBase : AlgorithmPlanner { // template parameters; CAGRA reads team_size / dataset_block_dim from the host descriptor at // planning time. template - static void dispatch_cagra_team_dim(uint32_t team_size, uint32_t dataset_block_dim, Lambda&& l) + static void dispatch_cagra_standard_team_dim(uint32_t team_size, + uint32_t dataset_block_dim, + Lambda&& l) { switch (team_size) { - case 4: - switch (dataset_block_dim) { - case 128: std::forward(l).template operator()<4u, 128u>(); return; - default: break; - } - break; case 8: switch (dataset_block_dim) { case 128: std::forward(l).template operator()<8u, 128u>(); return; @@ -414,17 +296,93 @@ struct CagraPlannerBase : AlgorithmPlanner { case 128: std::forward(l).template operator()<32u, 128u>(); return; case 256: std::forward(l).template operator()<32u, 256u>(); return; case 512: std::forward(l).template operator()<32u, 512u>(); return; - case 1024: std::forward(l).template operator()<32u, 1024u>(); return; default: break; } break; default: break; } - RAFT_FAIL("Unsupported team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u", + RAFT_FAIL("Unsupported standard team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u", static_cast(team_size), static_cast(dataset_block_dim)); } + template + static void dispatch_cagra_vpq_pq2_4_team_dim(uint32_t team_size, + uint32_t dataset_block_dim, + Lambda&& l) + { + switch (team_size) { + case 8: + switch (dataset_block_dim) { + case 128: std::forward(l).template operator()<8u, 128u, 8u, PqLenV>(); return; + case 256: std::forward(l).template operator()<8u, 256u, 8u, PqLenV>(); return; + case 512: std::forward(l).template operator()<8u, 512u, 8u, PqLenV>(); return; + default: break; + } + break; + case 16: + switch (dataset_block_dim) { + case 128: std::forward(l).template 
operator()<16u, 128u, 8u, PqLenV>(); return; + case 256: std::forward(l).template operator()<16u, 256u, 8u, PqLenV>(); return; + case 512: std::forward(l).template operator()<16u, 512u, 8u, PqLenV>(); return; + default: break; + } + break; + case 32: + switch (dataset_block_dim) { + case 128: std::forward(l).template operator()<32u, 128u, 8u, PqLenV>(); return; + case 256: std::forward(l).template operator()<32u, 256u, 8u, PqLenV>(); return; + case 512: std::forward(l).template operator()<32u, 512u, 8u, PqLenV>(); return; + default: break; + } + break; + default: break; + } + RAFT_FAIL( + "Unsupported VPQ pq_len=%u team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u", + static_cast(PqLenV), + static_cast(team_size), + static_cast(dataset_block_dim)); + } + + template + static void dispatch_cagra_vpq_pq8_team_dim(uint32_t team_size, + uint32_t dataset_block_dim, + Lambda&& l) + { + switch (team_size) { + case 4: + switch (dataset_block_dim) { + case 128: std::forward(l).template operator()<4u, 128u, 8u, 8u>(); return; + default: break; + } + break; + case 8: + switch (dataset_block_dim) { + case 256: std::forward(l).template operator()<8u, 256u, 8u, 8u>(); return; + default: break; + } + break; + case 16: + switch (dataset_block_dim) { + case 512: std::forward(l).template operator()<16u, 512u, 8u, 8u>(); return; + default: break; + } + break; + case 32: + switch (dataset_block_dim) { + case 1024: std::forward(l).template operator()<32u, 1024u, 8u, 8u>(); return; + default: break; + } + break; + default: break; + } + RAFT_FAIL( + "Unsupported VPQ pq_len=8 team_size / dataset_block_dim for CAGRA JIT: team=%u dim=%u", + static_cast(team_size), + static_cast(dataset_block_dim)); + } + void add_sample_filter_device_function() { if constexpr (!std::is_same_v) { From 0eef38fe66032618b645f452373c1c426eabcdbb Mon Sep 17 00:00:00 2001 From: enp1s0 Date: Wed, 10 Jun 2026 16:58:20 +0900 Subject: [PATCH 116/119] Fix ldg_cg use --- 
.../detail/cagra/jit_lto_kernels/compute_distance_impl.cuh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh index ea817110e7..a6dd0495fd 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/compute_distance_impl.cuh @@ -127,12 +127,7 @@ _RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker_impl( for (std::uint32_t e = 0; e < nelem; e++) { const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen; if (k >= n_subspace) break; - if constexpr (std::is_same_v) { - device::ldg_cg(pq_codes[e], - reinterpret_cast(dataset_ptr + 4 + k)); - } else { - pq_codes[e] = *reinterpret_cast(dataset_ptr + 4 + k); - } + device::ldg_cg(pq_codes[e], reinterpret_cast(dataset_ptr + 4 + k)); } // if constexpr (PQ_LEN % 2 == 0) { From b5355d94ed02deb4eb72fbe39a9689415e71bf0e Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Mon, 15 Jun 2026 06:53:31 -0700 Subject: [PATCH 117/119] Add shuffle_dataset option for iterative CAGRA-Q graph build Optionally permute the VPQ-compressed dataset before the iterative build loop to break spatial locality, then unshuffle the resulting graph back to the original node ordering. Adds the shuffle_dataset (and smem_dtype) build-search params and bench parsing. 
--- .../src/cuvs/cuvs_ann_bench_param_parser.h | 21 +++ cpp/include/cuvs/neighbors/cagra.hpp | 17 ++ .../neighbors/detail/cagra/cagra_build.cuh | 156 +++++++++++++++++- 3 files changed, 190 insertions(+), 4 deletions(-) diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h index b636465949..9977484742 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h @@ -423,6 +423,27 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index arg.hashmap_mode = cuvs::neighbors::cagra::hash_mode::AUTO; } } + // Whether to shuffle the (compressed) dataset before the iterative build loop. + if (build_search_conf.contains("shuffle_dataset")) { + arg.shuffle_dataset = build_search_conf.at("shuffle_dataset").get(); + } + // Precision of the codebook/query in shared memory for the VPQ search used during + // the iterative build. Accepts an integer code (0=F16, 1=E5M2) or a string. 
+ if (build_search_conf.contains("smem_dtype")) { + const auto& sd = build_search_conf.at("smem_dtype"); + if (sd.is_number_integer()) { + arg.smem_dtype = static_cast(sd.get()); + } else { + std::string s = sd.get(); + if (s == "f16" || s == "F16" || s == "fp16" || s == "half") { + arg.smem_dtype = cuvs::neighbors::cagra::internal_dtype::F16; + } else if (s == "e5m2" || s == "E5M2" || s == "fp8") { + arg.smem_dtype = cuvs::neighbors::cagra::internal_dtype::E5M2; + } else { + throw std::runtime_error("invalid value for build_search smem_dtype: " + s); + } + } + } } }, params.graph_build_params); diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index daac54946e..fd96b2d052 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -166,6 +166,23 @@ struct iterative_search_params : cuvs::neighbors::cagra::search_params { */ std::optional build_compression = std::nullopt; + /** + * Whether to shuffle the dataset before building the graph. + * + * When enabled, the compressed dataset is randomly permuted before graph + * construction begins. This can improve graph quality by breaking any + * spatial locality in the original dataset ordering that might cause + * the iterative builder to get stuck in local optima during early + * iterations. + * + * After graph construction, the node indices in the graph are remapped + * back to the original dataset ordering. + * + * Only applies when compression is enabled (build_compression or + * index_params::compression is set). 
+ */ + bool shuffle_dataset = true; + iterative_search_params() { this->search_width = 1; diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 00822330ce..f0396063f3 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -17,8 +17,17 @@ #include #include #include +#include +#include +#include +#include #include +#include +#include +#include +#include + #include #include #include @@ -51,6 +60,32 @@ namespace cuvs::neighbors::cagra::detail { constexpr double to_mib(size_t bytes) { return static_cast(bytes) / (1 << 20); } constexpr double to_gib(size_t bytes) { return static_cast(bytes) / (1 << 30); } +// Functor to remap indices using a permutation lookup table +template +struct remap_indices_op { + const IdxT* perm; + __host__ __device__ IdxT operator()(IdxT idx) const { return perm[idx]; } +}; + +// Functor to compute scattered output index for graph row reordering +template +struct graph_scatter_index_op { + const IdxT* perm; + int64_t degree; + __host__ __device__ int64_t operator()(int64_t idx) const + { + int64_t row = idx / degree; + int64_t col = idx % degree; + return static_cast(perm[row]) * degree + col; + } +}; + +// Functor to convert int64_t to IdxT +template +struct cast_to_idx_op { + __host__ __device__ IdxT operator()(int64_t v) const { return static_cast(v); } +}; + template void check_graph_degree(size_t& intermediate_degree, size_t& graph_degree, size_t dataset_size) { @@ -2203,15 +2238,80 @@ auto iterative_build_graph( // Generate the compressed index once if compression is enabled const uint64_t dataset_dim = dev_dataset.extent(1); std::optional> idx_opt; + + // Optional shuffle permutation for randomizing dataset order during build. 
+ // inverse_perm[shuffled_idx] = original_idx + // perm[shuffled_idx] = original_idx, used to unshuffle the graph after build + auto dev_perm = raft::make_device_vector(res, 0); + bool dataset_shuffled = false; + + // Warn if shuffle is requested but compression is not enabled + if (iter_params.shuffle_dataset && !build_compression.has_value()) { + RAFT_LOG_WARN("shuffle_dataset is only supported with compression enabled; ignoring"); + } + if (build_compression.has_value()) { auto start = std::chrono::high_resolution_clock::now(); RAFT_EXPECTS(params.metric == cuvs::distance::DistanceType::L2Expanded, "VPQ compression is only supported with L2Expanded distance mertric"); + + // Build the VPQ compressed dataset + auto vpq_dset = + cuvs::preprocessing::quantize::pq::vpq_build(res, *build_compression, dev_dataset); + + // Optionally shuffle the compressed dataset to break spatial locality + if (iter_params.shuffle_dataset) { + auto shuffle_start = std::chrono::high_resolution_clock::now(); + RAFT_LOG_INFO("Shuffling compressed dataset to randomize build order..."); + + auto stream = raft::resource::get_cuda_stream(res); + const auto n_rows = vpq_dset.data.extent(0); + const auto row_len = vpq_dset.data.extent(1); + + // Generate random permutation: perm[i] = source index for output row i + // i.e., shuffled_data[i] = original_data[perm[i]] + // So perm maps: shuffled_idx -> original_idx + // Use int64_t for permutation to match vpq_dataset's index type + auto dev_perm_i64 = raft::make_device_vector(res, n_rows); + + // Use legacy permute API to generate permutation indices only (out=nullptr, in=nullptr) + // This just fills dev_perm_i64 with a random permutation of [0, n_rows) + raft::random::permute(dev_perm_i64.data_handle(), + static_cast(nullptr), + static_cast(nullptr), + static_cast(row_len), + static_cast(n_rows), + true, + stream); + + // Apply permutation to VPQ data: shuffled_data[i] = original_data[perm[i]] + // Use in-place gather which reorders rows 
according to the map + raft::matrix::gather(res, vpq_dset.data.view(), raft::make_const_mdspan(dev_perm_i64.view())); + + // Store perm as IdxT for graph unshuffling later + // perm[shuffled_idx] = original_idx + // This is used for: + // 1. Remapping neighbor values: neighbor j (shuffled) -> perm[j] (original) + // 2. Reordering rows: row i (for shuffled node i) -> position perm[i] (original node) + dev_perm = raft::make_device_vector(res, n_rows); + cast_to_idx_op cast_op; + thrust::transform(raft::resource::get_thrust_policy(res), + dev_perm_i64.data_handle(), + dev_perm_i64.data_handle() + n_rows, + dev_perm.data_handle(), + cast_op); + + dataset_shuffled = true; + + auto shuffle_end = std::chrono::high_resolution_clock::now(); + auto shuffle_ms = + std::chrono::duration_cast(shuffle_end - shuffle_start).count(); + RAFT_LOG_INFO("# Dataset shuffle time: %.3lf sec", (double)shuffle_ms / 1000); + } + idx_opt.emplace(res, params.metric); - idx_opt->update_dataset( - res, - // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later - cuvs::preprocessing::quantize::pq::vpq_build(res, *build_compression, dev_dataset)); + // Use the (optionally shuffled) compressed dataset built above. + idx_opt->update_dataset(res, std::move(vpq_dset)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_ms = std::chrono::duration_cast(end - start).count(); RAFT_LOG_INFO("# VPQ compression time: %.3lf sec", (double)elapsed_ms / 1000); @@ -2325,6 +2425,54 @@ auto iterative_build_graph( // (once for the build loop and once in build()'s shared tail). We could avoid this by returning // the index directly (with its VPQ dataset and device-side graph) instead of just the host graph. auto stream = raft::resource::get_cuda_stream(res); + + // If the dataset was shuffled, we need to unshuffle the graph: + // Recall: perm[shuffled_idx] = original_idx (stored in dev_perm) + // 1. Remap neighbor indices from shuffled space to original space + // 2. 
Reorder rows from shuffled order to original order + if (dataset_shuffled) { + auto unshuffle_start = std::chrono::high_resolution_clock::now(); + RAFT_LOG_INFO("Unshuffling graph to restore original dataset ordering..."); + + const auto n_rows = dev_graph.extent(0); + const auto degree = dev_graph.extent(1); + + // Step 1: Remap all neighbor indices using perm + // graph[i][j] contains shuffled index j; we need original index = perm[j] + remap_indices_op remap_op{dev_perm.data_handle()}; + thrust::transform(raft::resource::get_thrust_policy(res), + dev_graph.data_handle(), + dev_graph.data_handle() + n_rows * degree, + dev_graph.data_handle(), + remap_op); + + // Step 2: Reorder rows back to original order + // Row i in dev_graph is for shuffled node i, which is original node perm[i]. + // We want this row to be at position perm[i] in the final graph. + // scatter: output[map[i]] = input[i], so map[i] = perm[i] + auto dev_unshuffled_graph = raft::make_device_matrix(res, n_rows, degree); + + // Use thrust::scatter to reorder: for each row i, place it at position perm[i] + // We scatter row-by-row conceptually, but do it element-wise with computed output indices + graph_scatter_index_op scatter_idx_op{dev_perm.data_handle(), degree}; + auto output_indices = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), scatter_idx_op); + + thrust::scatter(raft::resource::get_thrust_policy(res), + dev_graph.data_handle(), + dev_graph.data_handle() + n_rows * degree, + output_indices, + dev_unshuffled_graph.data_handle()); + + dev_graph = std::move(dev_unshuffled_graph); + + auto unshuffle_end = std::chrono::high_resolution_clock::now(); + auto unshuffle_ms = + std::chrono::duration_cast(unshuffle_end - unshuffle_start) + .count(); + RAFT_LOG_INFO("# Graph unshuffle time: %.3lf sec", (double)unshuffle_ms / 1000); + } + cagra_graph = raft::make_host_matrix(dev_graph.extent(0), dev_graph.extent(1)); raft::copy(cagra_graph.data_handle(), dev_graph.data_handle(), 
From 23c811b44e857f098d7d3ba9ce0c1c1944d680d4 Mon Sep 17 00:00:00 2001 From: Irina Reshodko Date: Mon, 15 Jun 2026 09:40:27 -0700 Subject: [PATCH 118/119] fix oob due to in place raft shuffle --- cpp/src/neighbors/detail/cagra/cagra_build.cuh | 15 ++++++++++++--- .../detail/cagra/search_multi_kernel.cuh | 6 +++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index f0396063f3..ef91de6df6 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -2284,9 +2284,18 @@ auto iterative_build_graph( true, stream); - // Apply permutation to VPQ data: shuffled_data[i] = original_data[perm[i]] - // Use in-place gather which reorders rows according to the map - raft::matrix::gather(res, vpq_dset.data.view(), raft::make_const_mdspan(dev_perm_i64.view())); + // Apply permutation to VPQ data: shuffled_data[i] = original_data[perm[i]]. + // NOTE: use an out-of-place device gather into a temporary buffer rather than the + // in-place gather overload. The in-place overload uses a host-orchestrated, + // double-buffered, multi-stream path that races here and triggers an asynchronous + // illegal memory access (the crash disappears under CUDA_LAUNCH_BLOCKING=1). 
+ auto shuffled_data = raft::make_device_matrix( + res, vpq_dset.data.extent(0), vpq_dset.data.extent(1)); + raft::matrix::gather(res, + raft::make_const_mdspan(vpq_dset.data.view()), + raft::make_const_mdspan(dev_perm_i64.view()), + shuffled_data.view()); + vpq_dset.data = std::move(shuffled_data); // Store perm as IdxT for graph unshuffling later // perm[shuffled_idx] = original_idx diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh index e09ef82a39..f1c7305833 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh @@ -511,7 +511,11 @@ struct search hashmap.data(), hash_bitlen, stream, - static_cast(this->dataset_size)); + // Bound random seed selection to the graph size, not the dataset size. + // During iterative / CAGRA-Q build the graph is smaller than the dataset, + // so using dataset_size here selects seeds that index past the graph end + // (out-of-bounds access). See https://github.com/rapidsai/cuvs/pull/1780. + static_cast(graph.extent(0))); std::shared_ptr compute_distance_to_child_nodes_launcher = make_cagra_multi_kernel_jit_launcher Date: Wed, 1 Jul 2026 02:55:11 -0700 Subject: [PATCH 119/119] made the growth-phase build search itopk parameter a tunable parameter --- cpp/include/cuvs/neighbors/cagra.hpp | 5 ++++ .../neighbors/detail/cagra/cagra_build.cuh | 23 +++++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index 532423253c..1e6e87b2e5 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -190,6 +190,11 @@ struct iterative_search_params : cuvs::neighbors::cagra::search_params { { this->search_width = 1; this->max_iterations = 8; + // itopk_size controls the search during the *growing* iterations of the build loop. 
+ // 0 (default) means auto-select per iteration (max(graph_degree + 32, 128)); a nonzero + // value overrides it for the growing iterations. The final iteration always uses a fixed + // itopk tied to the output topk, regardless of this value. + this->itopk_size = 0; } }; diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index ef91de6df6..10e44377e3 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -2209,6 +2209,17 @@ auto iterative_build_graph( RAFT_LOG_DEBUG("# graph_degree = %lu", (uint64_t)graph_degree); RAFT_LOG_DEBUG("# topk = %lu", (uint64_t)topk); + // A fixed itopk_size (0 = auto) governs the growing iterations, which build graphs of degree + // ~graph_degree/2 and thus request topk ~= graph_degree/2 + 1; the search planner requires + // topk <= itopk_size. (The full-size iterations override itopk internally, so they are not + // constrained by this value.) + RAFT_EXPECTS(iter_params.itopk_size == 0 || + iter_params.itopk_size >= graph_degree / 2 + 1, + "iterative build search itopk_size (%zu) must be 0 (auto) or >= " + "graph_degree / 2 + 1 (%zu)", + (size_t)iter_params.itopk_size, + (size_t)(graph_degree / 2 + 1)); + // Create an initial graph. The initial graph created here is not suitable for // searching, but connectivity is guaranteed. auto offset = raft::make_host_vector(small_graph_degree); @@ -2344,8 +2355,16 @@ auto iterative_build_graph( // The search count (topk) is set to the next graph degree + 1, because // pruning is not used except in the last iteration. // (*) The appropriate setting for itopk_size requires careful consideration. 
- auto curr_topk = next_graph_degree + 1; - auto curr_itopk_size = std::max(next_graph_degree + 32, (uint64_t)128); + auto curr_topk = next_graph_degree + 1; + // The configurable itopk (iter_params.itopk_size, 0 = auto) applies only to the true growing + // iterations, where the degree being built is small_graph_degree. When the graph reaches its + // full size the search builds a graph_degree-degree graph (topk = graph_degree + 1); that + // iteration needs a larger itopk, so it overrides the configured value with the auto formula. + // The final iteration (flag_last) uses a fixed itopk tied to the output topk. + auto curr_itopk_size = + (iter_params.itopk_size > 0 && next_graph_degree == small_graph_degree) + ? (uint64_t)iter_params.itopk_size + : std::max(next_graph_degree + 32, (uint64_t)128); if (flag_last) { curr_topk = topk; curr_itopk_size = curr_topk + 32;