From 89f2ca05dd76ed10598ce8ac7d6dd93689155d63 Mon Sep 17 00:00:00 2001 From: vic Date: Tue, 2 Jun 2026 15:02:31 +0200 Subject: [PATCH 1/2] SNMG Batched KMeans Python API --- c/CMakeLists.txt | 1 + c/include/cuvs/cluster/mg_kmeans.h | 92 ++++++++ c/include/cuvs/core/all.h | 1 + c/src/cluster/mg_kmeans.cpp | 223 ++++++++++++++++++ c/tests/CMakeLists.txt | 3 + c/tests/cluster/kmeans_mg_c.cu | 132 +++++++++++ fern/docs.yml | 4 + fern/pages/c_api/c-api-cluster-mg-kmeans.md | 77 ++++++ fern/pages/c_api/index.md | 1 + fern/pages/python_api/index.md | 1 + .../python-api-cluster-mg-kmeans.md | 34 +++ python/cuvs/cuvs/cluster/CMakeLists.txt | 3 +- python/cuvs/cuvs/cluster/__init__.py | 6 +- python/cuvs/cuvs/cluster/kmeans/kmeans.pxd | 4 + python/cuvs/cuvs/cluster/kmeans/kmeans.pyx | 2 - python/cuvs/cuvs/cluster/mg/CMakeLists.txt | 8 + python/cuvs/cuvs/cluster/mg/__init__.py | 6 + .../cuvs/cluster/mg/kmeans/CMakeLists.txt | 17 ++ .../cuvs/cuvs/cluster/mg/kmeans/__init__.py | 8 + python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pxd | 19 ++ python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pyx | 153 ++++++++++++ python/cuvs/cuvs/tests/test_kmeans.py | 33 ++- python/cuvs/cuvs/tests/test_mg_kmeans.py | 220 +++++++++++++++++ 23 files changed, 1036 insertions(+), 12 deletions(-) create mode 100644 c/include/cuvs/cluster/mg_kmeans.h create mode 100644 c/src/cluster/mg_kmeans.cpp create mode 100644 c/tests/cluster/kmeans_mg_c.cu create mode 100644 fern/pages/c_api/c-api-cluster-mg-kmeans.md create mode 100644 fern/pages/python_api/python-api-cluster-mg-kmeans.md create mode 100644 python/cuvs/cuvs/cluster/mg/CMakeLists.txt create mode 100644 python/cuvs/cuvs/cluster/mg/__init__.py create mode 100644 python/cuvs/cuvs/cluster/mg/kmeans/CMakeLists.txt create mode 100644 python/cuvs/cuvs/cluster/mg/kmeans/__init__.py create mode 100644 python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pxd create mode 100644 python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pyx create mode 100644 
python/cuvs/cuvs/tests/test_mg_kmeans.py diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index be4bc7a051..beb995c6c3 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -86,6 +86,7 @@ add_library( cuvs_c SHARED src/core/c_api.cpp src/cluster/kmeans.cpp + $<$:src/cluster/mg_kmeans.cpp> src/neighbors/brute_force.cpp src/neighbors/ivf_flat.cpp src/neighbors/ivf_pq.cpp diff --git a/c/include/cuvs/cluster/mg_kmeans.h b/c/include/cuvs/cluster/mg_kmeans.h new file mode 100644 index 0000000000..85a0c82dc6 --- /dev/null +++ b/c/include/cuvs/cluster/mg_kmeans.h @@ -0,0 +1,92 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include +#include + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @defgroup mg_kmeans_c Multi-GPU k-means clustering APIs + * @{ + */ + +/** + * @brief Find clusters with single-node multi-GPU k-means using host data. + * + * X, sample_weight, and centroids must be host-accessible, row-major, + * C-contiguous DLPack tensors. X and centroids must have dtype float32 or + * float64, and sample_weight must match X when provided. + * + * @note In cuVS 26.08 (next ABI major version) this signature will be + * replaced by cuvsMultiGpuKMeansFit_v2. + * + * @param[in] res cuvsMultiGpuResources_t opaque C handle + * created by cuvsMultiGpuResourcesCreate or + * cuvsMultiGpuResourcesCreateWithDeviceIds. + * @param[in] params Parameters for KMeans model. + * @param[in] X Host training instances to cluster. + * [dim = n_samples x n_features] + * @param[in] sample_weight Optional host weights for each observation in X. + * [len = n_samples] + * @param[inout] centroids Host centroids. When init is Array, used as the + * initial cluster centers. The final generated + * centroids are copied back to this tensor. 
+ * [dim = n_clusters x n_features] + * @param[out] inertia Sum of squared distances of samples to their + * closest cluster center. + * @param[out] n_iter Number of iterations run. + */ +CUVS_EXPORT cuvsError_t cuvsMultiGpuKMeansFit(cuvsResources_t res, + cuvsKMeansParams_t params, + DLManagedTensor* X, + DLManagedTensor* sample_weight, + DLManagedTensor* centroids, + double* inertia, + int* n_iter); + +/** + * @brief Find clusters with single-node multi-GPU k-means (v2 params layout). + * + * Mirrors cuvsMultiGpuKMeansFit but takes cuvsKMeansParams_v2_t. Will become + * the unsuffixed cuvsMultiGpuKMeansFit in cuVS 26.08. + * + * @param[in] res cuvsMultiGpuResources_t opaque C handle. + * @param[in] params Parameters for KMeans model (v2 layout). + * @param[in] X Host training instances to cluster. + * [dim = n_samples x n_features] + * @param[in] sample_weight Optional host weights for each observation in X. + * [len = n_samples] + * @param[inout] centroids Host centroids. When init is Array, used as the + * initial cluster centers. The final generated + * centroids are copied back to this tensor. + * [dim = n_clusters x n_features] + * @param[out] inertia Sum of squared distances of samples to their + * closest cluster center. + * @param[out] n_iter Number of iterations run. 
+ */ +CUVS_EXPORT cuvsError_t cuvsMultiGpuKMeansFit_v2(cuvsResources_t res, + cuvsKMeansParams_v2_t params, + DLManagedTensor* X, + DLManagedTensor* sample_weight, + DLManagedTensor* centroids, + double* inertia, + int* n_iter); + +/** + * @} + */ + +#ifdef __cplusplus +} +#endif diff --git a/c/include/cuvs/core/all.h b/c/include/cuvs/core/all.h index 545c7ec6f4..8e8caafcd8 100644 --- a/c/include/cuvs/core/all.h +++ b/c/include/cuvs/core/all.h @@ -34,6 +34,7 @@ #endif #ifdef CUVS_BUILD_MG_ALGOS + #include #include #include #include diff --git a/c/src/cluster/mg_kmeans.cpp b/c/src/cluster/mg_kmeans.cpp new file mode 100644 index 0000000000..06255b6f71 --- /dev/null +++ b/c/src/cluster/mg_kmeans.cpp @@ -0,0 +1,223 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "../core/exceptions.hpp" +#include "../core/interop.hpp" + +namespace { + +template +cuvs::cluster::kmeans::params convert_params(const ParamsT& params) +{ + auto kmeans_params = cuvs::cluster::kmeans::params(); + kmeans_params.metric = static_cast(params.metric); + kmeans_params.init = static_cast(params.init); + kmeans_params.n_clusters = params.n_clusters; + kmeans_params.max_iter = params.max_iter; + kmeans_params.tol = params.tol; + kmeans_params.n_init = params.n_init; + kmeans_params.oversampling_factor = params.oversampling_factor; + kmeans_params.batch_samples = params.batch_samples; + kmeans_params.batch_centroids = params.batch_centroids; + kmeans_params.init_size = params.init_size; + kmeans_params.streaming_batch_size = params.streaming_batch_size; + return kmeans_params; +} + +void validate_host_tensor(DLManagedTensor* tensor, const char* name) +{ + RAFT_EXPECTS(tensor != nullptr, "%s must not be NULL", name); + auto dl_tensor = tensor->dl_tensor; + RAFT_EXPECTS(dl_tensor.data != nullptr, "%s data 
must not be NULL", name); + RAFT_EXPECTS(dl_tensor.shape != nullptr, "%s shape must not be NULL", name); + RAFT_EXPECTS( + cuvs::core::is_dlpack_host_compatible(dl_tensor), "%s must be host accessible", name); + RAFT_EXPECTS(dl_tensor.device.device_type != kDLCUDA, "%s must reside in host memory", name); + RAFT_EXPECTS(cuvs::core::is_c_contiguous(tensor), "%s must be C-contiguous", name); +} + +bool dtype_equal(const DLTensor& lhs, const DLTensor& rhs) +{ + return lhs.dtype.code == rhs.dtype.code && lhs.dtype.bits == rhs.dtype.bits && + lhs.dtype.lanes == rhs.dtype.lanes; +} + +void validate_float_dtype(const DLTensor& tensor, const char* name) +{ + RAFT_EXPECTS( + tensor.dtype.code == kDLFloat && (tensor.dtype.bits == 32 || tensor.dtype.bits == 64), + "%s must have dtype float32 or float64", + name); + RAFT_EXPECTS(tensor.dtype.lanes == 1, "%s must have one DLPack lane", name); +} + +template +void validate_inputs(const ParamsT& params, + DLManagedTensor* X_tensor, + DLManagedTensor* sample_weight_tensor, + DLManagedTensor* centroids_tensor) +{ + RAFT_EXPECTS(params.n_clusters > 0, "n_clusters must be positive"); + RAFT_EXPECTS(!params.hierarchical, "hierarchical kmeans is not supported by SNMG kmeans"); + + validate_host_tensor(X_tensor, "X"); + validate_host_tensor(centroids_tensor, "centroids"); + + auto X = X_tensor->dl_tensor; + auto centroids = centroids_tensor->dl_tensor; + + RAFT_EXPECTS(X.ndim == 2, "X must be a 2D tensor"); + RAFT_EXPECTS(centroids.ndim == 2, "centroids must be a 2D tensor"); + RAFT_EXPECTS(X.shape[0] > 0, "X must have at least one row"); + RAFT_EXPECTS(X.shape[1] > 0, "X must have at least one column"); + RAFT_EXPECTS(centroids.shape[0] == params.n_clusters, + "centroids row count must equal n_clusters"); + RAFT_EXPECTS(centroids.shape[1] == X.shape[1], + "centroids column count must equal X column count"); + + validate_float_dtype(X, "X"); + RAFT_EXPECTS(dtype_equal(X, centroids), "centroids dtype must match X dtype"); + + if 
(sample_weight_tensor != nullptr) { + validate_host_tensor(sample_weight_tensor, "sample_weight"); + auto sample_weight = sample_weight_tensor->dl_tensor; + RAFT_EXPECTS(sample_weight.ndim == 1, "sample_weight must be a 1D tensor"); + RAFT_EXPECTS(sample_weight.shape[0] == X.shape[0], + "sample_weight length must equal X row count"); + RAFT_EXPECTS(dtype_equal(X, sample_weight), "sample_weight dtype must match X dtype"); + } +} + +template +void fit_snmg(cuvsResources_t res, + const ParamsT& params, + DLManagedTensor* X_tensor, + DLManagedTensor* sample_weight_tensor, + DLManagedTensor* centroids_tensor, + double* inertia, + int* n_iter) +{ + auto res_ptr = reinterpret_cast(res); + RAFT_EXPECTS(res_ptr != nullptr, "res must not be NULL"); + RAFT_EXPECTS(raft::resource::is_multi_gpu(*res_ptr), + "cuvsMultiGpuKMeansFit requires a MultiGpuResources handle"); + + auto X = X_tensor->dl_tensor; + auto centroids = centroids_tensor->dl_tensor; + + auto n_samples = static_cast(X.shape[0]); + auto n_features = static_cast(X.shape[1]); + auto n_clusters = static_cast(params.n_clusters); + + auto X_view = raft::make_host_matrix_view( + reinterpret_cast(X.data), n_samples, n_features); + + std::optional> sample_weight; + if (sample_weight_tensor != nullptr) { + auto sw = sample_weight_tensor->dl_tensor; + sample_weight = + raft::make_host_vector_view(reinterpret_cast(sw.data), n_samples); + } + + auto const& rank0_res = raft::resource::set_current_device_to_rank(*res_ptr, 0); + auto stream = raft::resource::get_cuda_stream(rank0_res); + auto d_centroids = raft::make_device_matrix(rank0_res, n_clusters, n_features); + auto n_centroid_values = n_clusters * n_features; + + if (params.init == Array) { + raft::update_device(d_centroids.data_handle(), + reinterpret_cast(centroids.data), + n_centroid_values, + stream); + raft::resource::sync_stream(rank0_res, stream); + } + + T inertia_temp = T{0}; + IdxT n_iter_temp = IdxT{0}; + auto kmeans_params = convert_params(params); + 
cuvs::cluster::kmeans::fit(*res_ptr, + kmeans_params, + X_view, + sample_weight, + d_centroids.view(), + raft::make_host_scalar_view(&inertia_temp), + raft::make_host_scalar_view(&n_iter_temp)); + + raft::update_host( + reinterpret_cast(centroids.data), d_centroids.data_handle(), n_centroid_values, stream); + raft::resource::sync_stream(rank0_res, stream); + + *inertia = static_cast(inertia_temp); + *n_iter = static_cast(n_iter_temp); +} + +template +void dispatch_fit(cuvsResources_t res, + ParamsT params, + DLManagedTensor* X, + DLManagedTensor* sample_weight, + DLManagedTensor* centroids, + double* inertia, + int* n_iter) +{ + RAFT_EXPECTS(res != 0, "res must not be NULL"); + RAFT_EXPECTS(params != nullptr, "params must not be NULL"); + RAFT_EXPECTS(inertia != nullptr, "inertia must not be NULL"); + RAFT_EXPECTS(n_iter != nullptr, "n_iter must not be NULL"); + + validate_inputs(*params, X, sample_weight, centroids); + + auto dataset = X->dl_tensor; + if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 32) { + fit_snmg(res, *params, X, sample_weight, centroids, inertia, n_iter); + } else if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 64) { + fit_snmg(res, *params, X, sample_weight, centroids, inertia, n_iter); + } else { + RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d", + dataset.dtype.code, + dataset.dtype.bits); + } +} + +} // namespace + +extern "C" cuvsError_t cuvsMultiGpuKMeansFit(cuvsResources_t res, + cuvsKMeansParams_t params, + DLManagedTensor* X, + DLManagedTensor* sample_weight, + DLManagedTensor* centroids, + double* inertia, + int* n_iter) +{ + return cuvs::core::translate_exceptions( + [=] { dispatch_fit(res, params, X, sample_weight, centroids, inertia, n_iter); }); +} + +extern "C" cuvsError_t cuvsMultiGpuKMeansFit_v2(cuvsResources_t res, + cuvsKMeansParams_v2_t params, + DLManagedTensor* X, + DLManagedTensor* sample_weight, + DLManagedTensor* centroids, + double* inertia, + int* n_iter) +{ + return 
cuvs::core::translate_exceptions( + [=] { dispatch_fit(res, params, X, sample_weight, centroids, inertia, n_iter); }); +} diff --git a/c/tests/CMakeLists.txt b/c/tests/CMakeLists.txt index f1cff7824e..614af5e955 100644 --- a/c/tests/CMakeLists.txt +++ b/c/tests/CMakeLists.txt @@ -79,6 +79,9 @@ ConfigureTest( NAME DISTANCE_C_TEST PATH distance/run_pairwise_distance_c.c distance/pairwise_distance_c.cu ) ConfigureTest(NAME KMEANS_C_TEST PATH cluster/kmeans_c.cu) +if(BUILD_MG_ALGOS) + ConfigureTest(NAME KMEANS_MG_C_TEST PATH cluster/kmeans_mg_c.cu) +endif() ConfigureTest(NAME BRUTEFORCE_C_TEST PATH neighbors/run_brute_force_c.c neighbors/brute_force_c.cu) ConfigureTest(NAME IVF_FLAT_C_TEST PATH neighbors/run_ivf_flat_c.c neighbors/ann_ivf_flat_c.cu) ConfigureTest(NAME IVF_PQ_C_TEST PATH neighbors/run_ivf_pq_c.c neighbors/ann_ivf_pq_c.cu) diff --git a/c/tests/cluster/kmeans_mg_c.cu b/c/tests/cluster/kmeans_mg_c.cu new file mode 100644 index 0000000000..526a5c45fa --- /dev/null +++ b/c/tests/cluster/kmeans_mg_c.cu @@ -0,0 +1,132 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef CUVS_BUILD_MG_ALGOS +#error "KMEANS_MG_C_TEST requires BUILD_MG_ALGOS" +#endif + +#include "../../src/core/interop.hpp" +#include +#include +#include + +#include + +#include + +#include + +namespace { + +constexpr int64_t kNSamples = 8; +constexpr int64_t kNFeatures = 2; +constexpr int kNClusters = 2; + +float kDataset[kNSamples][kNFeatures] = { + {1.0f, 1.0f}, + {1.0f, 2.0f}, + {2.0f, 1.0f}, + {2.0f, 2.0f}, + {10.0f, 10.0f}, + {10.0f, 11.0f}, + {11.0f, 10.0f}, + {11.0f, 11.0f}, +}; + +float kExpectedCentroids[kNClusters * kNFeatures] = {1.5f, 1.5f, 10.5f, 10.5f}; + +// 8 points, each at squared distance 0.5 from its cluster mean -> 4.0. 
+constexpr double kExpectedInertia = 4.0; + +struct kmeans_mg_api_v1 { + using params_t = cuvsKMeansParams_t; + static cuvsError_t params_create(params_t* p) { return cuvsKMeansParamsCreate(p); } + static cuvsError_t params_destroy(params_t p) { return cuvsKMeansParamsDestroy(p); } + static cuvsError_t fit(cuvsResources_t res, + params_t params, + DLManagedTensor* dataset, + DLManagedTensor* centroids, + double* inertia, + int* n_iter) + { + return cuvsMultiGpuKMeansFit(res, params, dataset, NULL, centroids, inertia, n_iter); + } +}; + +struct kmeans_mg_api_v2 { + using params_t = cuvsKMeansParams_v2_t; + static cuvsError_t params_create(params_t* p) { return cuvsKMeansParamsCreate_v2(p); } + static cuvsError_t params_destroy(params_t p) { return cuvsKMeansParamsDestroy_v2(p); } + static cuvsError_t fit(cuvsResources_t res, + params_t params, + DLManagedTensor* dataset, + DLManagedTensor* centroids, + double* inertia, + int* n_iter) + { + return cuvsMultiGpuKMeansFit_v2(res, params, dataset, NULL, centroids, inertia, n_iter); + } +}; + +template +void test_mg_fit_host() +{ + float centroids_h[kNClusters][kNFeatures] = { + {0.0f, 0.0f}, + {12.0f, 12.0f}, + }; + + cuvsResources_t res; + int32_t device_ids[1] = {0}; + DLManagedTensor device_ids_t{}; + cuvs::core::to_dlpack(raft::make_host_vector_view(device_ids, 1), + &device_ids_t); + ASSERT_EQ(cuvsMultiGpuResourcesCreateWithDeviceIds(&res, &device_ids_t), CUVS_SUCCESS); + device_ids_t.deleter(&device_ids_t); + + typename Api::params_t params; + ASSERT_EQ(Api::params_create(¶ms), CUVS_SUCCESS); + params->n_clusters = kNClusters; + params->max_iter = 100; + params->tol = 1e-6; + params->init = Array; + params->streaming_batch_size = 4; // force at least 2 streamed batches + + DLManagedTensor dataset_t{}; + cuvs::core::to_dlpack(raft::make_host_matrix_view( + reinterpret_cast(kDataset), kNSamples, kNFeatures), + &dataset_t); + + DLManagedTensor centroids_t{}; + cuvs::core::to_dlpack(raft::make_host_matrix_view( + 
reinterpret_cast(centroids_h), kNClusters, kNFeatures), + ¢roids_t); + + double inertia = -1.0; + int n_iter = -1; + + ASSERT_EQ(Api::fit(res, params, &dataset_t, ¢roids_t, &inertia, &n_iter), CUVS_SUCCESS); + + auto* centroids_data = reinterpret_cast(centroids_h); + for (int i = 0; i < kNClusters * kNFeatures; ++i) { + EXPECT_NEAR(centroids_data[i], kExpectedCentroids[i], 1e-4f); + } + + EXPECT_GT(n_iter, 0); + EXPECT_NEAR(inertia, kExpectedInertia, 1e-4); + + centroids_t.deleter(¢roids_t); + dataset_t.deleter(&dataset_t); + + ASSERT_EQ(Api::params_destroy(params), CUVS_SUCCESS); + ASSERT_EQ(cuvsMultiGpuResourcesDestroy(res), CUVS_SUCCESS); +} + +} // namespace + +TEST(KMeansMgC, FitHost) { test_mg_fit_host(); } +// TODO(cuVS 26.08): remove FitHostV2 once `_v2` is promoted to the +// unsuffixed ABI. +TEST(KMeansMgC, FitHostV2) { test_mg_fit_host(); } diff --git a/fern/docs.yml b/fern/docs.yml index db239ce019..c9e8e1dc9d 100644 --- a/fern/docs.yml +++ b/fern/docs.yml @@ -246,6 +246,8 @@ navigation: contents: - page: "Cluster Kmeans" path: "./pages/c_api/c-api-cluster-kmeans.md" + - page: "Cluster Multi GPU Kmeans" + path: "./pages/c_api/c-api-cluster-mg-kmeans.md" - page: "Core C API" path: "./pages/c_api/c-api-core-c-api.md" - page: "Distance Distance" @@ -385,6 +387,8 @@ navigation: contents: - page: "Cluster Kmeans" path: "./pages/python_api/python-api-cluster-kmeans.md" + - page: "Cluster Multi GPU Kmeans" + path: "./pages/python_api/python-api-cluster-mg-kmeans.md" - page: "Common" path: "./pages/python_api/python-api-common.md" - page: "Distance" diff --git a/fern/pages/c_api/c-api-cluster-mg-kmeans.md b/fern/pages/c_api/c-api-cluster-mg-kmeans.md new file mode 100644 index 0000000000..899ede902c --- /dev/null +++ b/fern/pages/c_api/c-api-cluster-mg-kmeans.md @@ -0,0 +1,77 @@ +--- +slug: api-reference/c-api-cluster-mg-kmeans +--- + +# Multi-GPU K-Means + +_Source header: `cuvs/cluster/mg_kmeans.h`_ + +## Multi-GPU k-means clustering APIs + + +### 
cuvsMultiGpuKMeansFit + +Find clusters with single-node multi-GPU k-means using host data. + +```c +CUVS_EXPORT cuvsError_t cuvsMultiGpuKMeansFit(cuvsResources_t res, +cuvsKMeansParams_t params, +DLManagedTensor* X, +DLManagedTensor* sample_weight, +DLManagedTensor* centroids, +double* inertia, +int* n_iter); +``` + +X, sample_weight, and centroids must be host-accessible, row-major, C-contiguous DLPack tensors. X and centroids must have dtype float32 or float64, and sample_weight must match X when provided. + +**Note:** In cuVS 26.08 (next ABI major version) this signature will be
replaced by cuvsMultiGpuKMeansFit_v2. + +**Parameters** + +| Name | Direction | Type | Description | +| --- | --- | --- | --- | +| `res` | in | [`cuvsResources_t`](/api-reference/c-api-core-c-api#cuvsresources-t) | cuvsMultiGpuResources_t opaque C handle created by cuvsMultiGpuResourcesCreate or cuvsMultiGpuResourcesCreateWithDeviceIds. | +| `params` | in | [`cuvsKMeansParams_t`](/api-reference/c-api-cluster-kmeans#cuvskmeansparams) | Parameters for KMeans model. | +| `X` | in | `DLManagedTensor*` | Host training instances to cluster. [dim = n_samples x n_features] | +| `sample_weight` | in | `DLManagedTensor*` | Optional host weights for each observation in X. [len = n_samples] | +| `centroids` | inout | `DLManagedTensor*` | Host centroids. When init is Array, used as the initial cluster centers. The final generated centroids are copied back to this tensor. [dim = n_clusters x n_features] | +| `inertia` | out | `double*` | Sum of squared distances of samples to their closest cluster center. | +| `n_iter` | out | `int*` | Number of iterations run. | + +**Returns** + +[`cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t) + + +### cuvsMultiGpuKMeansFit_v2 + +Find clusters with single-node multi-GPU k-means (v2 params layout). + +```c +CUVS_EXPORT cuvsError_t cuvsMultiGpuKMeansFit_v2(cuvsResources_t res, +cuvsKMeansParams_v2_t params, +DLManagedTensor* X, +DLManagedTensor* sample_weight, +DLManagedTensor* centroids, +double* inertia, +int* n_iter); +``` + +Mirrors cuvsMultiGpuKMeansFit but takes cuvsKMeansParams_v2_t. Will become the unsuffixed cuvsMultiGpuKMeansFit in cuVS 26.08. + +**Parameters** + +| Name | Direction | Type | Description | +| --- | --- | --- | --- | +| `res` | in | [`cuvsResources_t`](/api-reference/c-api-core-c-api#cuvsresources-t) | cuvsMultiGpuResources_t opaque C handle. | +| `params` | in | [`cuvsKMeansParams_v2_t`](/api-reference/c-api-cluster-kmeans#cuvskmeansparams-v2) | Parameters for KMeans model (v2 layout).
| +| `X` | in | `DLManagedTensor*` | Host training instances to cluster. [dim = n_samples x n_features] | +| `sample_weight` | in | `DLManagedTensor*` | Optional host weights for each observation in X. [len = n_samples] | +| `centroids` | inout | `DLManagedTensor*` | Host centroids. When init is Array, used as the initial cluster centers. The final generated centroids are copied back to this tensor. [dim = n_clusters x n_features] | +| `inertia` | out | `double*` | Sum of squared distances of samples to their closest cluster center. | +| `n_iter` | out | `int*` | Number of iterations run. | + +**Returns** + +[`cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t) diff --git a/fern/pages/c_api/index.md b/fern/pages/c_api/index.md index 8c72554709..d98f4cebe5 100644 --- a/fern/pages/c_api/index.md +++ b/fern/pages/c_api/index.md @@ -5,6 +5,7 @@ These pages are generated from the documented public headers in the cuVS source ## Cluster - [K-Means](/api-reference/c-api-cluster-kmeans) +- [Multi-GPU K-Means](/api-reference/c-api-cluster-mg-kmeans) ## Common diff --git a/fern/pages/python_api/index.md b/fern/pages/python_api/index.md index 41773574fa..c2a7ae9e78 100644 --- a/fern/pages/python_api/index.md +++ b/fern/pages/python_api/index.md @@ -5,6 +5,7 @@ These pages are generated from the Python and Cython sources under `python/cuvs/ ## Cluster - [Kmeans](/api-reference/python-api-cluster-kmeans) +- [Multi-GPU Kmeans](/api-reference/python-api-cluster-mg-kmeans) ## Common diff --git a/fern/pages/python_api/python-api-cluster-mg-kmeans.md b/fern/pages/python_api/python-api-cluster-mg-kmeans.md new file mode 100644 index 0000000000..3b06cccc03 --- /dev/null +++ b/fern/pages/python_api/python-api-cluster-mg-kmeans.md @@ -0,0 +1,34 @@ +--- +slug: api-reference/python-api-cluster-mg-kmeans +--- + +# Multi-GPU Kmeans + +_Python module: `cuvs.cluster.mg.kmeans`_ + +## fit + +`@auto_sync_multi_gpu_resources` + +```python +def fit( KMeansParams params, X, centroids=None,
sample_weights=None, resources=None ) +``` + +Find clusters with single-node multi-GPU k-means using host data. + +**Parameters** + +| Name | Type | Description | +| --- | --- | --- | +| `params` | `KMeansParams` | Parameters to use to fit KMeans model. | +| `X` | `host array-like` | Training instances, shape (m, k). Must be C-contiguous float32 or float64 host data. | +| `centroids` | `host array-like, optional` | Initial centroids when ``params.init_method == "Array"`` and output centroids for all init methods. If omitted, a host NumPy output array is allocated unless ``init_method == "Array"``. | +| `sample_weights` | `host array-like, optional` | Optional weights per observation. Must be C-contiguous and have the same dtype as X. | +| `resources` | `cuvs.common.Resources, optional` | | + +**Returns** + +FitOutput +``centroids`` is a host NumPy array containing the computed centroids, +``inertia`` is the final objective value, and ``n_iter`` is the number +of iterations run. diff --git a/python/cuvs/cuvs/cluster/CMakeLists.txt b/python/cuvs/cuvs/cluster/CMakeLists.txt index 5e2df9a60b..4ad956a5bd 100644 --- a/python/cuvs/cuvs/cluster/CMakeLists.txt +++ b/python/cuvs/cuvs/cluster/CMakeLists.txt @@ -1,8 +1,9 @@ # ============================================================================= # cmake-format: off -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # cmake-format: on # ============================================================================= add_subdirectory(kmeans) +add_subdirectory(mg) diff --git a/python/cuvs/cuvs/cluster/__init__.py b/python/cuvs/cuvs/cluster/__init__.py index ec29a2139a..3d1a9d3c4f 100644 --- a/python/cuvs/cuvs/cluster/__init__.py +++ b/python/cuvs/cuvs/cluster/__init__.py @@ -1,7 +1,7 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 -from cuvs.cluster import kmeans +from cuvs.cluster import kmeans, mg -__all__ = ["kmeans"] +__all__ = ["kmeans", "mg"] diff --git a/python/cuvs/cuvs/cluster/kmeans/kmeans.pxd b/python/cuvs/cuvs/cluster/kmeans/kmeans.pxd index 975ef386df..73db2c86b4 100644 --- a/python/cuvs/cuvs/cluster/kmeans/kmeans.pxd +++ b/python/cuvs/cuvs/cluster/kmeans/kmeans.pxd @@ -70,3 +70,7 @@ cdef extern from "cuvs/cluster/kmeans.h" nogil: DLManagedTensor* X, DLManagedTensor* centroids, double* cost) + + +cdef class KMeansParams: + cdef cuvsKMeansParams* params diff --git a/python/cuvs/cuvs/cluster/kmeans/kmeans.pyx b/python/cuvs/cuvs/cluster/kmeans/kmeans.pyx index 2e9046b4b2..ec5934e6da 100644 --- a/python/cuvs/cuvs/cluster/kmeans/kmeans.pyx +++ b/python/cuvs/cuvs/cluster/kmeans/kmeans.pyx @@ -100,8 +100,6 @@ cdef class KMeansParams: For hierarchical k-means , defines the number of training iterations """ - cdef cuvsKMeansParams* params - def __cinit__(self): cuvsKMeansParamsCreate(&self.params) diff --git a/python/cuvs/cuvs/cluster/mg/CMakeLists.txt b/python/cuvs/cuvs/cluster/mg/CMakeLists.txt new file mode 100644 index 0000000000..c054ddd2cd --- /dev/null +++ b/python/cuvs/cuvs/cluster/mg/CMakeLists.txt @@ -0,0 +1,8 @@ +# +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on +# + +add_subdirectory(kmeans) diff --git a/python/cuvs/cuvs/cluster/mg/__init__.py b/python/cuvs/cuvs/cluster/mg/__init__.py new file mode 100644 index 0000000000..f397d889ed --- /dev/null +++ b/python/cuvs/cuvs/cluster/mg/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 + +from . 
import kmeans + +__all__ = ["kmeans"] diff --git a/python/cuvs/cuvs/cluster/mg/kmeans/CMakeLists.txt b/python/cuvs/cuvs/cluster/mg/kmeans/CMakeLists.txt new file mode 100644 index 0000000000..9d48f01ae4 --- /dev/null +++ b/python/cuvs/cuvs/cluster/mg/kmeans/CMakeLists.txt @@ -0,0 +1,17 @@ +# +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on +# + +# Set the list of Cython files to build +set(cython_sources kmeans.pyx) +set(linked_libraries cuvs::cuvs cuvs::c_api) + +# Build all of the Cython targets +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX cluster_mg_kmeans_ +) diff --git a/python/cuvs/cuvs/cluster/mg/kmeans/__init__.py b/python/cuvs/cuvs/cluster/mg/kmeans/__init__.py new file mode 100644 index 0000000000..04ce9eebc5 --- /dev/null +++ b/python/cuvs/cuvs/cluster/mg/kmeans/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 + +from cuvs.cluster.kmeans import KMeansParams + +from .kmeans import FitOutput, fit + +__all__ = ["FitOutput", "KMeansParams", "fit"] diff --git a/python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pxd b/python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pxd new file mode 100644 index 0000000000..7885d9ef51 --- /dev/null +++ b/python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pxd @@ -0,0 +1,19 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 +# +# cython: language_level=3 + +from cuvs.cluster.kmeans.kmeans cimport cuvsKMeansParams_t +from cuvs.common.c_api cimport cuvsError_t, cuvsResources_t +from cuvs.common.cydlpack cimport DLManagedTensor + + +cdef extern from "cuvs/cluster/mg_kmeans.h" nogil: + cuvsError_t cuvsMultiGpuKMeansFit(cuvsResources_t res, + cuvsKMeansParams_t params, + DLManagedTensor* X, + DLManagedTensor* sample_weight, + DLManagedTensor* centroids, + double* inertia, + int* n_iter) except + diff --git a/python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pyx b/python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pyx new file mode 100644 index 0000000000..bf313322a5 --- /dev/null +++ b/python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pyx @@ -0,0 +1,153 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# +# cython: language_level=3 + +from collections import namedtuple + +import numpy as np +from pylibraft.common.cai_wrapper import wrap_array +from pylibraft.common.interruptible import cuda_interruptible + +from cuvs.common.exceptions import check_cuvs +from cuvs.common.mg_resources import auto_sync_multi_gpu_resources +from cuvs.neighbors.common import _check_input_array + +from cuvs.cluster.kmeans.kmeans cimport KMeansParams +from cuvs.common cimport cydlpack +from cuvs.common.c_api cimport cuvsResources_t + +from .kmeans cimport cuvsMultiGpuKMeansFit + +FitOutput = namedtuple("FitOutput", "centroids inertia n_iter") + + +def _as_host_array(array, name, *, writable=False): + if hasattr(array, "__cuda_array_interface__") and not isinstance( + array, np.ndarray + ): + raise ValueError( + f"SNMG KMeans requires {name} to be in host memory" + ) + + if not isinstance(array, np.ndarray): + array = np.asarray(array) + + if array.ndim == 0: + raise ValueError(f"{name} must be an array") + + if not array.flags["C_CONTIGUOUS"]: + raise ValueError(f"{name} must have C contiguous layout") + + if writable and not 
array.flags["WRITEABLE"]: + raise ValueError(f"{name} must be writable") + + return array + + +@auto_sync_multi_gpu_resources +def fit( + KMeansParams params, X, centroids=None, sample_weights=None, + resources=None +): + """ + Find clusters with single-node multi-GPU k-means using host data. + + Parameters + ---------- + params : KMeansParams + Parameters to use to fit KMeans model. + X : host array-like + Training instances, shape (m, k). Must be C-contiguous float32 or + float64 host data. + centroids : host array-like, optional + Initial centroids when ``params.init_method == "Array"`` and output + centroids for all init methods. If omitted, a host NumPy output array + is allocated unless ``init_method == "Array"``. + sample_weights : host array-like, optional + Optional weights per observation. Must be C-contiguous and have the + same dtype as X. + {resources_docstring} + + Returns + ------- + FitOutput + ``centroids`` is a host NumPy array containing the computed centroids, + ``inertia`` is the final objective value, and ``n_iter`` is the number + of iterations run. 
+ """ + + if params.hierarchical: + raise ValueError("SNMG KMeans does not support hierarchical KMeans") + + X = _as_host_array(X, "X") + if X.ndim != 2: + raise ValueError("X must be a 2D array") + + x_ai = wrap_array(X) + _check_input_array( + x_ai, [np.dtype("float32"), np.dtype("float64")] + ) + + if centroids is None: + if params.init_method == "Array": + raise ValueError( + "centroids must be provided when init_method is 'Array'" + ) + centroids = np.empty( + (params.n_clusters, X.shape[1]), dtype=X.dtype + ) + else: + centroids = _as_host_array(centroids, "centroids", writable=True) + + if centroids.ndim != 2: + raise ValueError("centroids must be a 2D array") + if centroids.dtype != X.dtype: + raise TypeError("centroids dtype must match X dtype") + + centroids_ai = wrap_array(centroids) + _check_input_array( + centroids_ai, + [x_ai.dtype], + exp_rows=params.n_clusters, + exp_cols=X.shape[1], + ) + + cdef cydlpack.DLManagedTensor* sample_weight_dlpack = NULL + if sample_weights is not None: + sample_weights = _as_host_array(sample_weights, "sample_weights") + if sample_weights.ndim != 1: + raise ValueError("sample_weights must be a 1D array") + if sample_weights.dtype != X.dtype: + raise TypeError("sample_weights dtype must match X dtype") + + sample_weights_ai = wrap_array(sample_weights) + _check_input_array( + sample_weights_ai, + [x_ai.dtype], + exp_rows=X.shape[0], + exp_row_major=False, + ) + sample_weight_dlpack = cydlpack.dlpack_c(sample_weights_ai) + + cdef cydlpack.DLManagedTensor* x_dlpack = cydlpack.dlpack_c(x_ai) + cdef cydlpack.DLManagedTensor* centroids_dlpack = ( + cydlpack.dlpack_c(centroids_ai) + ) + cdef cuvsResources_t res = resources.get_c_obj() + + cdef double inertia = 0 + cdef int n_iter = 0 + + with cuda_interruptible(): + check_cuvs(cuvsMultiGpuKMeansFit( + res, + params.params, + x_dlpack, + sample_weight_dlpack, + centroids_dlpack, + &inertia, + &n_iter)) + + return FitOutput(centroids, inertia, n_iter) diff --git 
a/python/cuvs/cuvs/tests/test_kmeans.py b/python/cuvs/cuvs/tests/test_kmeans.py index 210ae06b80..d0905c2a5f 100644 --- a/python/cuvs/cuvs/tests/test_kmeans.py +++ b/python/cuvs/cuvs/tests/test_kmeans.py @@ -15,6 +15,19 @@ from cuvs.distance import pairwise_distance +def make_well_separated_kmeans_input(rng, n_rows, n_cols, n_clusters, dtype): + labels = np.arange(n_rows) % n_clusters + centers = rng.normal( + loc=0.0, scale=10.0, size=(n_clusters, n_cols) + ).astype(dtype) + noise = rng.normal(loc=0.0, scale=0.01, size=(n_rows, n_cols)).astype( + dtype + ) + X = centers[labels] + noise + initial_centroids = X[np.arange(n_clusters)].copy() + return np.ascontiguousarray(X), initial_centroids + + @pytest.mark.parametrize("n_rows", [100]) @pytest.mark.parametrize("n_cols", [5, 25]) @pytest.mark.parametrize("n_clusters", [5, 15]) @@ -80,7 +93,7 @@ def test_cluster_cost(n_rows, n_cols, n_clusters, dtype): @pytest.mark.parametrize("n_cols", [10, 100]) @pytest.mark.parametrize("n_clusters", [8, 16]) @pytest.mark.parametrize("streaming_batch_size", [0, 100, 239, 500]) -@pytest.mark.parametrize("dtype", [np.float64]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("weighted", [False, True]) def test_fit_host_matches_fit_device( n_rows, n_cols, n_clusters, streaming_batch_size, dtype, weighted @@ -91,10 +104,9 @@ def test_fit_host_matches_fit_device( Optionally tests with non-uniform sample weights. 
""" rng = np.random.default_rng(99) - X_host = rng.random((n_rows, n_cols)).astype(dtype) - - centroid_indices = rng.choice(n_rows, size=n_clusters, replace=False) - initial_centroids_host = X_host[centroid_indices].copy() + X_host, initial_centroids_host = make_well_separated_kmeans_input( + rng, n_rows, n_cols, n_clusters, dtype + ) if weighted: sample_weights_host = rng.uniform(0.5, 2.0, size=n_rows).astype(dtype) @@ -136,6 +148,15 @@ def test_fit_host_matches_fit_device( centroids_regular, centroids_batched, rtol=1e-3, atol=1e-3 ), f"max diff: {np.max(np.abs(centroids_regular - centroids_batched))}" + if dtype == np.float32: + # Weighted float32 inertia is sensitive to L2Expanded cancellation and + # reduction order. Centroids remain the primary parity check here. + inertia_tol = 1e-1 if weighted else 5e-2 + else: + inertia_tol = 1e-3 assert np.allclose( - inertia_regular, inertia_batched, rtol=1e-3, atol=1e-3 + inertia_regular, + inertia_batched, + rtol=inertia_tol, + atol=inertia_tol, ), f"max diff: {np.max(np.abs(inertia_regular - inertia_batched))}" diff --git a/python/cuvs/cuvs/tests/test_mg_kmeans.py b/python/cuvs/cuvs/tests/test_mg_kmeans.py new file mode 100644 index 0000000000..33c0d0f39e --- /dev/null +++ b/python/cuvs/cuvs/tests/test_mg_kmeans.py @@ -0,0 +1,220 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0

import numpy as np
import pytest

from cuvs.cluster.kmeans import KMeansParams
from cuvs.cluster.mg import kmeans as mg_kmeans
from cuvs.common import MultiGpuResources


def has_gpus(count=1):
    """Return True when CuPy reports at least ``count`` CUDA devices.

    Any exception raised while importing CuPy or querying the device count
    is swallowed and reported as False, so the check is safe to evaluate at
    import time on machines without a GPU stack.
    """
    try:
        import cupy as cp

        device_total = cp.cuda.runtime.getDeviceCount()
        return device_total >= count
    except Exception:
        return False


# Module-wide skip: every test below needs at least one visible GPU.
pytestmark = pytest.mark.skipif(
    not has_gpus(1), reason="SNMG KMeans tests require at least one GPU"
)

# NOTE(review): no test in this module applies this marker; presumably it is
# reserved for a two-GPU smoke test -- confirm or remove.
requires_multiple_gpus = pytest.mark.skipif(
    not has_gpus(2),
    reason="SNMG KMeans multi-GPU smoke test requires two GPUs",
)


def make_inputs(dtype, n_rows=256, n_cols=8, n_clusters=4):
    """Build reproducible, well-separated clustered data and seed centroids.

    Rows are assigned to clusters round-robin (row ``i`` belongs to cluster
    ``i % n_clusters``). Cluster centres are drawn with a standard deviation
    of 10 and per-row noise with a standard deviation of 0.01, both from a
    generator seeded with 123.

    Returns
    -------
    tuple
        ``(X, centroids)`` where ``X`` is C-contiguous with shape
        ``(n_rows, n_cols)`` and ``centroids`` is a copy of the first
        ``n_clusters`` rows of ``X`` (one row from each cluster).
    """
    generator = np.random.default_rng(123)
    cluster_of_row = np.arange(n_rows) % n_clusters

    # Draw order matters for reproducibility: centres first, then noise.
    centers = generator.normal(loc=0.0, scale=10.0, size=(n_clusters, n_cols))
    centers = centers.astype(dtype)
    jitter = generator.normal(loc=0.0, scale=0.01, size=(n_rows, n_cols))
    jitter = jitter.astype(dtype)

    points = centers[cluster_of_row] + jitter
    first_rows = np.arange(n_clusters)
    seeds = points[first_rows].copy()
    return np.ascontiguousarray(points), seeds


def make_sample_weights(dtype, n_rows):
    """Return ``n_rows`` reproducible weights drawn from uniform(0.5, 2.0)."""
    weights = np.random.default_rng(321).uniform(0.5, 2.0, size=n_rows)
    return weights.astype(dtype)


def predict_labels_host(X, centroids):
    """Assign each row of ``X`` to its nearest centroid on the host.

    Returns
    -------
    tuple
        ``(labels, row_distances)``: the index of the closest centroid for
        every row and the squared Euclidean distance to that centroid.
    """
    offsets = X[:, None, :] - centroids[None, :, :]
    sq_dist = (offsets ** 2).sum(axis=2)
    nearest = sq_dist.argmin(axis=1)
    row_index = np.arange(X.shape[0])
    return nearest, sq_dist[row_index, nearest]


def assert_same_label_partition(lhs, rhs):
    """Assert two labelings group the rows identically, ignoring label ids.

    Compares the pairwise "same label" matrices of both labelings, so a
    permutation of label values does not cause a failure.
    """
    left = np.asarray(lhs)
    right = np.asarray(rhs)
    together_left = np.equal.outer(left, left)
    together_right = np.equal.outer(right, right)
    assert np.array_equal(together_left, together_right)


def assert_valid_mg_fit_output(out, X, n_clusters):
    """Sanity-check the type, shape, dtype and finiteness of a fit result."""
    centers = out.centroids
    assert isinstance(centers, np.ndarray)
    assert centers.shape == (n_clusters, X.shape[1])
    assert centers.dtype == X.dtype
    assert np.all(np.isfinite(centers))
    assert np.isfinite(out.inertia)
    assert out.n_iter > 0


def assert_inertia_matches_centroids(out, X, sample_weights):
    """Assert the reported inertia agrees with a host-side recomputation.

    The reference value is the sum of squared distances from each row to its
    nearest returned centroid. When weights are given they are first rescaled
    to sum to the number of rows (presumably mirroring the normalisation done
    by the library -- TODO confirm) and applied per row.
    """
    row_distances = predict_labels_host(X, out.centroids)[1]
    if sample_weights is not None:
        scaled = sample_weights * X.shape[0] / sample_weights.sum()
        row_distances = row_distances * scaled

    # float32 gets a looser tolerance than float64.
    if X.dtype == np.float32:
        tolerance = 5e-2
    else:
        tolerance = 1e-3
    expected = np.sum(row_distances)
    assert np.allclose(out.inertia, expected, rtol=tolerance, atol=tolerance)


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("init_method", ["Array", "KMeansPlusPlus", "Random"])
@pytest.mark.parametrize("weighted", [False, True])
def test_mg_kmeans_fit_options(dtype, init_method, weighted):
    """Fit across dtype / init method / weighting and validate the result."""
    n_clusters = 4
    X, seed_centroids = make_inputs(dtype, n_clusters=n_clusters)
    resources = MultiGpuResources()

    sample_weights = (
        make_sample_weights(dtype, X.shape[0]) if weighted else None
    )

    random_init = init_method == "Random"
    params = KMeansParams(
        n_clusters=n_clusters,
        init_method=init_method,
        max_iter=20,
        tol=1e-10,
        n_init=3 if random_init else 1,
        init_size=X.shape[0],
        streaming_batch_size=37,
    )

    # Initial centroids are only supplied for the "Array" init method.
    if init_method == "Array":
        centroids = seed_centroids.copy()
    else:
        centroids = None

    mg_out = mg_kmeans.fit(
        params,
        X,
        centroids=centroids,
        sample_weights=sample_weights,
        resources=resources,
    )
    resources.sync()

    assert_valid_mg_fit_output(mg_out, X, n_clusters)
    assert_inertia_matches_centroids(mg_out, X, sample_weights)

    labels = predict_labels_host(X, mg_out.centroids)[0]
    if not random_init:
        # Exact recovery of the round-robin partition is only required for
        # the non-random init methods.
        expected_labels = np.arange(X.shape[0]) % n_clusters
        assert_same_label_partition(labels, expected_labels)
    assert len(np.unique(labels)) == n_clusters


def test_mg_kmeans_input_validation():
    """Each malformed input must be rejected with the expected error."""
    import cupy as cp

    n_clusters = 4
    X, centroids = make_inputs(np.float32, n_clusters=n_clusters)
    resources = MultiGpuResources()
    params = KMeansParams(n_clusters=n_clusters, init_method="Array")

    def expect_rejection(error_type, pattern, fit_params, data, **extra):
        # Run one fit call and require the given exception type and message.
        with pytest.raises(error_type, match=pattern):
            mg_kmeans.fit(fit_params, data, resources=resources, **extra)

    # "Array" init without initial centroids.
    expect_rejection(ValueError, "centroids must be provided", params, X)

    # Hierarchical k-means is not supported by this entry point.
    hierarchical_params = KMeansParams(
        n_clusters=n_clusters, hierarchical=True
    )
    expect_rejection(
        ValueError,
        "hierarchical",
        hierarchical_params,
        X,
        centroids=centroids.copy(),
    )

    # Device-resident X.
    expect_rejection(
        ValueError,
        "host memory",
        params,
        cp.asarray(X),
        centroids=centroids.copy(),
    )

    # Device-resident centroids.
    expect_rejection(
        ValueError,
        "host memory",
        params,
        X,
        centroids=cp.asarray(centroids),
    )

    # Fortran-ordered X.
    expect_rejection(
        ValueError,
        "C contiguous",
        params,
        np.asfortranarray(X),
        centroids=centroids.copy(),
    )

    # Centroid dtype differing from X.
    expect_rejection(
        TypeError,
        "centroids dtype",
        params,
        X,
        centroids=centroids.astype(np.float64),
    )

    # One centroid row too few.
    expect_rejection(
        ValueError,
        "Incorrect number of rows",
        params,
        X,
        centroids=centroids[: n_clusters - 1].copy(),
    )

    # 2D sample weights.
    expect_rejection(
        ValueError,
        "sample_weights must be a 1D",
        params,
        X,
        centroids=centroids.copy(),
        sample_weights=np.ones((X.shape[0], 1), dtype=X.dtype),
    )

    # Sample-weight dtype differing from X.
    expect_rejection(
        TypeError,
        "sample_weights dtype",
        params,
        X,
        centroids=centroids.copy(),
        sample_weights=np.ones(X.shape[0], dtype=np.float64),
    )

    # Device-resident sample weights.
    expect_rejection(
        ValueError,
        "host memory",
        params,
        X,
        centroids=centroids.copy(),
        sample_weights=cp.ones(X.shape[0], dtype=cp.float32),
    )
--git a/python/cuvs/cuvs/tests/test_kmeans.py b/python/cuvs/cuvs/tests/test_kmeans.py index d0905c2a5f..210ae06b80 100644 --- a/python/cuvs/cuvs/tests/test_kmeans.py +++ b/python/cuvs/cuvs/tests/test_kmeans.py @@ -15,19 +15,6 @@ from cuvs.distance import pairwise_distance -def make_well_separated_kmeans_input(rng, n_rows, n_cols, n_clusters, dtype): - labels = np.arange(n_rows) % n_clusters - centers = rng.normal( - loc=0.0, scale=10.0, size=(n_clusters, n_cols) - ).astype(dtype) - noise = rng.normal(loc=0.0, scale=0.01, size=(n_rows, n_cols)).astype( - dtype - ) - X = centers[labels] + noise - initial_centroids = X[np.arange(n_clusters)].copy() - return np.ascontiguousarray(X), initial_centroids - - @pytest.mark.parametrize("n_rows", [100]) @pytest.mark.parametrize("n_cols", [5, 25]) @pytest.mark.parametrize("n_clusters", [5, 15]) @@ -93,7 +80,7 @@ def test_cluster_cost(n_rows, n_cols, n_clusters, dtype): @pytest.mark.parametrize("n_cols", [10, 100]) @pytest.mark.parametrize("n_clusters", [8, 16]) @pytest.mark.parametrize("streaming_batch_size", [0, 100, 239, 500]) -@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("dtype", [np.float64]) @pytest.mark.parametrize("weighted", [False, True]) def test_fit_host_matches_fit_device( n_rows, n_cols, n_clusters, streaming_batch_size, dtype, weighted @@ -104,9 +91,10 @@ def test_fit_host_matches_fit_device( Optionally tests with non-uniform sample weights. 
""" rng = np.random.default_rng(99) - X_host, initial_centroids_host = make_well_separated_kmeans_input( - rng, n_rows, n_cols, n_clusters, dtype - ) + X_host = rng.random((n_rows, n_cols)).astype(dtype) + + centroid_indices = rng.choice(n_rows, size=n_clusters, replace=False) + initial_centroids_host = X_host[centroid_indices].copy() if weighted: sample_weights_host = rng.uniform(0.5, 2.0, size=n_rows).astype(dtype) @@ -148,15 +136,6 @@ def test_fit_host_matches_fit_device( centroids_regular, centroids_batched, rtol=1e-3, atol=1e-3 ), f"max diff: {np.max(np.abs(centroids_regular - centroids_batched))}" - if dtype == np.float32: - # Weighted float32 inertia is sensitive to L2Expanded cancellation and - # reduction order. Centroids remain the primary parity check here. - inertia_tol = 1e-1 if weighted else 5e-2 - else: - inertia_tol = 1e-3 assert np.allclose( - inertia_regular, - inertia_batched, - rtol=inertia_tol, - atol=inertia_tol, + inertia_regular, inertia_batched, rtol=1e-3, atol=1e-3 ), f"max diff: {np.max(np.abs(inertia_regular - inertia_batched))}"