From 89f2ca05dd76ed10598ce8ac7d6dd93689155d63 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Tue, 2 Jun 2026 15:02:31 +0200
Subject: [PATCH 1/2] SNMG Batched KMeans Python API

---
 c/CMakeLists.txt                              |   1 +
 c/include/cuvs/cluster/mg_kmeans.h            |  92 ++++++++
 c/include/cuvs/core/all.h                     |   1 +
 c/src/cluster/mg_kmeans.cpp                   | 223 ++++++++++++++++++
 c/tests/CMakeLists.txt                        |   3 +
 c/tests/cluster/kmeans_mg_c.cu                | 132 +++++++++++
 fern/docs.yml                                 |   4 +
 fern/pages/c_api/c-api-cluster-mg-kmeans.md   |  77 ++++++
 fern/pages/c_api/index.md                     |   1 +
 fern/pages/python_api/index.md                |   1 +
 .../python-api-cluster-mg-kmeans.md           |  34 +++
 python/cuvs/cuvs/cluster/CMakeLists.txt       |   3 +-
 python/cuvs/cuvs/cluster/__init__.py          |   6 +-
 python/cuvs/cuvs/cluster/kmeans/kmeans.pxd    |   4 +
 python/cuvs/cuvs/cluster/kmeans/kmeans.pyx    |   2 -
 python/cuvs/cuvs/cluster/mg/CMakeLists.txt    |   8 +
 python/cuvs/cuvs/cluster/mg/__init__.py       |   6 +
 .../cuvs/cluster/mg/kmeans/CMakeLists.txt     |  17 ++
 .../cuvs/cuvs/cluster/mg/kmeans/__init__.py   |   8 +
 python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pxd |  19 ++
 python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pyx | 153 ++++++++++++
 python/cuvs/cuvs/tests/test_kmeans.py         |  33 ++-
 python/cuvs/cuvs/tests/test_mg_kmeans.py      | 220 +++++++++++++++++
 23 files changed, 1036 insertions(+), 12 deletions(-)
 create mode 100644 c/include/cuvs/cluster/mg_kmeans.h
 create mode 100644 c/src/cluster/mg_kmeans.cpp
 create mode 100644 c/tests/cluster/kmeans_mg_c.cu
 create mode 100644 fern/pages/c_api/c-api-cluster-mg-kmeans.md
 create mode 100644 fern/pages/python_api/python-api-cluster-mg-kmeans.md
 create mode 100644 python/cuvs/cuvs/cluster/mg/CMakeLists.txt
 create mode 100644 python/cuvs/cuvs/cluster/mg/__init__.py
 create mode 100644 python/cuvs/cuvs/cluster/mg/kmeans/CMakeLists.txt
 create mode 100644 python/cuvs/cuvs/cluster/mg/kmeans/__init__.py
 create mode 100644 python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pxd
 create mode 100644 python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pyx
 create mode 100644 python/cuvs/cuvs/tests/test_mg_kmeans.py

diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt
index be4bc7a051..beb995c6c3 100644
--- a/c/CMakeLists.txt
+++ b/c/CMakeLists.txt
@@ -86,6 +86,7 @@ add_library(
   cuvs_c SHARED
   src/core/c_api.cpp
   src/cluster/kmeans.cpp
+  $<$<BOOL:${BUILD_MG_ALGOS}>:src/cluster/mg_kmeans.cpp>
   src/neighbors/brute_force.cpp
   src/neighbors/ivf_flat.cpp
   src/neighbors/ivf_pq.cpp
diff --git a/c/include/cuvs/cluster/mg_kmeans.h b/c/include/cuvs/cluster/mg_kmeans.h
new file mode 100644
index 0000000000..85a0c82dc6
--- /dev/null
+++ b/c/include/cuvs/cluster/mg_kmeans.h
@@ -0,0 +1,92 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <cuvs/cluster/kmeans.h>
+#include <cuvs/core/c_api.h>
+#include <dlpack/dlpack.h>
+#include <stdint.h>
+
+#include <cuvs/core/export.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @defgroup mg_kmeans_c Multi-GPU k-means clustering APIs
+ * @{
+ */
+
+/**
+ * @brief Find clusters with single-node multi-GPU k-means using host data.
+ *
+ * X, sample_weight, and centroids must be host-accessible, row-major,
+ * C-contiguous DLPack tensors. X and centroids must have dtype float32 or
+ * float64, and sample_weight must match X when provided.
+ *
+ * @note In cuVS 26.08 (next ABI major version) this signature will be
+ * replaced by cuvsMultiGpuKMeansFit_v2.
+ *
+ * @param[in]     res           cuvsResources_t opaque C handle backed by
+ *                              multi-GPU resources, created by cuvsMultiGpuResourcesCreate
+ *                              or cuvsMultiGpuResourcesCreateWithDeviceIds.
+ * @param[in]     params        Parameters for KMeans model.
+ * @param[in]     X             Host training instances to cluster.
+ *                              [dim = n_samples x n_features]
+ * @param[in]     sample_weight Optional host weights for each observation in X.
+ *                              [len = n_samples]
+ * @param[inout]  centroids     Host centroids. When init is Array, used as the
+ *                              initial cluster centers. The final generated
+ *                              centroids are copied back to this tensor.
+ *                              [dim = n_clusters x n_features]
+ * @param[out]    inertia       Sum of squared distances of samples to their
+ *                              closest cluster center.
+ * @param[out]    n_iter        Number of iterations run.
+ */
+CUVS_EXPORT cuvsError_t cuvsMultiGpuKMeansFit(cuvsResources_t res,
+                                              cuvsKMeansParams_t params,
+                                              DLManagedTensor* X,
+                                              DLManagedTensor* sample_weight,
+                                              DLManagedTensor* centroids,
+                                              double* inertia,
+                                              int* n_iter);
+
+/**
+ * @brief Find clusters with single-node multi-GPU k-means (v2 params layout).
+ *
+ * Mirrors cuvsMultiGpuKMeansFit but takes cuvsKMeansParams_v2_t. Will become
+ * the unsuffixed cuvsMultiGpuKMeansFit in cuVS 26.08.
+ *
+ * @param[in]     res           cuvsResources_t opaque C handle backed by multi-GPU resources.
+ * @param[in]     params        Parameters for KMeans model (v2 layout).
+ * @param[in]     X             Host training instances to cluster.
+ *                              [dim = n_samples x n_features]
+ * @param[in]     sample_weight Optional host weights for each observation in X.
+ *                              [len = n_samples]
+ * @param[inout]  centroids     Host centroids. When init is Array, used as the
+ *                              initial cluster centers. The final generated
+ *                              centroids are copied back to this tensor.
+ *                              [dim = n_clusters x n_features]
+ * @param[out]    inertia       Sum of squared distances of samples to their
+ *                              closest cluster center.
+ * @param[out]    n_iter        Number of iterations run.
+ */
+CUVS_EXPORT cuvsError_t cuvsMultiGpuKMeansFit_v2(cuvsResources_t res,
+                                                 cuvsKMeansParams_v2_t params,
+                                                 DLManagedTensor* X,
+                                                 DLManagedTensor* sample_weight,
+                                                 DLManagedTensor* centroids,
+                                                 double* inertia,
+                                                 int* n_iter);
+
+/**
+ * @}
+ */
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/c/include/cuvs/core/all.h b/c/include/cuvs/core/all.h
index 545c7ec6f4..8e8caafcd8 100644
--- a/c/include/cuvs/core/all.h
+++ b/c/include/cuvs/core/all.h
@@ -34,6 +34,7 @@
 #endif
 
 #ifdef CUVS_BUILD_MG_ALGOS
+  #include <cuvs/cluster/mg_kmeans.h>
   #include <cuvs/neighbors/mg_cagra.h>
   #include <cuvs/neighbors/mg_common.h>
   #include <cuvs/neighbors/mg_ivf_flat.h>
diff --git a/c/src/cluster/mg_kmeans.cpp b/c/src/cluster/mg_kmeans.cpp
new file mode 100644
index 0000000000..06255b6f71
--- /dev/null
+++ b/c/src/cluster/mg_kmeans.cpp
@@ -0,0 +1,223 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <cstdint>
+#include <optional>
+
+#include <dlpack/dlpack.h>
+
+#include <cuvs/cluster/kmeans.hpp>
+#include <cuvs/cluster/mg_kmeans.h>
+#include <cuvs/core/c_api.h>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/multi_gpu.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+#include "../core/exceptions.hpp"
+#include "../core/interop.hpp"
+
+namespace {
+
+template <typename ParamsT>
+cuvs::cluster::kmeans::params convert_params(const ParamsT& params)
+{
+  auto kmeans_params       = cuvs::cluster::kmeans::params();
+  kmeans_params.metric     = static_cast<cuvs::distance::DistanceType>(params.metric);
+  kmeans_params.init       = static_cast<cuvs::cluster::kmeans::params::InitMethod>(params.init);
+  kmeans_params.n_clusters = params.n_clusters;
+  kmeans_params.max_iter   = params.max_iter;
+  kmeans_params.tol        = params.tol;
+  kmeans_params.n_init     = params.n_init;
+  kmeans_params.oversampling_factor  = params.oversampling_factor;
+  kmeans_params.batch_samples        = params.batch_samples;
+  kmeans_params.batch_centroids      = params.batch_centroids;
+  kmeans_params.init_size            = params.init_size;
+  kmeans_params.streaming_batch_size = params.streaming_batch_size;
+  return kmeans_params;
+}
+
+void validate_host_tensor(DLManagedTensor* tensor, const char* name)
+{
+  RAFT_EXPECTS(tensor != nullptr, "%s must not be NULL", name);
+  auto dl_tensor = tensor->dl_tensor;
+  RAFT_EXPECTS(dl_tensor.data != nullptr, "%s data must not be NULL", name);
+  RAFT_EXPECTS(dl_tensor.shape != nullptr, "%s shape must not be NULL", name);
+  RAFT_EXPECTS(
+    cuvs::core::is_dlpack_host_compatible(dl_tensor), "%s must be host accessible", name);
+  RAFT_EXPECTS(dl_tensor.device.device_type != kDLCUDA, "%s must reside in host memory", name);
+  RAFT_EXPECTS(cuvs::core::is_c_contiguous(tensor), "%s must be C-contiguous", name);
+}
+
+bool dtype_equal(const DLTensor& lhs, const DLTensor& rhs)
+{
+  return lhs.dtype.code == rhs.dtype.code && lhs.dtype.bits == rhs.dtype.bits &&
+         lhs.dtype.lanes == rhs.dtype.lanes;
+}
+
+void validate_float_dtype(const DLTensor& tensor, const char* name)
+{
+  RAFT_EXPECTS(
+    tensor.dtype.code == kDLFloat && (tensor.dtype.bits == 32 || tensor.dtype.bits == 64),
+    "%s must have dtype float32 or float64",
+    name);
+  RAFT_EXPECTS(tensor.dtype.lanes == 1, "%s must have one DLPack lane", name);
+}
+
+template <typename ParamsT>
+void validate_inputs(const ParamsT& params,
+                     DLManagedTensor* X_tensor,
+                     DLManagedTensor* sample_weight_tensor,
+                     DLManagedTensor* centroids_tensor)
+{
+  RAFT_EXPECTS(params.n_clusters > 0, "n_clusters must be positive");
+  RAFT_EXPECTS(!params.hierarchical, "hierarchical kmeans is not supported by SNMG kmeans");
+
+  validate_host_tensor(X_tensor, "X");
+  validate_host_tensor(centroids_tensor, "centroids");
+
+  auto X         = X_tensor->dl_tensor;
+  auto centroids = centroids_tensor->dl_tensor;
+
+  RAFT_EXPECTS(X.ndim == 2, "X must be a 2D tensor");
+  RAFT_EXPECTS(centroids.ndim == 2, "centroids must be a 2D tensor");
+  RAFT_EXPECTS(X.shape[0] > 0, "X must have at least one row");
+  RAFT_EXPECTS(X.shape[1] > 0, "X must have at least one column");
+  RAFT_EXPECTS(centroids.shape[0] == params.n_clusters,
+               "centroids row count must equal n_clusters");
+  RAFT_EXPECTS(centroids.shape[1] == X.shape[1],
+               "centroids column count must equal X column count");
+
+  validate_float_dtype(X, "X");
+  RAFT_EXPECTS(dtype_equal(X, centroids), "centroids dtype must match X dtype");
+
+  if (sample_weight_tensor != nullptr) {
+    validate_host_tensor(sample_weight_tensor, "sample_weight");
+    auto sample_weight = sample_weight_tensor->dl_tensor;
+    RAFT_EXPECTS(sample_weight.ndim == 1, "sample_weight must be a 1D tensor");
+    RAFT_EXPECTS(sample_weight.shape[0] == X.shape[0],
+                 "sample_weight length must equal X row count");
+    RAFT_EXPECTS(dtype_equal(X, sample_weight), "sample_weight dtype must match X dtype");
+  }
+}
+
+template <typename T, typename ParamsT, typename IdxT = int64_t>
+void fit_snmg(cuvsResources_t res,
+              const ParamsT& params,
+              DLManagedTensor* X_tensor,
+              DLManagedTensor* sample_weight_tensor,
+              DLManagedTensor* centroids_tensor,
+              double* inertia,
+              int* n_iter)
+{
+  auto res_ptr = reinterpret_cast<raft::resources*>(res);
+  RAFT_EXPECTS(res_ptr != nullptr, "res must not be NULL");
+  RAFT_EXPECTS(raft::resource::is_multi_gpu(*res_ptr),
+               "cuvsMultiGpuKMeansFit requires a MultiGpuResources handle");
+
+  auto X         = X_tensor->dl_tensor;
+  auto centroids = centroids_tensor->dl_tensor;
+
+  auto n_samples  = static_cast<IdxT>(X.shape[0]);
+  auto n_features = static_cast<IdxT>(X.shape[1]);
+  auto n_clusters = static_cast<IdxT>(params.n_clusters);
+
+  auto X_view = raft::make_host_matrix_view<T const, IdxT>(
+    reinterpret_cast<T const*>(X.data), n_samples, n_features);
+
+  std::optional<raft::host_vector_view<T const, IdxT>> sample_weight;
+  if (sample_weight_tensor != nullptr) {
+    auto sw = sample_weight_tensor->dl_tensor;
+    sample_weight =
+      raft::make_host_vector_view<T const, IdxT>(reinterpret_cast<T const*>(sw.data), n_samples);
+  }
+
+  auto const& rank0_res  = raft::resource::set_current_device_to_rank(*res_ptr, 0);
+  auto stream            = raft::resource::get_cuda_stream(rank0_res);
+  auto d_centroids       = raft::make_device_matrix<T, IdxT>(rank0_res, n_clusters, n_features);
+  auto n_centroid_values = n_clusters * n_features;
+
+  if (params.init == Array) {
+    raft::update_device(d_centroids.data_handle(),
+                        reinterpret_cast<T const*>(centroids.data),
+                        n_centroid_values,
+                        stream);
+    raft::resource::sync_stream(rank0_res, stream);
+  }
+
+  T inertia_temp     = T{0};
+  IdxT n_iter_temp   = IdxT{0};
+  auto kmeans_params = convert_params(params);
+  cuvs::cluster::kmeans::fit(*res_ptr,
+                             kmeans_params,
+                             X_view,
+                             sample_weight,
+                             d_centroids.view(),
+                             raft::make_host_scalar_view<T>(&inertia_temp),
+                             raft::make_host_scalar_view<IdxT>(&n_iter_temp));
+
+  raft::update_host(
+    reinterpret_cast<T*>(centroids.data), d_centroids.data_handle(), n_centroid_values, stream);
+  raft::resource::sync_stream(rank0_res, stream);
+
+  *inertia = static_cast<double>(inertia_temp);
+  *n_iter  = static_cast<int>(n_iter_temp);
+}
+
+template <typename ParamsT>
+void dispatch_fit(cuvsResources_t res,
+                  ParamsT params,
+                  DLManagedTensor* X,
+                  DLManagedTensor* sample_weight,
+                  DLManagedTensor* centroids,
+                  double* inertia,
+                  int* n_iter)
+{
+  RAFT_EXPECTS(res != 0, "res must not be NULL");
+  RAFT_EXPECTS(params != nullptr, "params must not be NULL");
+  RAFT_EXPECTS(inertia != nullptr, "inertia must not be NULL");
+  RAFT_EXPECTS(n_iter != nullptr, "n_iter must not be NULL");
+
+  validate_inputs(*params, X, sample_weight, centroids);
+
+  auto dataset = X->dl_tensor;
+  if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 32) {
+    fit_snmg<float>(res, *params, X, sample_weight, centroids, inertia, n_iter);
+  } else if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 64) {
+    fit_snmg<double>(res, *params, X, sample_weight, centroids, inertia, n_iter);
+  } else {
+    RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d",
+              dataset.dtype.code,
+              dataset.dtype.bits);
+  }
+}
+
+}  // namespace
+
+extern "C" cuvsError_t cuvsMultiGpuKMeansFit(cuvsResources_t res,
+                                             cuvsKMeansParams_t params,
+                                             DLManagedTensor* X,
+                                             DLManagedTensor* sample_weight,
+                                             DLManagedTensor* centroids,
+                                             double* inertia,
+                                             int* n_iter)
+{
+  return cuvs::core::translate_exceptions(
+    [=] { dispatch_fit(res, params, X, sample_weight, centroids, inertia, n_iter); });
+}
+
+extern "C" cuvsError_t cuvsMultiGpuKMeansFit_v2(cuvsResources_t res,
+                                                cuvsKMeansParams_v2_t params,
+                                                DLManagedTensor* X,
+                                                DLManagedTensor* sample_weight,
+                                                DLManagedTensor* centroids,
+                                                double* inertia,
+                                                int* n_iter)
+{
+  return cuvs::core::translate_exceptions(
+    [=] { dispatch_fit(res, params, X, sample_weight, centroids, inertia, n_iter); });
+}
diff --git a/c/tests/CMakeLists.txt b/c/tests/CMakeLists.txt
index f1cff7824e..614af5e955 100644
--- a/c/tests/CMakeLists.txt
+++ b/c/tests/CMakeLists.txt
@@ -79,6 +79,9 @@ ConfigureTest(
   NAME DISTANCE_C_TEST PATH distance/run_pairwise_distance_c.c distance/pairwise_distance_c.cu
 )
 ConfigureTest(NAME KMEANS_C_TEST PATH cluster/kmeans_c.cu)
+if(BUILD_MG_ALGOS)
+  ConfigureTest(NAME KMEANS_MG_C_TEST PATH cluster/kmeans_mg_c.cu)
+endif()
 ConfigureTest(NAME BRUTEFORCE_C_TEST PATH neighbors/run_brute_force_c.c neighbors/brute_force_c.cu)
 ConfigureTest(NAME IVF_FLAT_C_TEST PATH neighbors/run_ivf_flat_c.c neighbors/ann_ivf_flat_c.cu)
 ConfigureTest(NAME IVF_PQ_C_TEST PATH neighbors/run_ivf_pq_c.c neighbors/ann_ivf_pq_c.cu)
diff --git a/c/tests/cluster/kmeans_mg_c.cu b/c/tests/cluster/kmeans_mg_c.cu
new file mode 100644
index 0000000000..526a5c45fa
--- /dev/null
+++ b/c/tests/cluster/kmeans_mg_c.cu
@@ -0,0 +1,132 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef CUVS_BUILD_MG_ALGOS
+#error "KMEANS_MG_C_TEST requires BUILD_MG_ALGOS"
+#endif
+
+#include "../../src/core/interop.hpp"
+#include <cuvs/cluster/kmeans.h>
+#include <cuvs/cluster/mg_kmeans.h>
+#include <cuvs/core/c_api.h>
+
+#include <gtest/gtest.h>
+
+#include <raft/core/host_mdspan.hpp>
+
+#include <cstdint>
+
+namespace {
+
+constexpr int64_t kNSamples  = 8;
+constexpr int64_t kNFeatures = 2;
+constexpr int kNClusters     = 2;
+
+float kDataset[kNSamples][kNFeatures] = {
+  {1.0f, 1.0f},
+  {1.0f, 2.0f},
+  {2.0f, 1.0f},
+  {2.0f, 2.0f},
+  {10.0f, 10.0f},
+  {10.0f, 11.0f},
+  {11.0f, 10.0f},
+  {11.0f, 11.0f},
+};
+
+float kExpectedCentroids[kNClusters * kNFeatures] = {1.5f, 1.5f, 10.5f, 10.5f};
+
+// 8 points, each at squared distance 0.5 from its cluster mean -> 4.0.
+constexpr double kExpectedInertia = 4.0;
+
+struct kmeans_mg_api_v1 {
+  using params_t = cuvsKMeansParams_t;
+  static cuvsError_t params_create(params_t* p) { return cuvsKMeansParamsCreate(p); }
+  static cuvsError_t params_destroy(params_t p) { return cuvsKMeansParamsDestroy(p); }
+  static cuvsError_t fit(cuvsResources_t res,
+                         params_t params,
+                         DLManagedTensor* dataset,
+                         DLManagedTensor* centroids,
+                         double* inertia,
+                         int* n_iter)
+  {
+    return cuvsMultiGpuKMeansFit(res, params, dataset, NULL, centroids, inertia, n_iter);
+  }
+};
+
+struct kmeans_mg_api_v2 {
+  using params_t = cuvsKMeansParams_v2_t;
+  static cuvsError_t params_create(params_t* p) { return cuvsKMeansParamsCreate_v2(p); }
+  static cuvsError_t params_destroy(params_t p) { return cuvsKMeansParamsDestroy_v2(p); }
+  static cuvsError_t fit(cuvsResources_t res,
+                         params_t params,
+                         DLManagedTensor* dataset,
+                         DLManagedTensor* centroids,
+                         double* inertia,
+                         int* n_iter)
+  {
+    return cuvsMultiGpuKMeansFit_v2(res, params, dataset, NULL, centroids, inertia, n_iter);
+  }
+};
+
+template <typename Api>
+void test_mg_fit_host()
+{
+  float centroids_h[kNClusters][kNFeatures] = {
+    {0.0f, 0.0f},
+    {12.0f, 12.0f},
+  };
+
+  cuvsResources_t res;
+  int32_t device_ids[1] = {0};
+  DLManagedTensor device_ids_t{};
+  cuvs::core::to_dlpack(raft::make_host_vector_view<int32_t, int64_t>(device_ids, 1),
+                        &device_ids_t);
+  ASSERT_EQ(cuvsMultiGpuResourcesCreateWithDeviceIds(&res, &device_ids_t), CUVS_SUCCESS);
+  device_ids_t.deleter(&device_ids_t);
+
+  typename Api::params_t params;
+  ASSERT_EQ(Api::params_create(&params), CUVS_SUCCESS);
+  params->n_clusters           = kNClusters;
+  params->max_iter             = 100;
+  params->tol                  = 1e-6;
+  params->init                 = Array;
+  params->streaming_batch_size = 4;  // force at least 2 streamed batches
+
+  DLManagedTensor dataset_t{};
+  cuvs::core::to_dlpack(raft::make_host_matrix_view<float, int64_t>(
+                          reinterpret_cast<float*>(kDataset), kNSamples, kNFeatures),
+                        &dataset_t);
+
+  DLManagedTensor centroids_t{};
+  cuvs::core::to_dlpack(raft::make_host_matrix_view<float, int64_t>(
+                          reinterpret_cast<float*>(centroids_h), kNClusters, kNFeatures),
+                        &centroids_t);
+
+  double inertia = -1.0;
+  int n_iter     = -1;
+
+  ASSERT_EQ(Api::fit(res, params, &dataset_t, &centroids_t, &inertia, &n_iter), CUVS_SUCCESS);
+
+  auto* centroids_data = reinterpret_cast<float*>(centroids_h);
+  for (int i = 0; i < kNClusters * kNFeatures; ++i) {
+    EXPECT_NEAR(centroids_data[i], kExpectedCentroids[i], 1e-4f);
+  }
+
+  EXPECT_GT(n_iter, 0);
+  EXPECT_NEAR(inertia, kExpectedInertia, 1e-4);
+
+  centroids_t.deleter(&centroids_t);
+  dataset_t.deleter(&dataset_t);
+
+  ASSERT_EQ(Api::params_destroy(params), CUVS_SUCCESS);
+  ASSERT_EQ(cuvsMultiGpuResourcesDestroy(res), CUVS_SUCCESS);
+}
+
+}  // namespace
+
+TEST(KMeansMgC, FitHost) { test_mg_fit_host<kmeans_mg_api_v1>(); }
+// TODO(cuVS 26.08): remove FitHostV2 once `_v2` is promoted to the
+// unsuffixed ABI.
+TEST(KMeansMgC, FitHostV2) { test_mg_fit_host<kmeans_mg_api_v2>(); }
diff --git a/fern/docs.yml b/fern/docs.yml
index db239ce019..c9e8e1dc9d 100644
--- a/fern/docs.yml
+++ b/fern/docs.yml
@@ -246,6 +246,8 @@ navigation:
         contents:
           - page: "Cluster Kmeans"
             path: "./pages/c_api/c-api-cluster-kmeans.md"
+          - page: "Cluster Multi GPU Kmeans"
+            path: "./pages/c_api/c-api-cluster-mg-kmeans.md"
           - page: "Core C API"
             path: "./pages/c_api/c-api-core-c-api.md"
           - page: "Distance Distance"
@@ -385,6 +387,8 @@ navigation:
         contents:
           - page: "Cluster Kmeans"
             path: "./pages/python_api/python-api-cluster-kmeans.md"
+          - page: "Cluster Multi GPU Kmeans"
+            path: "./pages/python_api/python-api-cluster-mg-kmeans.md"
           - page: "Common"
             path: "./pages/python_api/python-api-common.md"
           - page: "Distance"
diff --git a/fern/pages/c_api/c-api-cluster-mg-kmeans.md b/fern/pages/c_api/c-api-cluster-mg-kmeans.md
new file mode 100644
index 0000000000..899ede902c
--- /dev/null
+++ b/fern/pages/c_api/c-api-cluster-mg-kmeans.md
@@ -0,0 +1,77 @@
+---
+slug: api-reference/c-api-cluster-mg-kmeans
+---
+
+# Multi-GPU K-Means
+
+_Source header: `cuvs/cluster/mg_kmeans.h`_
+
+## Multi-GPU k-means clustering APIs
+
+<a id="cuvsmultigpukmeansfit"></a>
+### cuvsMultiGpuKMeansFit
+
+Find clusters with single-node multi-GPU k-means using host data.
+
+```c
+CUVS_EXPORT cuvsError_t cuvsMultiGpuKMeansFit(cuvsResources_t res,
+cuvsKMeansParams_t params,
+DLManagedTensor* X,
+DLManagedTensor* sample_weight,
+DLManagedTensor* centroids,
+double* inertia,
+int* n_iter);
+```
+
+X, sample_weight, and centroids must be host-accessible, row-major, C-contiguous DLPack tensors. X and centroids must have dtype float32 or float64, and sample_weight must match X when provided.
+
+**Note:** In cuVS 26.08 (next ABI major version) this signature will be replaced by cuvsMultiGpuKMeansFit_v2.
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `res` | in | [`cuvsResources_t`](/api-reference/c-api-core-c-api#cuvsresources-t) | cuvsResources_t opaque C handle backed by multi-GPU resources, created by cuvsMultiGpuResourcesCreate or cuvsMultiGpuResourcesCreateWithDeviceIds. |
+| `params` | in | [`cuvsKMeansParams_t`](/api-reference/c-api-cluster-kmeans#cuvskmeansparams) | Parameters for KMeans model. |
+| `X` | in | `DLManagedTensor*` | Host training instances to cluster. [dim = n_samples x n_features] |
+| `sample_weight` | in | `DLManagedTensor*` | Optional host weights for each observation in X. [len = n_samples] |
+| `centroids` | inout | `DLManagedTensor*` | Host centroids. When init is Array, used as the initial cluster centers. The final generated centroids are copied back to this tensor. [dim = n_clusters x n_features] |
+| `inertia` | out | `double*` | Sum of squared distances of samples to their closest cluster center. |
+| `n_iter` | out | `int*` | Number of iterations run. |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
+
+<a id="cuvsmultigpukmeansfit-v2"></a>
+### cuvsMultiGpuKMeansFit_v2
+
+Find clusters with single-node multi-GPU k-means (v2 params layout).
+
+```c
+CUVS_EXPORT cuvsError_t cuvsMultiGpuKMeansFit_v2(cuvsResources_t res,
+cuvsKMeansParams_v2_t params,
+DLManagedTensor* X,
+DLManagedTensor* sample_weight,
+DLManagedTensor* centroids,
+double* inertia,
+int* n_iter);
+```
+
+Mirrors cuvsMultiGpuKMeansFit but takes cuvsKMeansParams_v2_t. Will become the unsuffixed cuvsMultiGpuKMeansFit in cuVS 26.08.
+
+**Parameters**
+
+| Name | Direction | Type | Description |
+| --- | --- | --- | --- |
+| `res` | in | [`cuvsResources_t`](/api-reference/c-api-core-c-api#cuvsresources-t) | cuvsResources_t opaque C handle backed by multi-GPU resources. |
+| `params` | in | [`cuvsKMeansParams_v2_t`](/api-reference/c-api-cluster-kmeans#cuvskmeansparams-v2) | Parameters for KMeans model (v2 layout). |
+| `X` | in | `DLManagedTensor*` | Host training instances to cluster. [dim = n_samples x n_features] |
+| `sample_weight` | in | `DLManagedTensor*` | Optional host weights for each observation in X. [len = n_samples] |
+| `centroids` | inout | `DLManagedTensor*` | Host centroids. When init is Array, used as the initial cluster centers. The final generated centroids are copied back to this tensor. [dim = n_clusters x n_features] |
+| `inertia` | out | `double*` | Sum of squared distances of samples to their closest cluster center. |
+| `n_iter` | out | `int*` | Number of iterations run. |
+
+**Returns**
+
+[`CUVS_EXPORT cuvsError_t`](/api-reference/c-api-core-c-api#cuvserror-t)
diff --git a/fern/pages/c_api/index.md b/fern/pages/c_api/index.md
index 8c72554709..d98f4cebe5 100644
--- a/fern/pages/c_api/index.md
+++ b/fern/pages/c_api/index.md
@@ -5,6 +5,7 @@ These pages are generated from the documented public headers in the cuVS source
 ## Cluster
 
 - [K-Means](/api-reference/c-api-cluster-kmeans)
+- [Multi-GPU K-Means](/api-reference/c-api-cluster-mg-kmeans)
 
 ## Common
 
diff --git a/fern/pages/python_api/index.md b/fern/pages/python_api/index.md
index 41773574fa..c2a7ae9e78 100644
--- a/fern/pages/python_api/index.md
+++ b/fern/pages/python_api/index.md
@@ -5,6 +5,7 @@ These pages are generated from the Python and Cython sources under `python/cuvs/
 ## Cluster
 
 - [Kmeans](/api-reference/python-api-cluster-kmeans)
+- [Multi-GPU Kmeans](/api-reference/python-api-cluster-mg-kmeans)
 
 ## Common
 
diff --git a/fern/pages/python_api/python-api-cluster-mg-kmeans.md b/fern/pages/python_api/python-api-cluster-mg-kmeans.md
new file mode 100644
index 0000000000..3b06cccc03
--- /dev/null
+++ b/fern/pages/python_api/python-api-cluster-mg-kmeans.md
@@ -0,0 +1,34 @@
+---
+slug: api-reference/python-api-cluster-mg-kmeans
+---
+
+# Multi-GPU Kmeans
+
+_Python module: `cuvs.cluster.mg.kmeans`_
+
+## fit
+
+`@auto_sync_multi_gpu_resources`
+
+```python
+def fit( KMeansParams params, X, centroids=None, sample_weights=None, resources=None )
+```
+
+Find clusters with single-node multi-GPU k-means using host data.
+
+**Parameters**
+
+| Name | Type | Description |
+| --- | --- | --- |
+| `params` | `KMeansParams` | Parameters to use to fit KMeans model. |
+| `X` | `host array-like` | Training instances, shape (m, k). Must be C-contiguous float32 or float64 host data. |
+| `centroids` | `host array-like, optional` | Initial centroids when ``params.init_method == "Array"`` and output centroids for all init methods. If omitted, a host NumPy output array is allocated unless ``init_method == "Array"``. |
+| `sample_weights` | `host array-like, optional` | Optional weights per observation. Must be C-contiguous and have the same dtype as X. |
+| `resources` | `cuvs.common.Resources, optional` |  |
+
+**Returns**
+
+FitOutput
+``centroids`` is a host NumPy array containing the computed centroids,
+``inertia`` is the final objective value, and ``n_iter`` is the number
+of iterations run.
diff --git a/python/cuvs/cuvs/cluster/CMakeLists.txt b/python/cuvs/cuvs/cluster/CMakeLists.txt
index 5e2df9a60b..4ad956a5bd 100644
--- a/python/cuvs/cuvs/cluster/CMakeLists.txt
+++ b/python/cuvs/cuvs/cluster/CMakeLists.txt
@@ -1,8 +1,9 @@
 # =============================================================================
 # cmake-format: off
-# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
 # SPDX-License-Identifier: Apache-2.0
 # cmake-format: on
 # =============================================================================
 
 add_subdirectory(kmeans)
+add_subdirectory(mg)
diff --git a/python/cuvs/cuvs/cluster/__init__.py b/python/cuvs/cuvs/cluster/__init__.py
index ec29a2139a..3d1a9d3c4f 100644
--- a/python/cuvs/cuvs/cluster/__init__.py
+++ b/python/cuvs/cuvs/cluster/__init__.py
@@ -1,7 +1,7 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
 # SPDX-License-Identifier: Apache-2.0
 
 
-from cuvs.cluster import kmeans
+from cuvs.cluster import kmeans, mg
 
-__all__ = ["kmeans"]
+__all__ = ["kmeans", "mg"]
diff --git a/python/cuvs/cuvs/cluster/kmeans/kmeans.pxd b/python/cuvs/cuvs/cluster/kmeans/kmeans.pxd
index 975ef386df..73db2c86b4 100644
--- a/python/cuvs/cuvs/cluster/kmeans/kmeans.pxd
+++ b/python/cuvs/cuvs/cluster/kmeans/kmeans.pxd
@@ -70,3 +70,7 @@ cdef extern from "cuvs/cluster/kmeans.h" nogil:
                                       DLManagedTensor* X,
                                       DLManagedTensor* centroids,
                                       double* cost)
+
+
+cdef class KMeansParams:
+    cdef cuvsKMeansParams* params
diff --git a/python/cuvs/cuvs/cluster/kmeans/kmeans.pyx b/python/cuvs/cuvs/cluster/kmeans/kmeans.pyx
index 2e9046b4b2..ec5934e6da 100644
--- a/python/cuvs/cuvs/cluster/kmeans/kmeans.pyx
+++ b/python/cuvs/cuvs/cluster/kmeans/kmeans.pyx
@@ -100,8 +100,6 @@ cdef class KMeansParams:
         For hierarchical k-means , defines the number of training iterations
     """
 
-    cdef cuvsKMeansParams* params
-
     def __cinit__(self):
         cuvsKMeansParamsCreate(&self.params)
 
diff --git a/python/cuvs/cuvs/cluster/mg/CMakeLists.txt b/python/cuvs/cuvs/cluster/mg/CMakeLists.txt
new file mode 100644
index 0000000000..c054ddd2cd
--- /dev/null
+++ b/python/cuvs/cuvs/cluster/mg/CMakeLists.txt
@@ -0,0 +1,8 @@
+#
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
+#
+
+add_subdirectory(kmeans)
diff --git a/python/cuvs/cuvs/cluster/mg/__init__.py b/python/cuvs/cuvs/cluster/mg/__init__.py
new file mode 100644
index 0000000000..f397d889ed
--- /dev/null
+++ b/python/cuvs/cuvs/cluster/mg/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+
+from . import kmeans
+
+__all__ = ["kmeans"]
diff --git a/python/cuvs/cuvs/cluster/mg/kmeans/CMakeLists.txt b/python/cuvs/cuvs/cluster/mg/kmeans/CMakeLists.txt
new file mode 100644
index 0000000000..9d48f01ae4
--- /dev/null
+++ b/python/cuvs/cuvs/cluster/mg/kmeans/CMakeLists.txt
@@ -0,0 +1,17 @@
+#
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
+#
+
+# Cython sources to build and the libraries each extension links against
+set(cython_sources kmeans.pyx)
+set(linked_libraries cuvs::cuvs cuvs::c_api)
+
+# Build all of the Cython targets; MODULE_PREFIX is prepended to target names
+rapids_cython_create_modules(
+  CXX
+  SOURCE_FILES "${cython_sources}"
+  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX cluster_mg_kmeans_
+)
diff --git a/python/cuvs/cuvs/cluster/mg/kmeans/__init__.py b/python/cuvs/cuvs/cluster/mg/kmeans/__init__.py
new file mode 100644
index 0000000000..04ce9eebc5
--- /dev/null
+++ b/python/cuvs/cuvs/cluster/mg/kmeans/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+
+from cuvs.cluster.kmeans import KMeansParams
+
+from .kmeans import FitOutput, fit
+
+__all__ = ["FitOutput", "KMeansParams", "fit"]
diff --git a/python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pxd b/python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pxd
new file mode 100644
index 0000000000..7885d9ef51
--- /dev/null
+++ b/python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pxd
@@ -0,0 +1,19 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+#
+# cython: language_level=3
+
+from cuvs.cluster.kmeans.kmeans cimport cuvsKMeansParams_t
+from cuvs.common.c_api cimport cuvsError_t, cuvsResources_t
+from cuvs.common.cydlpack cimport DLManagedTensor
+
+
+cdef extern from "cuvs/cluster/mg_kmeans.h" nogil:
+    cuvsError_t cuvsMultiGpuKMeansFit(cuvsResources_t res,
+                                      cuvsKMeansParams_t params,
+                                      DLManagedTensor* X,
+                                      DLManagedTensor* sample_weight,  # NULL if unweighted
+                                      DLManagedTensor* centroids,
+                                      double* inertia,  # out
+                                      int* n_iter) except +  # n_iter: out
diff --git a/python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pyx b/python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pyx
new file mode 100644
index 0000000000..bf313322a5
--- /dev/null
+++ b/python/cuvs/cuvs/cluster/mg/kmeans/kmeans.pyx
@@ -0,0 +1,153 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+#
+# cython: language_level=3
+
+from collections import namedtuple
+
+import numpy as np
+from pylibraft.common.cai_wrapper import wrap_array
+from pylibraft.common.interruptible import cuda_interruptible
+
+from cuvs.common.exceptions import check_cuvs
+from cuvs.common.mg_resources import auto_sync_multi_gpu_resources
+from cuvs.neighbors.common import _check_input_array
+
+from cuvs.cluster.kmeans.kmeans cimport KMeansParams
+from cuvs.common cimport cydlpack
+from cuvs.common.c_api cimport cuvsResources_t
+
+from .kmeans cimport cuvsMultiGpuKMeansFit
+
+FitOutput = namedtuple("FitOutput", "centroids inertia n_iter")
+
+
+def _as_host_array(array, name, *, writable=False):
+    if hasattr(array, "__cuda_array_interface__") and not isinstance(
+        array, np.ndarray
+    ):
+        raise ValueError(
+            f"SNMG KMeans requires {name} to be in host memory"
+        )
+
+    if not isinstance(array, np.ndarray):
+        array = np.asarray(array)
+
+    if array.ndim == 0:
+        raise ValueError(f"{name} must be an array")
+
+    if not array.flags["C_CONTIGUOUS"]:
+        raise ValueError(f"{name} must have C contiguous layout")
+
+    if writable and not array.flags["WRITEABLE"]:
+        raise ValueError(f"{name} must be writable")
+
+    return array
+
+
+@auto_sync_multi_gpu_resources
+def fit(
+    KMeansParams params, X, centroids=None, sample_weights=None,
+    resources=None
+):
+    """
+    Find clusters with single-node multi-GPU k-means using host data.
+
+    Parameters
+    ----------
+    params : KMeansParams
+        Parameters to use to fit KMeans model (non-hierarchical only).
+    X : host array-like
+        Training instances, shape (n_samples, n_features). Must be
+        C-contiguous float32 or float64 host data.
+    centroids : host array-like, optional
+        Shape (n_clusters, n_features), writable, same dtype as X. Required
+        as initial centroids when ``params.init_method == "Array"``; returned
+        as the output for all init methods. Allocated on the host if omitted.
+    sample_weights : host array-like, optional
+        Optional weights per observation, shape (n_samples,). Must be
+        C-contiguous and have the same dtype as X.
+    {resources_docstring}
+
+    Returns
+    -------
+    FitOutput
+        ``centroids`` is a host NumPy array containing the computed centroids,
+        ``inertia`` is the final objective value, and ``n_iter`` is the number
+        of iterations run.
+    """
+
+    if params.hierarchical:
+        raise ValueError("SNMG KMeans does not support hierarchical KMeans")
+    # Every array argument must already live in host memory.
+    X = _as_host_array(X, "X")
+    if X.ndim != 2:
+        raise ValueError("X must be a 2D array")
+
+    x_ai = wrap_array(X)
+    _check_input_array(
+        x_ai, [np.dtype("float32"), np.dtype("float64")]
+    )
+    # centroids doubles as the output buffer, so allocate one if omitted.
+    if centroids is None:
+        if params.init_method == "Array":
+            raise ValueError(
+                "centroids must be provided when init_method is 'Array'"
+            )
+        centroids = np.empty(
+            (params.n_clusters, X.shape[1]), dtype=X.dtype
+        )
+    else:
+        centroids = _as_host_array(centroids, "centroids", writable=True)
+
+    if centroids.ndim != 2:
+        raise ValueError("centroids must be a 2D array")
+    if centroids.dtype != X.dtype:
+        raise TypeError("centroids dtype must match X dtype")
+
+    centroids_ai = wrap_array(centroids)
+    _check_input_array(
+        centroids_ai,
+        [x_ai.dtype],
+        exp_rows=params.n_clusters,
+        exp_cols=X.shape[1],
+    )
+    # Stays NULL when no weights are passed.
+    cdef cydlpack.DLManagedTensor* sample_weight_dlpack = NULL
+    if sample_weights is not None:
+        sample_weights = _as_host_array(sample_weights, "sample_weights")
+        if sample_weights.ndim != 1:
+            raise ValueError("sample_weights must be a 1D array")
+        if sample_weights.dtype != X.dtype:
+            raise TypeError("sample_weights dtype must match X dtype")
+
+        sample_weights_ai = wrap_array(sample_weights)
+        _check_input_array(
+            sample_weights_ai,
+            [x_ai.dtype],
+            exp_rows=X.shape[0],
+            exp_row_major=False,
+        )
+        sample_weight_dlpack = cydlpack.dlpack_c(sample_weights_ai)
+    # NOTE(review): dlpack_c() results are never released here - confirm.
+    cdef cydlpack.DLManagedTensor* x_dlpack = cydlpack.dlpack_c(x_ai)
+    cdef cydlpack.DLManagedTensor* centroids_dlpack = (
+        cydlpack.dlpack_c(centroids_ai)
+    )
+    cdef cuvsResources_t res = <cuvsResources_t>resources.get_c_obj()
+
+    cdef double inertia = 0
+    cdef int n_iter = 0
+
+    with cuda_interruptible():
+        check_cuvs(cuvsMultiGpuKMeansFit(
+            res,
+            params.params,
+            x_dlpack,
+            sample_weight_dlpack,
+            centroids_dlpack,
+            &inertia,
+            &n_iter))
+
+    return FitOutput(centroids, inertia, n_iter)
diff --git a/python/cuvs/cuvs/tests/test_kmeans.py b/python/cuvs/cuvs/tests/test_kmeans.py
index 210ae06b80..d0905c2a5f 100644
--- a/python/cuvs/cuvs/tests/test_kmeans.py
+++ b/python/cuvs/cuvs/tests/test_kmeans.py
@@ -15,6 +15,19 @@
 from cuvs.distance import pairwise_distance
 
 
+def make_well_separated_kmeans_input(rng, n_rows, n_cols, n_clusters, dtype):
+    labels = np.arange(n_rows) % n_clusters
+    centers = rng.normal(
+        loc=0.0, scale=10.0, size=(n_clusters, n_cols)
+    ).astype(dtype)
+    noise = rng.normal(loc=0.0, scale=0.01, size=(n_rows, n_cols)).astype(
+        dtype
+    )
+    X = centers[labels] + noise
+    initial_centroids = X[np.arange(n_clusters)].copy()
+    return np.ascontiguousarray(X), initial_centroids
+
+
 @pytest.mark.parametrize("n_rows", [100])
 @pytest.mark.parametrize("n_cols", [5, 25])
 @pytest.mark.parametrize("n_clusters", [5, 15])
@@ -80,7 +93,7 @@ def test_cluster_cost(n_rows, n_cols, n_clusters, dtype):
 @pytest.mark.parametrize("n_cols", [10, 100])
 @pytest.mark.parametrize("n_clusters", [8, 16])
 @pytest.mark.parametrize("streaming_batch_size", [0, 100, 239, 500])
-@pytest.mark.parametrize("dtype", [np.float64])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
 @pytest.mark.parametrize("weighted", [False, True])
 def test_fit_host_matches_fit_device(
     n_rows, n_cols, n_clusters, streaming_batch_size, dtype, weighted
@@ -91,10 +104,9 @@ def test_fit_host_matches_fit_device(
     Optionally tests with non-uniform sample weights.
     """
     rng = np.random.default_rng(99)
-    X_host = rng.random((n_rows, n_cols)).astype(dtype)
-
-    centroid_indices = rng.choice(n_rows, size=n_clusters, replace=False)
-    initial_centroids_host = X_host[centroid_indices].copy()
+    X_host, initial_centroids_host = make_well_separated_kmeans_input(
+        rng, n_rows, n_cols, n_clusters, dtype
+    )
 
     if weighted:
         sample_weights_host = rng.uniform(0.5, 2.0, size=n_rows).astype(dtype)
@@ -136,6 +148,15 @@ def test_fit_host_matches_fit_device(
         centroids_regular, centroids_batched, rtol=1e-3, atol=1e-3
     ), f"max diff: {np.max(np.abs(centroids_regular - centroids_batched))}"
 
+    if dtype == np.float32:
+        # Weighted float32 inertia is sensitive to L2Expanded cancellation and
+        # reduction order. Centroids remain the primary parity check here.
+        inertia_tol = 1e-1 if weighted else 5e-2
+    else:
+        inertia_tol = 1e-3
     assert np.allclose(
-        inertia_regular, inertia_batched, rtol=1e-3, atol=1e-3
+        inertia_regular,
+        inertia_batched,
+        rtol=inertia_tol,
+        atol=inertia_tol,
     ), f"max diff: {np.max(np.abs(inertia_regular - inertia_batched))}"
diff --git a/python/cuvs/cuvs/tests/test_mg_kmeans.py b/python/cuvs/cuvs/tests/test_mg_kmeans.py
new file mode 100644
index 0000000000..33c0d0f39e
--- /dev/null
+++ b/python/cuvs/cuvs/tests/test_mg_kmeans.py
@@ -0,0 +1,220 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import pytest
+
+from cuvs.cluster.kmeans import KMeansParams
+from cuvs.cluster.mg import kmeans as mg_kmeans
+from cuvs.common import MultiGpuResources
+
+
+def has_gpus(count=1):
+    try:
+        import cupy as cp
+
+        return cp.cuda.runtime.getDeviceCount() >= count
+    except Exception:
+        return False
+
+
+pytestmark = pytest.mark.skipif(
+    not has_gpus(1), reason="SNMG KMeans tests require at least one GPU"
+)
+# NOTE(review): no test in this file applies requires_multiple_gpus yet.
+requires_multiple_gpus = pytest.mark.skipif(
+    not has_gpus(2),
+    reason="SNMG KMeans multi-GPU smoke test requires two GPUs",
+)
+
+
+def make_inputs(dtype, n_rows=256, n_cols=8, n_clusters=4):
+    rng = np.random.default_rng(123)
+    labels = np.arange(n_rows) % n_clusters
+    centers = rng.normal(
+        loc=0.0, scale=10.0, size=(n_clusters, n_cols)
+    ).astype(dtype)
+    noise = rng.normal(loc=0.0, scale=0.01, size=(n_rows, n_cols)).astype(
+        dtype
+    )
+    X = centers[labels] + noise
+    centroids = X[np.arange(n_clusters)].copy()
+    return np.ascontiguousarray(X), centroids
+
+
+def make_sample_weights(dtype, n_rows):
+    rng = np.random.default_rng(321)
+    return rng.uniform(0.5, 2.0, size=n_rows).astype(dtype)
+
+
+def predict_labels_host(X, centroids):
+    distances = np.sum(
+        (X[:, None, :] - centroids[None, :, :]) ** 2,
+        axis=2,
+    )
+    labels = np.argmin(distances, axis=1)
+    row_distances = distances[np.arange(X.shape[0]), labels]
+    return labels, row_distances
+
+
+def assert_same_label_partition(lhs, rhs):
+    lhs = np.asarray(lhs)
+    rhs = np.asarray(rhs)
+    assert np.array_equal(
+        np.equal.outer(lhs, lhs),
+        np.equal.outer(rhs, rhs),
+    )
+
+
+def assert_valid_mg_fit_output(out, X, n_clusters):
+    assert isinstance(out.centroids, np.ndarray)
+    assert out.centroids.shape == (n_clusters, X.shape[1])
+    assert out.centroids.dtype == X.dtype
+    assert np.all(np.isfinite(out.centroids))
+    assert np.isfinite(out.inertia)
+    assert out.n_iter > 0
+
+
+def assert_inertia_matches_centroids(out, X, sample_weights):
+    _, row_distances = predict_labels_host(X, out.centroids)
+    if sample_weights is not None:
+        sample_weights = sample_weights * X.shape[0] / sample_weights.sum()
+        row_distances = row_distances * sample_weights
+
+    inertia_tol = 5e-2 if X.dtype == np.float32 else 1e-3
+    assert np.allclose(
+        out.inertia,
+        np.sum(row_distances),
+        rtol=inertia_tol,
+        atol=inertia_tol,
+    )
+
+
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+@pytest.mark.parametrize("init_method", ["Array", "KMeansPlusPlus", "Random"])
+@pytest.mark.parametrize("weighted", [False, True])
+def test_mg_kmeans_fit_options(dtype, init_method, weighted):
+    n_clusters = 4
+    X, initial_centroids = make_inputs(dtype, n_clusters=n_clusters)
+    resources = MultiGpuResources()
+
+    if weighted:
+        sample_weights = make_sample_weights(dtype, X.shape[0])
+    else:
+        sample_weights = None
+
+    params = KMeansParams(
+        n_clusters=n_clusters,
+        init_method=init_method,
+        max_iter=20,
+        tol=1e-10,
+        n_init=3 if init_method == "Random" else 1,
+        init_size=X.shape[0],
+        streaming_batch_size=37,
+    )
+    centroids = initial_centroids.copy() if init_method == "Array" else None
+
+    mg_out = mg_kmeans.fit(
+        params,
+        X,
+        centroids=centroids,
+        sample_weights=sample_weights,
+        resources=resources,
+    )
+    resources.sync()
+
+    assert_valid_mg_fit_output(mg_out, X, n_clusters)
+    assert_inertia_matches_centroids(mg_out, X, sample_weights)
+
+    labels, _ = predict_labels_host(X, mg_out.centroids)
+    if init_method != "Random":
+        expected_labels = np.arange(X.shape[0]) % n_clusters
+        assert_same_label_partition(labels, expected_labels)
+    assert len(np.unique(labels)) == n_clusters
+
+
+def test_mg_kmeans_input_validation():
+    """Each invalid argument to ``mg_kmeans.fit`` raises the expected error."""
+    import cupy as cp
+    n_clusters = 4
+    X, centroids = make_inputs(np.float32, n_clusters=n_clusters)
+    resources = MultiGpuResources()
+    params = KMeansParams(n_clusters=n_clusters, init_method="Array")
+    # "Array" init needs caller-supplied centroids.
+    with pytest.raises(ValueError, match="centroids must be provided"):
+        mg_kmeans.fit(params, X, resources=resources)
+    # Hierarchical k-means is rejected.
+    hierarchical_params = KMeansParams(
+        n_clusters=n_clusters, hierarchical=True
+    )
+    with pytest.raises(ValueError, match="hierarchical"):
+        mg_kmeans.fit(
+            hierarchical_params,
+            X,
+            centroids=centroids.copy(),
+            resources=resources,
+        )
+    # Device (CuPy) X is rejected.
+    with pytest.raises(ValueError, match="host memory"):
+        mg_kmeans.fit(
+            params,
+            cp.asarray(X),
+            centroids=centroids.copy(),
+            resources=resources,
+        )
+    # Device (CuPy) centroids are rejected.
+    with pytest.raises(ValueError, match="host memory"):
+        mg_kmeans.fit(
+            params, X, centroids=cp.asarray(centroids), resources=resources
+        )
+    # Fortran-ordered X is rejected.
+    with pytest.raises(ValueError, match="C contiguous"):
+        mg_kmeans.fit(
+            params,
+            np.asfortranarray(X),
+            centroids=centroids.copy(),
+            resources=resources,
+        )
+    # centroids dtype must equal the dtype of X.
+    with pytest.raises(TypeError, match="centroids dtype"):
+        mg_kmeans.fit(
+            params,
+            X,
+            centroids=centroids.astype(np.float64),
+            resources=resources,
+        )
+    # centroids must have n_clusters rows.
+    with pytest.raises(ValueError, match="Incorrect number of rows"):
+        mg_kmeans.fit(
+            params,
+            X,
+            centroids=centroids[: n_clusters - 1].copy(),
+            resources=resources,
+        )
+    # sample_weights must be one-dimensional.
+    with pytest.raises(ValueError, match="sample_weights must be a 1D"):
+        mg_kmeans.fit(
+            params,
+            X,
+            centroids=centroids.copy(),
+            sample_weights=np.ones((X.shape[0], 1), dtype=X.dtype),
+            resources=resources,
+        )
+    # sample_weights dtype must equal the dtype of X.
+    with pytest.raises(TypeError, match="sample_weights dtype"):
+        mg_kmeans.fit(
+            params,
+            X,
+            centroids=centroids.copy(),
+            sample_weights=np.ones(X.shape[0], dtype=np.float64),
+            resources=resources,
+        )
+    # Device (CuPy) sample_weights are rejected.
+    with pytest.raises(ValueError, match="host memory"):
+        mg_kmeans.fit(
+            params,
+            X,
+            centroids=centroids.copy(),
+            sample_weights=cp.ones(X.shape[0], dtype=cp.float32),
+            resources=resources,
+        )

From 574ebc39f0317d240ab67a17e8267b7c78ded146 Mon Sep 17 00:00:00 2001
From: vic <viclafargue@nvidia.com>
Date: Wed, 3 Jun 2026 09:03:03 +0200
Subject: [PATCH 2/2] revert test_kmeans.py file

---
 python/cuvs/cuvs/tests/test_kmeans.py | 33 +++++----------------------
 1 file changed, 6 insertions(+), 27 deletions(-)

diff --git a/python/cuvs/cuvs/tests/test_kmeans.py b/python/cuvs/cuvs/tests/test_kmeans.py
index d0905c2a5f..210ae06b80 100644
--- a/python/cuvs/cuvs/tests/test_kmeans.py
+++ b/python/cuvs/cuvs/tests/test_kmeans.py
@@ -15,19 +15,6 @@
 from cuvs.distance import pairwise_distance
 
 
-def make_well_separated_kmeans_input(rng, n_rows, n_cols, n_clusters, dtype):
-    labels = np.arange(n_rows) % n_clusters
-    centers = rng.normal(
-        loc=0.0, scale=10.0, size=(n_clusters, n_cols)
-    ).astype(dtype)
-    noise = rng.normal(loc=0.0, scale=0.01, size=(n_rows, n_cols)).astype(
-        dtype
-    )
-    X = centers[labels] + noise
-    initial_centroids = X[np.arange(n_clusters)].copy()
-    return np.ascontiguousarray(X), initial_centroids
-
-
 @pytest.mark.parametrize("n_rows", [100])
 @pytest.mark.parametrize("n_cols", [5, 25])
 @pytest.mark.parametrize("n_clusters", [5, 15])
@@ -93,7 +80,7 @@ def test_cluster_cost(n_rows, n_cols, n_clusters, dtype):
 @pytest.mark.parametrize("n_cols", [10, 100])
 @pytest.mark.parametrize("n_clusters", [8, 16])
 @pytest.mark.parametrize("streaming_batch_size", [0, 100, 239, 500])
-@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+@pytest.mark.parametrize("dtype", [np.float64])
 @pytest.mark.parametrize("weighted", [False, True])
 def test_fit_host_matches_fit_device(
     n_rows, n_cols, n_clusters, streaming_batch_size, dtype, weighted
@@ -104,9 +91,10 @@ def test_fit_host_matches_fit_device(
     Optionally tests with non-uniform sample weights.
     """
     rng = np.random.default_rng(99)
-    X_host, initial_centroids_host = make_well_separated_kmeans_input(
-        rng, n_rows, n_cols, n_clusters, dtype
-    )
+    X_host = rng.random((n_rows, n_cols)).astype(dtype)
+
+    centroid_indices = rng.choice(n_rows, size=n_clusters, replace=False)
+    initial_centroids_host = X_host[centroid_indices].copy()
 
     if weighted:
         sample_weights_host = rng.uniform(0.5, 2.0, size=n_rows).astype(dtype)
@@ -148,15 +136,6 @@ def test_fit_host_matches_fit_device(
         centroids_regular, centroids_batched, rtol=1e-3, atol=1e-3
     ), f"max diff: {np.max(np.abs(centroids_regular - centroids_batched))}"
 
-    if dtype == np.float32:
-        # Weighted float32 inertia is sensitive to L2Expanded cancellation and
-        # reduction order. Centroids remain the primary parity check here.
-        inertia_tol = 1e-1 if weighted else 5e-2
-    else:
-        inertia_tol = 1e-3
     assert np.allclose(
-        inertia_regular,
-        inertia_batched,
-        rtol=inertia_tol,
-        atol=inertia_tol,
+        inertia_regular, inertia_batched, rtol=1e-3, atol=1e-3
     ), f"max diff: {np.max(np.abs(inertia_regular - inertia_batched))}"