rapidsai · tarang-jain · Jan 9, 2026 · Jan 9, 2026 · Jan 10, 2026 · Jan 10, 2026
@@ -36,6 +36,7 @@ typedef enum {
   Array = 2
 } cuvsKMeansInitMethod;
 
+
 /**
  * @brief Hyper-parameters for the kmeans algorithm
  */
@@ -90,6 +91,7 @@ struct cuvsKMeansParams {
    */
   int batch_centroids;
 
+  /** Check inertia during iterations for early convergence. */
   bool inertia_check;
 
   /**
@@ -101,6 +103,12 @@ struct cuvsKMeansParams {
    * For hierarchical k-means , defines the number of training iterations
    */
   int hierarchical_n_iters;
+
+  /**
+   * Number of samples to process per GPU batch for the batched (host-data) API.
+   * When set to 0, defaults to n_samples (process all at once). Ignored for device data.
+   */
+  int64_t batch_size;
 };
 
 typedef struct cuvsKMeansParams* cuvsKMeansParams_t;
@@ -142,18 +150,24 @@ typedef enum { CUVS_KMEANS_TYPE_KMEANS = 0, CUVS_KMEANS_TYPE_KMEANS_BALANCED = 1
  *   clusters are reinitialized by choosing new centroids with
  *   k-means++ algorithm.
  *
+ *   X may reside on either host (CPU) or device (GPU) memory.
+ *   When X is on the host the data is streamed to the GPU in
+ *   batches controlled by params->batch_size.
+ *
  * @param[in]     res           opaque C handle
  * @param[in]     params        Parameters for KMeans model.
  * @param[in]     X             Training instances to cluster. The data must
- *                              be in row-major format.
+ *                              be in row-major format. May be on host or
+ *                              device memory.
  *                              [dim = n_samples x n_features]
  * @param[in]     sample_weight Optional weights for each observation in X.
+ *                              Must reside in the same memory space (host or device) as X.
  *                              [len = n_samples]
  * @param[inout]  centroids     [in] When init is InitMethod::Array, use
  *                              centroids as the initial cluster centers.
  *                              [out] The generated centroids from the
  *                              kmeans algorithm are stored at the address
- *                              pointed by 'centroids'.
+ *                              pointed by 'centroids'. Must be on device.
  *                              [dim = n_clusters x n_features]
  * @param[out]    inertia       Sum of squared distances of samples to their
  *                              closest cluster center.
@@ -212,6 +226,7 @@ cuvsError_t cuvsKMeansClusterCost(cuvsResources_t res,
                                   DLManagedTensor* X,
                                   DLManagedTensor* centroids,
                                   double* cost);
+
 /**
  * @}
  */

@@ -17,16 +17,18 @@ namespace {
 
 cuvs::cluster::kmeans::params convert_params(const cuvsKMeansParams& params)
 {
-  auto kmeans_params       = cuvs::cluster::kmeans::params();
-  kmeans_params.metric     = static_cast<cuvs::distance::DistanceType>(params.metric);
-  kmeans_params.init       = static_cast<cuvs::cluster::kmeans::params::InitMethod>(params.init);
-  kmeans_params.n_clusters = params.n_clusters;
-  kmeans_params.max_iter   = params.max_iter;
-  kmeans_params.tol        = params.tol;
+  auto kmeans_params                = cuvs::cluster::kmeans::params();
+  kmeans_params.metric              = static_cast<cuvs::distance::DistanceType>(params.metric);
+  kmeans_params.init = static_cast<cuvs::cluster::kmeans::params::InitMethod>(params.init);
+  kmeans_params.n_clusters          = params.n_clusters;
+  kmeans_params.max_iter            = params.max_iter;
+  kmeans_params.tol                 = params.tol;
+  kmeans_params.n_init              = params.n_init;
   kmeans_params.oversampling_factor = params.oversampling_factor;
   kmeans_params.batch_samples       = params.batch_samples;
   kmeans_params.batch_centroids     = params.batch_centroids;
   kmeans_params.inertia_check       = params.inertia_check;
+  kmeans_params.batch_size  = params.batch_size;
   return kmeans_params;
 }
 
@@ -49,8 +51,53 @@ void _fit(cuvsResources_t res,
 {
   auto X       = X_tensor->dl_tensor;
   auto res_ptr = reinterpret_cast<raft::resources*>(res);
+  bool is_host = (X.device.device_type == kDLCPU);
 
-  if (cuvs::core::is_dlpack_device_compatible(X)) {
+  if (is_host) {
+    // The host views below are built straight from the raw DLPack pointers, so rank and
+    // layout are checked here before shape[] is read.
+    if (X.ndim != 2) {
+      RAFT_FAIL("X must be a 2-dimensional tensor");
+    }
+
+    auto n_samples  = static_cast<IdxT>(X.shape[0]);
+    auto n_features = static_cast<IdxT>(X.shape[1]);
+
+    // DLPack strides are counted in elements; a NULL strides pointer means compact row-major.
+    bool const x_is_row_major =
+      X.strides == nullptr || ((X.shape[1] <= 1 || X.strides[1] == 1) &&
+                               (X.shape[0] <= 1 || X.strides[0] == X.shape[1]));
+    if (!x_is_row_major) {
+      RAFT_FAIL("X must be in contiguous row-major format");
+    }
+
+    if (params.hierarchical) {
+      RAFT_FAIL("hierarchical kmeans is not supported with host data");
+    }
+
+    auto centroids_dl = centroids_tensor->dl_tensor;
+    if (!cuvs::core::is_dlpack_device_compatible(centroids_dl)) {
+      RAFT_FAIL("centroids must be on device memory");
+    }
+
+    auto X_view = raft::make_host_matrix_view<T const, IdxT>(
+      reinterpret_cast<T const*>(X.data), n_samples, n_features);
+    auto centroids_view =
+      cuvs::core::from_dlpack<raft::device_matrix_view<T, IdxT, raft::row_major>>(
+        centroids_tensor);
+
+    std::optional<raft::host_vector_view<T const, IdxT>> sample_weight;
+    if (sample_weight_tensor != nullptr) {
+      auto sw = sample_weight_tensor->dl_tensor;
+      if (sw.device.device_type != kDLCPU) {
+        RAFT_FAIL("sample_weight must be on host memory when X is on host");
+      }
+      // X and the weights are both reinterpreted as T, so their element types must agree.
+      if (sw.dtype.code != X.dtype.code || sw.dtype.bits != X.dtype.bits) {
+        RAFT_FAIL("sample_weight must have the same dtype as X");
+      }
+      if (sw.ndim != 1 || sw.shape[0] != X.shape[0]) {
+        RAFT_FAIL("sample_weight must be a 1-dimensional tensor of length n_samples");
+      }
+      if (sw.strides != nullptr && sw.shape[0] > 1 && sw.strides[0] != 1) {
+        RAFT_FAIL("sample_weight must be contiguous");
+      }
+      sample_weight = raft::make_host_vector_view<T const, IdxT>(
+        reinterpret_cast<T const*>(sw.data), n_samples);
+    }
+
+    // Value-initialised so the values copied to the caller below are never indeterminate.
+    T inertia_temp{};
+    IdxT n_iter_temp{};
+
+    auto kmeans_params = convert_params(params);
+    cuvs::cluster::kmeans::fit(*res_ptr,
+                               kmeans_params,
+                               X_view,
+                               sample_weight,
+                               centroids_view,
+                               raft::make_host_scalar_view<T>(&inertia_temp),
+                               raft::make_host_scalar_view<IdxT>(&n_iter_temp));
+
+    *inertia = inertia_temp;
+    *n_iter  = n_iter_temp;
+
+  } else {
     using const_mdspan_type = raft::device_matrix_view<T const, IdxT, raft::row_major>;
     using mdspan_type       = raft::device_matrix_view<T, IdxT, raft::row_major>;
 
@@ -90,8 +137,6 @@ void _fit(cuvsResources_t res,
       *inertia = inertia_temp;
       *n_iter  = n_iter_temp;
     }
-  } else {
-    RAFT_FAIL("X dataset must be accessible on device memory");
   }
 }
 
@@ -182,17 +227,20 @@ extern "C" cuvsError_t cuvsKMeansParamsCreate(cuvsKMeansParams_t* params)
   return cuvs::core::translate_exceptions([=] {
     cuvs::cluster::kmeans::params cpp_params;
     cuvs::cluster::kmeans::balanced_params cpp_balanced_params;
-    *params =
-      new cuvsKMeansParams{.metric     = static_cast<cuvsDistanceType>(cpp_params.metric),
-                           .n_clusters = cpp_params.n_clusters,
-                           .init       = static_cast<cuvsKMeansInitMethod>(cpp_params.init),
-                           .max_iter   = cpp_params.max_iter,
-                           .tol        = cpp_params.tol,
-                           .oversampling_factor  = cpp_params.oversampling_factor,
-                           .batch_samples        = cpp_params.batch_samples,
-                           .inertia_check        = cpp_params.inertia_check,
-                           .hierarchical         = false,
-                           .hierarchical_n_iters = static_cast<int>(cpp_balanced_params.n_iters)};
+    *params = new cuvsKMeansParams{
+      .metric               = static_cast<cuvsDistanceType>(cpp_params.metric),
+      .n_clusters           = cpp_params.n_clusters,
+      .init                 = static_cast<cuvsKMeansInitMethod>(cpp_params.init),
+      .max_iter             = cpp_params.max_iter,
+      .tol                  = cpp_params.tol,
+      .n_init               = cpp_params.n_init,
+      .oversampling_factor  = cpp_params.oversampling_factor,
+      .batch_samples        = cpp_params.batch_samples,
+      .batch_centroids      = cpp_params.batch_centroids,
+      .inertia_check        = cpp_params.inertia_check,
+      .hierarchical         = false,
+      .hierarchical_n_iters = static_cast<int>(cpp_balanced_params.n_iters),
+      .batch_size           = cpp_params.batch_size};
   });
 }
 

@@ -100,15 +100,31 @@ struct params : base_params {
    * useful to optimize/control the memory footprint
    * Default tile is [batch_samples x n_clusters] i.e. when batch_centroids is 0
    * then don't tile the centroids
+   *
+   * NB: These parameters are unrelated to batch_size, which controls how many
+   * samples to transfer from host to device per batch when processing out-of-core
+   * data.
    */
   int batch_samples = 1 << 15;
 
   /**
    * if 0 then batch_centroids = n_clusters
    */
-  int batch_centroids = 0;  //
+  int batch_centroids = 0;
 
+  /**
+   * If true, check inertia during iterations for early convergence.
+   */
   bool inertia_check = false;
+
+  /**
+   * Number of samples to process per GPU batch when fitting with host data.
+   * When set to 0, defaults to n_samples (process all at once).
+   * Only used by the batched (host-data) code path and ignored by device-data
+   * overloads.
+   * Default: 0 (process all data at once).
+   */
+  int64_t batch_size = 0;
 };
 
 /**
@@ -141,6 +157,178 @@ enum class kmeans_type { KMeans = 0, KMeansBalanced = 1 };
  * @{
  */
 
+/**
+ * @brief Find clusters with k-means algorithm using batched processing of host data.
+ *
+ * This overload supports out-of-core computation where the dataset resides
+ * on the host. Data is processed in GPU-sized batches, streaming from host to device.
+ * The batch size is controlled by params.batch_size.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/resources.hpp>
+ *   #include <cuvs/cluster/kmeans.hpp>
+ *   using namespace cuvs::cluster;
+ *   ...
+ *   raft::resources handle;
+ *   cuvs::cluster::kmeans::params params;
+ *   params.n_clusters = 100;
+ *   params.batch_size = 100000;
+ *   int n_samples  = 1000000;
+ *   int n_features = 15;
+ *   float inertia;
+ *   int n_iter;
+ *
+ *   // Data on host
+ *   std::vector<float> h_X(n_samples * n_features);
+ *   auto X = raft::make_host_matrix_view<const float, int>(h_X.data(), n_samples, n_features);
+ *
+ *   // Centroids on device
+ *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
+ *
+ *   kmeans::fit(handle,
+ *               params,
+ *               X,
+ *               std::nullopt,
+ *               centroids.view(),
+ *               raft::make_host_scalar_view(&inertia),
+ *               raft::make_host_scalar_view(&n_iter));
+ * @endcode
+ *
+ * @param[in]     handle        The raft handle.
+ * @param[in]     params        Parameters for KMeans model. Batch size is read from
+ *                              params.batch_size.
+ * @param[in]     X             Training instances on HOST memory. The data must
+ *                              be in row-major format.
+ *                              [dim = n_samples x n_features]
+ * @param[in]     sample_weight Optional weights for each observation in X (on host).
+ *                              [len = n_samples]
+ * @param[inout]  centroids     [in] When init is InitMethod::Array, use
+ *                              centroids as the initial cluster centers.
+ *                              [out] The generated centroids from the
+ *                              kmeans algorithm are stored at the address
+ *                              pointed by 'centroids'.
+ *                              [dim = n_clusters x n_features]
+ * @param[out]    inertia       Sum of squared distances of samples to their
+ *                              closest cluster center.
+ * @param[out]    n_iter        Number of iterations run.
+ */
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::host_matrix_view<const float, int> X,
+         std::optional<raft::host_vector_view<const float, int>> sample_weight,
+         raft::device_matrix_view<float, int> centroids,
+         raft::host_scalar_view<float> inertia,
+         raft::host_scalar_view<int> n_iter);
+
+/**
+ * @brief Find clusters with k-means algorithm using batched processing of host data.
+ *
+ * Takes the same parameters as the float/int overload, for float data with an
+ * int64_t index type and an int64_t iteration count.
+ */
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::host_matrix_view<const float, int64_t> X,
+         std::optional<raft::host_vector_view<const float, int64_t>> sample_weight,
+         raft::device_matrix_view<float, int64_t> centroids,
+         raft::host_scalar_view<float> inertia,
+         raft::host_scalar_view<int64_t> n_iter);
+
+/**
+ * @brief Find clusters with k-means algorithm using batched processing of host data.
+ *
+ * Takes the same parameters as the float/int overload, for double data with an
+ * int index type and an int iteration count.
+ */
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::host_matrix_view<const double, int> X,
+         std::optional<raft::host_vector_view<const double, int>> sample_weight,
+         raft::device_matrix_view<double, int> centroids,
+         raft::host_scalar_view<double> inertia,
+         raft::host_scalar_view<int> n_iter);
+
+/**
+ * @brief Find clusters with k-means algorithm using batched processing of host data.
+ *
+ * Takes the same parameters as the float/int overload, for double data with an
+ * int64_t index type and an int64_t iteration count.
+ */
+void fit(raft::resources const& handle,
+         const cuvs::cluster::kmeans::params& params,
+         raft::host_matrix_view<const double, int64_t> X,
+         std::optional<raft::host_vector_view<const double, int64_t>> sample_weight,
+         raft::device_matrix_view<double, int64_t> centroids,
+         raft::host_scalar_view<double> inertia,
+         raft::host_scalar_view<int64_t> n_iter);
+
+/**
+ * @defgroup predict_host K-Means Predict (host data)
+ * @{
+ */
+
+/**
+ * @brief Predict cluster labels for host data using batched processing.
+ *
+ * Streams data from host to GPU in batches, assigns each sample to its nearest
+ * centroid, and writes labels back to host memory.
+ * The batch size is controlled by params.batch_size.
+ *
+ * @param[in]     handle        The raft handle.
+ * @param[in]     params        Parameters for KMeans model.
+ * @param[in]     X             Input samples on HOST memory. [dim = n_samples x n_features]
+ * @param[in]     sample_weight Optional weights for each observation (on host).
+ * @param[in]     centroids     Cluster centers on device. [dim = n_clusters x n_features]
+ * @param[out]    labels        Predicted cluster labels on HOST memory. [dim = n_samples]
+ * @param[in]     normalize_weight Whether to normalize sample weights.
+ * @param[out]    inertia       Sum of squared distances to nearest centroid.
+ */
+void predict(raft::resources const& handle,
+             const cuvs::cluster::kmeans::params& params,
+             raft::host_matrix_view<const float, int64_t> X,
+             std::optional<raft::host_vector_view<const float, int64_t>> sample_weight,
+             raft::device_matrix_view<const float, int64_t> centroids,
+             raft::host_vector_view<int64_t, int64_t> labels,
+             bool normalize_weight,
+             raft::host_scalar_view<float> inertia);
+
+/**
+ * @brief Predict cluster labels for host data using batched processing (double).
+ */
+void predict(raft::resources const& handle,
+             const cuvs::cluster::kmeans::params& params,
+             raft::host_matrix_view<const double, int64_t> X,
+             std::optional<raft::host_vector_view<const double, int64_t>> sample_weight,
+             raft::device_matrix_view<const double, int64_t> centroids,
+             raft::host_vector_view<int64_t, int64_t> labels,
+             bool normalize_weight,
+             raft::host_scalar_view<double> inertia);
+
+/**
+ * @brief Fit k-means and predict cluster labels using batched processing of host data.
+ *
+ * Combines batched fit and batched predict into a single call.
+ *
+ * @param[in]     handle        The raft handle.
+ * @param[in]     params        Parameters for KMeans model.
+ * @param[in]     X             Training instances on HOST memory. [dim = n_samples x n_features]
+ * @param[in]     sample_weight Optional weights for each observation (on host).
+ * @param[inout]  centroids     Cluster centers on device. [dim = n_clusters x n_features]
+ * @param[out]    labels        Predicted cluster labels on HOST memory. [dim = n_samples]
+ * @param[out]    inertia       Sum of squared distances to nearest centroid.
+ * @param[out]    n_iter        Number of iterations run.
+ */
+void fit_predict(raft::resources const& handle,
+                 const cuvs::cluster::kmeans::params& params,
+                 raft::host_matrix_view<const float, int64_t> X,
+                 std::optional<raft::host_vector_view<const float, int64_t>> sample_weight,
+                 raft::device_matrix_view<float, int64_t> centroids,
+                 raft::host_vector_view<int64_t, int64_t> labels,
+                 raft::host_scalar_view<float> inertia,
+                 raft::host_scalar_view<int64_t> n_iter);
+
+/**
+ * @brief Fit k-means and predict cluster labels using batched processing (double).
+ */
+void fit_predict(raft::resources const& handle,
+                 const cuvs::cluster::kmeans::params& params,
+                 raft::host_matrix_view<const double, int64_t> X,
+                 std::optional<raft::host_vector_view<const double, int64_t>> sample_weight,
+                 raft::device_matrix_view<double, int64_t> centroids,
+                 raft::host_vector_view<int64_t, int64_t> labels,
+                 raft::host_scalar_view<double> inertia,
+                 raft::host_scalar_view<int64_t> n_iter);
+
+/**
+ * @}
+ */
+
 /**
  * @brief Find clusters with k-means algorithm.
  *   Initial centroids are chosen with k-means++ algorithm. Empty