Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 0 additions & 180 deletions c/include/cuvs/cluster/kmeans.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@ typedef enum {

/**
* @brief Hyper-parameters for the kmeans algorithm
* NB: The inertia_check field is kept for ABI compatibility. Removed in cuvsKMeansParams_v2.
* TODO: CalVer for the replacement: 26.08
*/
struct cuvsKMeansParams {
cuvsDistanceType metric;
Expand Down Expand Up @@ -95,88 +93,6 @@ struct cuvsKMeansParams {
*/
int batch_centroids;

/** Deprecated, ignored. Kept for ABI compatibility. */
bool inertia_check;

/**
* Whether to use hierarchical (balanced) kmeans or not
*/
bool hierarchical;

/**
* For hierarchical k-means, defines the number of training iterations
*/
int hierarchical_n_iters;

/**
* Number of samples to process per GPU batch for the batched (host-data) API.
* When set to 0, defaults to n_samples (process all at once).
*/
int64_t streaming_batch_size;

/**
* Number of samples to draw for KMeansPlusPlus initialization.
* When set to 0, uses heuristic min(3 * n_clusters, n_samples) for host data,
* or n_samples for device data.
*/
int64_t init_size;
};

/**
* @brief Hyper-parameters for the kmeans algorithm
* TODO: Remove this after cuvsKMeansParams is replaced in ABI 2.0
*/
struct cuvsKMeansParams_v2 {
cuvsDistanceType metric;

/**
* The number of clusters to form as well as the number of centroids to generate (default:8).
*/
int n_clusters;

/**
* Method for initialization, defaults to k-means++:
* - cuvsKMeansInitMethod::KMeansPlusPlus (k-means++): Use scalable k-means++ algorithm
* to select the initial cluster centers.
* - cuvsKMeansInitMethod::Random (random): Choose 'n_clusters' observations (rows) at
* random from the input data for the initial centroids.
* - cuvsKMeansInitMethod::Array (ndarray): Use 'centroids' as initial cluster centers.
*/
cuvsKMeansInitMethod init;

/**
* Maximum number of iterations of the k-means algorithm for a single run.
*/
int max_iter;

/**
* Relative tolerance with regard to inertia to declare convergence.
*/
double tol;

/**
* Number of times the k-means algorithm will be run with different seeds.
*/
int n_init;

/**
* Oversampling factor for use in the k-means|| algorithm
*/
double oversampling_factor;

/**
* batch_samples and batch_centroids are used to tile 1NN computation which is
* useful to optimize/control the memory footprint
* Default tile is [batch_samples x n_clusters] i.e. when batch_centroids is 0
* then don't tile the centroids
*/
int batch_samples;

/**
* if 0 then batch_centroids = n_clusters
*/
int batch_centroids;

/**
* Whether to use hierarchical (balanced) kmeans or not
*/
Expand All @@ -202,14 +118,10 @@ struct cuvsKMeansParams {
};

typedef struct cuvsKMeansParams* cuvsKMeansParams_t;
typedef struct cuvsKMeansParams_v2* cuvsKMeansParams_v2_t;

/**
* @brief Allocate KMeans params, and populate with default values
*
* @note In cuVS 26.08 (next ABI major version) this signature will be
* replaced by cuvsKMeansParamsCreate_v2.
*
* @param[in] params cuvsKMeansParams_t to allocate
* @return cuvsError_t
*/
Expand All @@ -218,33 +130,11 @@ CUVS_EXPORT cuvsError_t cuvsKMeansParamsCreate(cuvsKMeansParams_t* params);
/**
* @brief De-allocate KMeans params
*
* @note In cuVS 26.08 (next ABI major version) this signature will be
* replaced by cuvsKMeansParamsDestroy_v2.
*
* @param[in] params cuvsKMeansParams_t to de-allocate
* @return cuvsError_t
*/
CUVS_EXPORT cuvsError_t cuvsKMeansParamsDestroy(cuvsKMeansParams_t params);

/**
* @brief Allocate KMeans params, and populate with default values
*
* Mirrors cuvsKMeansParamsCreate but operates on cuvsKMeansParams_v2.
* Will become the unsuffixed cuvsKMeansParamsCreate in cuVS 26.08.
*
* The returned params should be released with cuvsKMeansParamsDestroy_v2.
*
* @param[out] params address of a cuvsKMeansParams_v2_t that receives the
* newly allocated params
* @return cuvsError_t
*/
CUVS_EXPORT cuvsError_t cuvsKMeansParamsCreate_v2(cuvsKMeansParams_v2_t* params);

/**
* @brief De-allocate KMeans params allocated by cuvsKMeansParamsCreate_v2.
*
* @param[in] params cuvsKMeansParams_v2_t to de-allocate
* @return cuvsError_t
*/
CUVS_EXPORT cuvsError_t cuvsKMeansParamsDestroy_v2(cuvsKMeansParams_v2_t params);

/**
* @brief Type of k-means algorithm.
*/
Expand All @@ -270,9 +160,6 @@ typedef enum { CUVS_KMEANS_TYPE_KMEANS = 0, CUVS_KMEANS_TYPE_KMEANS_BALANCED = 1
* When X is on the host the data is streamed to the GPU in
* batches controlled by params->streaming_batch_size.
*
* @note In cuVS 26.08 (next ABI major version) this signature will be
* replaced by cuvsKMeansFit_v2.
*
* @param[in] res opaque C handle
* @param[in] params Parameters for KMeans model.
* @param[in] X Training instances to cluster. The data must
Expand Down Expand Up @@ -300,45 +187,9 @@ CUVS_EXPORT cuvsError_t cuvsKMeansFit(cuvsResources_t res,
double* inertia,
int* n_iter);

/**
* @brief Find clusters with k-means algorithm (v2 params layout).
*
* Mirrors cuvsKMeansFit but takes cuvsKMeansParams_v2_t. Will become the
* unsuffixed cuvsKMeansFit in cuVS 26.08.
*
* Only float32 and float64 X tensors are accepted; any other dtype fails
* with an "Unsupported dataset DLtensor dtype" error.
*
* @param[in] res opaque C handle
* @param[in] params Parameters for KMeans model (v2 layout).
* @param[in] X Training instances to cluster. The data must
* be in row-major format. May be on host or
* device memory.
* [dim = n_samples x n_features]
* @param[in] sample_weight Optional weights for each observation in X.
* Must be on the same memory space as X.
* [len = n_samples]
* @param[inout] centroids [in] When init is InitMethod::Array, use
* centroids as the initial cluster centers.
* [out] The generated centroids from the
* kmeans algorithm are stored at the address
* pointed by 'centroids'. Must be on device.
* [dim = n_clusters x n_features]
* @param[out] inertia Sum of squared distances of samples to their
* closest cluster center.
* @param[out] n_iter Number of iterations run.
* @return cuvsError_t
*/
CUVS_EXPORT cuvsError_t cuvsKMeansFit_v2(cuvsResources_t res,
cuvsKMeansParams_v2_t params,
DLManagedTensor* X,
DLManagedTensor* sample_weight,
DLManagedTensor* centroids,
double* inertia,
int* n_iter);

/**
* @brief Predict the closest cluster each sample in X belongs to.
*
* @note In cuVS 26.08 (next ABI major version) this signature will be
* replaced by cuvsKMeansPredict_v2.
*
* @param[in] res opaque C handle
* @param[in] params Parameters for KMeans model.
* @param[in] X New data to predict.
Expand All @@ -364,37 +215,6 @@ CUVS_EXPORT cuvsError_t cuvsKMeansPredict(cuvsResources_t res,
bool normalize_weight,
double* inertia);

/**
* @brief Predict the closest cluster each sample in X belongs to (v2 params layout).
*
* Mirrors cuvsKMeansPredict but takes cuvsKMeansParams_v2_t. Will become the
* unsuffixed cuvsKMeansPredict in cuVS 26.08.
*
* Only float32 and float64 X tensors are accepted; any other dtype fails
* with an "Unsupported dataset DLtensor dtype" error.
*
* @param[in] res opaque C handle
* @param[in] params Parameters for KMeans model (v2 layout).
* @param[in] X New data to predict.
* [dim = n_samples x n_features]
* @param[in] sample_weight Optional weights for each observation in X.
* [len = n_samples]
* @param[in] centroids Cluster centroids. The data must be in
* row-major format.
* [dim = n_clusters x n_features]
* @param[in] normalize_weight True if the weights should be normalized
* @param[out] labels Index of the cluster each sample in X
* belongs to.
* [len = n_samples]
* @param[out] inertia Sum of squared distances of samples to
* their closest cluster center.
* @return cuvsError_t
*/
CUVS_EXPORT cuvsError_t cuvsKMeansPredict_v2(cuvsResources_t res,
cuvsKMeansParams_v2_t params,
DLManagedTensor* X,
DLManagedTensor* sample_weight,
DLManagedTensor* centroids,
DLManagedTensor* labels,
bool normalize_weight,
double* inertia);

/**
* @brief Compute cluster cost
*
Expand Down
89 changes: 6 additions & 83 deletions c/src/cluster/kmeans.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@

namespace {

// The conversions are templated on the C struct type and reused by both API surfaces.
template <typename ParamsT>
cuvs::cluster::kmeans::params convert_params(const ParamsT& params)
cuvs::cluster::kmeans::params convert_params(const cuvsKMeansParams& params)
{
auto kmeans_params = cuvs::cluster::kmeans::params();
kmeans_params.metric = static_cast<cuvs::distance::DistanceType>(params.metric);
Expand All @@ -35,18 +33,17 @@ cuvs::cluster::kmeans::params convert_params(const ParamsT& params)
return kmeans_params;
}

template <typename ParamsT>
cuvs::cluster::kmeans::balanced_params convert_balanced_params(const ParamsT& params)
cuvs::cluster::kmeans::balanced_params convert_balanced_params(const cuvsKMeansParams& params)
{
auto kmeans_params = cuvs::cluster::kmeans::balanced_params();
kmeans_params.metric = static_cast<cuvs::distance::DistanceType>(params.metric);
kmeans_params.n_iters = params.hierarchical_n_iters;
return kmeans_params;
}

template <typename T, typename ParamsT, typename IdxT = int64_t>
template <typename T, typename IdxT = int64_t>
void _fit(cuvsResources_t res,
const ParamsT& params,
const cuvsKMeansParams& params,
DLManagedTensor* X_tensor,
DLManagedTensor* sample_weight_tensor,
DLManagedTensor* centroids_tensor,
Expand Down Expand Up @@ -143,9 +140,9 @@ void _fit(cuvsResources_t res,
}
}

template <typename T, typename ParamsT, typename IdxT = int32_t, typename LabelsT = int32_t>
template <typename T, typename IdxT = int32_t, typename LabelsT = int32_t>
void _predict(cuvsResources_t res,
const ParamsT& params,
const cuvsKMeansParams& params,
DLManagedTensor* X_tensor,
DLManagedTensor* sample_weight_tensor,
DLManagedTensor* centroids_tensor,
Expand Down Expand Up @@ -240,7 +237,6 @@ extern "C" cuvsError_t cuvsKMeansParamsCreate(cuvsKMeansParams_t* params)
.oversampling_factor = cpp_params.oversampling_factor,
.batch_samples = cpp_params.batch_samples,
.batch_centroids = cpp_params.batch_centroids,
.inertia_check = false,
.hierarchical = false,
.hierarchical_n_iters = static_cast<int>(cpp_balanced_params.n_iters),
.streaming_batch_size = cpp_params.streaming_batch_size,
Expand Down Expand Up @@ -298,79 +294,6 @@ extern "C" cuvsError_t cuvsKMeansPredict(cuvsResources_t res,
});
}

/**
 * Allocates a cuvsKMeansParams_v2 on the heap and fills it from the defaults of
 * the C++ kmeans `params` / `balanced_params` structs. The new pointer is
 * written to `*params`; `hierarchical` starts out disabled.
 */
extern "C" cuvsError_t cuvsKMeansParamsCreate_v2(cuvsKMeansParams_v2_t* params)
{
  return cuvs::core::translate_exceptions([params] {
    // Default-constructed C++ parameter structs are the single source of defaults.
    cuvs::cluster::kmeans::params defaults;
    cuvs::cluster::kmeans::balanced_params balanced_defaults;

    // Value-initialize first so any field not assigned below is zeroed.
    auto* created = new cuvsKMeansParams_v2{};

    created->metric               = static_cast<cuvsDistanceType>(defaults.metric);
    created->n_clusters           = defaults.n_clusters;
    created->init                 = static_cast<cuvsKMeansInitMethod>(defaults.init);
    created->max_iter             = defaults.max_iter;
    created->tol                  = defaults.tol;
    created->n_init               = defaults.n_init;
    created->oversampling_factor  = defaults.oversampling_factor;
    created->batch_samples        = defaults.batch_samples;
    created->batch_centroids      = defaults.batch_centroids;
    created->hierarchical         = false;
    created->hierarchical_n_iters = static_cast<int>(balanced_defaults.n_iters);
    created->streaming_batch_size = defaults.streaming_batch_size;
    created->init_size            = defaults.init_size;

    *params = created;
  });
}

/**
 * Frees a cuvsKMeansParams_v2 with `delete`, reporting any exception through
 * the returned cuvsError_t.
 */
extern "C" cuvsError_t cuvsKMeansParamsDestroy_v2(cuvsKMeansParams_v2_t params)
{
  return cuvs::core::translate_exceptions([params] {
    delete params;
  });
}

/**
 * C entry point for k-means fit using the v2 params layout.
 *
 * Dispatches to `_fit<float>` or `_fit<double>` based on the dtype of X and
 * fails via RAFT_FAIL for any other dtype.
 */
extern "C" cuvsError_t cuvsKMeansFit_v2(cuvsResources_t res,
                                        cuvsKMeansParams_v2_t params,
                                        DLManagedTensor* X,
                                        DLManagedTensor* sample_weight,
                                        DLManagedTensor* centroids,
                                        double* inertia,
                                        int* n_iter)
{
  return cuvs::core::translate_exceptions([=] {
    // Snapshot the element type of X; it alone selects the instantiation.
    const auto x_dtype = X->dl_tensor.dtype;

    if (x_dtype.code == kDLFloat) {
      switch (x_dtype.bits) {
        case 32: _fit<float>(res, *params, X, sample_weight, centroids, inertia, n_iter); return;
        case 64: _fit<double>(res, *params, X, sample_weight, centroids, inertia, n_iter); return;
        default: break;
      }
    }

    // Anything that is not a 32- or 64-bit float falls through to here.
    RAFT_FAIL(
      "Unsupported dataset DLtensor dtype: %d and bits: %d", x_dtype.code, x_dtype.bits);
  });
}

/**
 * C entry point for k-means predict using the v2 params layout.
 *
 * Dispatches to `_predict<float>` or `_predict<double>` based on the dtype of
 * X and fails via RAFT_FAIL for any other dtype.
 */
extern "C" cuvsError_t cuvsKMeansPredict_v2(cuvsResources_t res,
                                            cuvsKMeansParams_v2_t params,
                                            DLManagedTensor* X,
                                            DLManagedTensor* sample_weight,
                                            DLManagedTensor* centroids,
                                            DLManagedTensor* labels,
                                            bool normalize_weight,
                                            double* inertia)
{
  return cuvs::core::translate_exceptions([=] {
    // Snapshot the element type of X; it alone selects the instantiation.
    const auto x_dtype = X->dl_tensor.dtype;

    if (x_dtype.code == kDLFloat) {
      switch (x_dtype.bits) {
        case 32:
          _predict<float>(
            res, *params, X, sample_weight, centroids, labels, normalize_weight, inertia);
          return;
        case 64:
          _predict<double>(
            res, *params, X, sample_weight, centroids, labels, normalize_weight, inertia);
          return;
        default: break;
      }
    }

    // Anything that is not a 32- or 64-bit float falls through to here.
    RAFT_FAIL(
      "Unsupported dataset DLtensor dtype: %d and bits: %d", x_dtype.code, x_dtype.bits);
  });
}

extern "C" cuvsError_t cuvsKMeansClusterCost(cuvsResources_t res,
DLManagedTensor* X,
DLManagedTensor* centroids,
Expand Down
Loading
Loading