From 197354184f5afaaa3d0cb7d5ce7e479fb68fc4dd Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 22 May 2025 14:54:51 -0400 Subject: [PATCH 1/3] add KMeansCustom with manhattan --- pydfc/dfc_methods/sliding_window_clustr.py | 62 +++++------------- pydfc/dfc_utils.py | 75 ++++++++++++++++++++++ pyproject.toml | 1 - 3 files changed, 90 insertions(+), 48 deletions(-) diff --git a/pydfc/dfc_methods/sliding_window_clustr.py b/pydfc/dfc_methods/sliding_window_clustr.py index 137552c..eae3f9a 100644 --- a/pydfc/dfc_methods/sliding_window_clustr.py +++ b/pydfc/dfc_methods/sliding_window_clustr.py @@ -8,13 +8,10 @@ import time import numpy as np -from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer -from pyclustering.cluster.kmeans import kmeans -from pyclustering.utils.metric import distance_metric, type_metric from sklearn.cluster import KMeans from ..dfc import DFC -from ..dfc_utils import dFC_mat2vec, dFC_vec2mat +from ..dfc_utils import KMeansCustom, dFC_mat2vec, dFC_vec2mat from ..time_series import TIME_SERIES from .base_dfc_method import BaseDFCMethod from .sliding_window import SLIDING_WINDOW @@ -36,9 +33,6 @@ Input signals. dt : float Sample spacing. - -todo: -- pyclustering(manhattan) has a problem when suing predict """ @@ -103,32 +97,9 @@ def measure_name(self): def dFC_mat2vec(self, C_t): return dFC_mat2vec(C_t) - # if len(C_t.shape)==2: - # assert C_t.shape[0]==C_t.shape[1],\ - # 'C is not a square matrix' - # return C_t[np.triu_indices(C_t.shape[1], k=0)] - - # F = list() - # for t in range(C_t.shape[0]): - # C = C_t[t, : , :] - # assert C.shape[0]==C.shape[1],\ - # 'C is not a square matrix' - # F.append(C[np.triu_indices(C_t.shape[1], k=0)]) - - # F = np.array(F) - # return F def dFC_vec2mat(self, F, N): return dFC_vec2mat(F=F, N=N) - # C = list() - # iu = np.triu_indices(N, k=0) - # for i in range(F.shape[0]): - # K = np.zeros((N, N)) - # K[iu] = F[i,:] - # K = K + np.multiply(K.T, 1-np.eye(N)) - # C.append(K) - # C = np.array(C) - # return C def clusters_lst2idx(self, clusters): Z = np.zeros((self.F.shape[0],)) @@ -142,21 +113,18 @@ def cluster_FC(self, FCS_raw, n_clusters, n_regions): F = self.dFC_mat2vec(FCS_raw) if self.params["clstr_distance"] == "manhattan": - pass - # ########### Manhattan Clustering ############## - # # Prepare initial centers using K-Means++ method. - # initial_centers = kmeans_plusplus_initializer(F, self.n_states).initialize() - # # create metric that will be used for clustering - # manhattan_metric = distance_metric(type_metric.MANHATTAN) - # # Create instance of K-Means algorithm with prepared centers. - # kmeans_ = kmeans(F, initial_centers, metric=manhattan_metric) - # # Run cluster analysis and obtain results. - # kmeans_.process() - # Z = self.clusters_lst2idx(kmeans_.get_clusters()) - # F_cent = np.array(kmeans_.get_centers()) + ########### Manhattan Clustering ############## + kmeans_ = KMeansCustom( + n_clusters=n_clusters, + n_init=100, + init="k-means++", + metric="manhattan", + ).fit(F) + kmeans_.cluster_centers_ = kmeans_.cluster_centers_.astype(np.float32) + F_cent = kmeans_.cluster_centers_ else: ########### Euclidean Clustering ############## - kmeans_ = KMeans(n_clusters=n_clusters, n_init=500).fit(F) + kmeans_ = KMeans(n_clusters=n_clusters, n_init=500, init="k-means++").fit(F) kmeans_.cluster_centers_ = kmeans_.cluster_centers_.astype(np.float32) F_cent = kmeans_.cluster_centers_ @@ -265,11 +233,11 @@ def estimate_dFC(self, time_series): F = self.dFC_mat2vec(dFC_raw.get_dFC_mat(TRs=dFC_raw.TR_array)) + # The code below is similar for both clustering methods, + # but is kept this way for clarity. if self.params["clstr_distance"] == "manhattan": - pass - # ########### Manhattan Clustering ############## - # self.kmeans_.predict(F) - # Z = self.clusters_lst2idx(self.kmeans_.get_clusters()) + ########### Manhattan Clustering ############## + Z = self.kmeans_.predict(F.astype(np.float32)) else: ########### Euclidean Clustering ############## Z = self.kmeans_.predict(F.astype(np.float32)) diff --git a/pydfc/dfc_utils.py b/pydfc/dfc_utils.py index 2e9ab73..e1a449c 100644 --- a/pydfc/dfc_utils.py +++ b/pydfc/dfc_utils.py @@ -15,6 +15,8 @@ import seaborn as sns from nilearn.plotting import plot_markers from scipy import signal, stats +from sklearn.cluster import kmeans_plusplus +from sklearn.metrics import pairwise_distances # np.seterr(invalid='ignore') @@ -438,6 +440,79 @@ def dFC_vec2mat(F, N): return C +############################ K-means Clustering with Manhattan distance ############################ + + +class KMeansCustom: + def __init__( + self, n_clusters, max_iter=300, n_init=100, init="k-means++", metric="manhattan" + ): + self.n_clusters = n_clusters + self.max_iter = max_iter + self.n_init = n_init + self.init = init + self.metric = metric + self.labels_ = None + self.cluster_centers_ = None + self.inertia_ = None + + def _custom_distance(self, p1, p2): + return pairwise_distances([p1], [p2], metric=self.metric)[0][0] + + def _assign_clusters(self, X, centroids): + clusters = [] + for x in X: + distances = [self._custom_distance(x, c) for c in centroids] + clusters.append(np.argmin(distances)) + return clusters + + def _compute_centroids(self, X, labels): + centroids = [] + for i in range(self.n_clusters): + points = X[np.array(labels) == i] + centroids.append(points.mean(axis=0)) + return np.array(centroids) + + def fit(self, X): + X = deepcopy(X) + min_inertia = None + best_centroids = None + best_labels = None + for _ in range(self.n_init): + if self.init == "random": + centroids = X[np.random.choice(len(X), self.n_clusters, replace=False)] + elif self.init == "k-means++": + centroids, _ = kmeans_plusplus(X, n_clusters=self.n_clusters) + for _ in range(self.max_iter): + labels = self._assign_clusters(X, centroids) + new_centroids = self._compute_centroids(X, labels) + if np.allclose(centroids, new_centroids, atol=1e-6): + break + centroids = new_centroids + inertia = np.sum( + [ + self._custom_distance(x, centroids[label]) ** 2 + for x, label in zip(X, labels) + ] + ) + if min_inertia is None or inertia < min_inertia: + min_inertia = inertia + best_centroids = centroids + best_labels = labels + + self.labels_ = np.array(best_labels) + self.cluster_centers_ = np.array(best_centroids) + self.inertia_ = min_inertia + return self + + def predict(self, X): + X = deepcopy(X) + return self._assign_clusters(X, self.cluster_centers_) + + +#################################################################################################### + + # test def common_subj_lst(time_series_dict): SUBJECTs = None diff --git a/pyproject.toml b/pyproject.toml index e6ef28d..704e014 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,6 @@ dependencies = [ 'matplotlib', 'networkx', 'nilearn>=0.10.2,!=0.10.3', - 'pyclustering', 'pycwt', 'seaborn', 'statsmodels' From 67f274a2829e7a5bf965b0223ae3498260489a26 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 22 May 2025 15:06:38 -0400 Subject: [PATCH 2/3] minor --- pydfc/dfc_methods/sliding_window_clustr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydfc/dfc_methods/sliding_window_clustr.py b/pydfc/dfc_methods/sliding_window_clustr.py index eae3f9a..8a7c1bb 100644 --- a/pydfc/dfc_methods/sliding_window_clustr.py +++ b/pydfc/dfc_methods/sliding_window_clustr.py @@ -116,7 +116,7 @@ def cluster_FC(self, FCS_raw, n_clusters, n_regions): ########### Manhattan Clustering ############## kmeans_ = KMeansCustom( n_clusters=n_clusters, - n_init=100, + n_init=500, init="k-means++", metric="manhattan", ).fit(F) From 093a1d825de205eb81876f87e6c1f36eb0d46782 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 22 May 2025 19:40:52 -0400 Subject: [PATCH 3/3] add clstr_distance to params instead of default of euclidean --- pydfc/dfc_methods/discrete_hmm.py | 1 + pydfc/dfc_methods/sliding_window_clustr.py | 13 +++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pydfc/dfc_methods/discrete_hmm.py b/pydfc/dfc_methods/discrete_hmm.py index e57de21..77e1a91 100644 --- a/pydfc/dfc_methods/discrete_hmm.py +++ b/pydfc/dfc_methods/discrete_hmm.py @@ -55,6 +55,7 @@ def __init__(self, **params): "measure_name", "is_state_based", "clstr_base_measure", + "clstr_distance", "sw_method", "tapered_window", "dhmm_obs_state_ratio", diff --git a/pydfc/dfc_methods/sliding_window_clustr.py b/pydfc/dfc_methods/sliding_window_clustr.py index 8a7c1bb..25d7386 100644 --- a/pydfc/dfc_methods/sliding_window_clustr.py +++ b/pydfc/dfc_methods/sliding_window_clustr.py @@ -38,12 +38,8 @@ class SLIDING_WINDOW_CLUSTR(BaseDFCMethod): - def __init__(self, clstr_distance="euclidean", **params): + def __init__(self, **params): - assert ( - clstr_distance == "euclidean" or clstr_distance == "manhattan" - ), "Clustering distance not recognized. It must be either \ - euclidean or manhattan." self.logs_ = "" self.TPM = [] self.FCS_ = [] @@ -85,7 +81,12 @@ def __init__(self, clstr_distance="euclidean", **params): self.params["measure_name"] = "Clustering" self.params["is_state_based"] = True - self.params["clstr_distance"] = clstr_distance + + assert ( + self.params["clstr_distance"] == "euclidean" + or self.params["clstr_distance"] == "manhattan" + ), "Clustering distance not recognized. It must be either \ + euclidean or manhattan." assert ( self.params["clstr_base_measure"] in self.base_methods_name_lst