From 197354184f5afaaa3d0cb7d5ce7e479fb68fc4dd Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 22 May 2025 14:54:51 -0400
Subject: [PATCH 1/3] add KMeansCustom with manhattan

---
 pydfc/dfc_methods/sliding_window_clustr.py | 62 +++++-------------
 pydfc/dfc_utils.py                         | 75 ++++++++++++++++++++++
 pyproject.toml                             |  1 -
 3 files changed, 90 insertions(+), 48 deletions(-)

diff --git a/pydfc/dfc_methods/sliding_window_clustr.py b/pydfc/dfc_methods/sliding_window_clustr.py
index 137552c..eae3f9a 100644
--- a/pydfc/dfc_methods/sliding_window_clustr.py
+++ b/pydfc/dfc_methods/sliding_window_clustr.py
@@ -8,13 +8,10 @@
 import time
 
 import numpy as np
-from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
-from pyclustering.cluster.kmeans import kmeans
-from pyclustering.utils.metric import distance_metric, type_metric
 from sklearn.cluster import KMeans
 
 from ..dfc import DFC
-from ..dfc_utils import dFC_mat2vec, dFC_vec2mat
+from ..dfc_utils import KMeansCustom, dFC_mat2vec, dFC_vec2mat
 from ..time_series import TIME_SERIES
 from .base_dfc_method import BaseDFCMethod
 from .sliding_window import SLIDING_WINDOW
@@ -36,9 +33,6 @@
         Input signals.
     dt : float
         Sample spacing.
-
-todo:
-- pyclustering(manhattan) has a problem when suing predict
 """
 
 
@@ -103,32 +97,9 @@ def measure_name(self):
 
     def dFC_mat2vec(self, C_t):
         return dFC_mat2vec(C_t)
-        # if len(C_t.shape)==2:
-        #     assert C_t.shape[0]==C_t.shape[1],\
-        #         'C is not a square matrix'
-        #     return C_t[np.triu_indices(C_t.shape[1], k=0)]
-
-        # F = list()
-        # for t in range(C_t.shape[0]):
-        #     C = C_t[t, : , :]
-        #     assert C.shape[0]==C.shape[1],\
-        #         'C is not a square matrix'
-        #     F.append(C[np.triu_indices(C_t.shape[1], k=0)])
-
-        # F = np.array(F)
-        # return F
 
     def dFC_vec2mat(self, F, N):
         return dFC_vec2mat(F=F, N=N)
-        # C = list()
-        # iu = np.triu_indices(N, k=0)
-        # for i in range(F.shape[0]):
-        #     K = np.zeros((N, N))
-        #     K[iu] = F[i,:]
-        #     K = K + np.multiply(K.T, 1-np.eye(N))
-        #     C.append(K)
-        # C = np.array(C)
-        # return C
 
     def clusters_lst2idx(self, clusters):
         Z = np.zeros((self.F.shape[0],))
@@ -142,21 +113,18 @@ def cluster_FC(self, FCS_raw, n_clusters, n_regions):
         F = self.dFC_mat2vec(FCS_raw)
 
         if self.params["clstr_distance"] == "manhattan":
-            pass
-            # ########### Manhattan Clustering ##############
-            # # Prepare initial centers using K-Means++ method.
-            # initial_centers = kmeans_plusplus_initializer(F, self.n_states).initialize()
-            # # create metric that will be used for clustering
-            # manhattan_metric = distance_metric(type_metric.MANHATTAN)
-            # # Create instance of K-Means algorithm with prepared centers.
-            # kmeans_ = kmeans(F, initial_centers, metric=manhattan_metric)
-            # # Run cluster analysis and obtain results.
-            # kmeans_.process()
-            # Z = self.clusters_lst2idx(kmeans_.get_clusters())
-            # F_cent = np.array(kmeans_.get_centers())
+            ########### Manhattan Clustering ##############
+            kmeans_ = KMeansCustom(
+                n_clusters=n_clusters,
+                n_init=100,
+                init="k-means++",
+                metric="manhattan",
+            ).fit(F)
+            kmeans_.cluster_centers_ = kmeans_.cluster_centers_.astype(np.float32)
+            F_cent = kmeans_.cluster_centers_
         else:
             ########### Euclidean Clustering ##############
-            kmeans_ = KMeans(n_clusters=n_clusters, n_init=500).fit(F)
+            kmeans_ = KMeans(n_clusters=n_clusters, n_init=500, init="k-means++").fit(F)
             kmeans_.cluster_centers_ = kmeans_.cluster_centers_.astype(np.float32)
             F_cent = kmeans_.cluster_centers_
 
@@ -265,11 +233,11 @@ def estimate_dFC(self, time_series):
 
         F = self.dFC_mat2vec(dFC_raw.get_dFC_mat(TRs=dFC_raw.TR_array))
 
+        # Both branches currently make the same predict call; they are kept
+        # separate for clarity.
         if self.params["clstr_distance"] == "manhattan":
-            pass
-            # ########### Manhattan Clustering ##############
-            # self.kmeans_.predict(F)
-            # Z = self.clusters_lst2idx(self.kmeans_.get_clusters())
+            ########### Manhattan Clustering ##############
+            Z = self.kmeans_.predict(F.astype(np.float32))
         else:
             ########### Euclidean Clustering ##############
             Z = self.kmeans_.predict(F.astype(np.float32))
diff --git a/pydfc/dfc_utils.py b/pydfc/dfc_utils.py
index 2e9ab73..e1a449c 100644
--- a/pydfc/dfc_utils.py
+++ b/pydfc/dfc_utils.py
@@ -15,6 +15,8 @@
 import seaborn as sns
 from nilearn.plotting import plot_markers
 from scipy import signal, stats
+from sklearn.cluster import kmeans_plusplus
+from sklearn.metrics import pairwise_distances
 
 # np.seterr(invalid='ignore')
 
@@ -438,6 +440,79 @@ def dFC_vec2mat(F, N):
     return C
 
 
+############################ K-means Clustering with Manhattan distance ############################
+
+
+class KMeansCustom:
+    """K-means-style clustering with a configurable assignment metric.
+
+    Rows go to the nearest centroid under ``metric`` (see sklearn
+    ``pairwise_distances``); centroids are member means. ``fit`` keeps the
+    best of ``n_init`` restarts, scored by the sum of squared distances.
+    """
+
+    def __init__(
+        self, n_clusters, max_iter=300, n_init=100, init="k-means++", metric="manhattan"
+    ):
+        self.n_clusters = n_clusters
+        self.max_iter = max_iter
+        self.n_init = n_init
+        self.init = init
+        self.metric = metric
+        self.labels_ = None
+        self.cluster_centers_ = None
+        self.inertia_ = None
+
+    def _custom_distance(self, p1, p2):
+        """Return the ``metric`` distance between two 1-D points."""
+        return pairwise_distances([p1], [p2], metric=self.metric)[0][0]
+
+    def _assign_clusters(self, X, centroids):
+        """Return an array with the nearest-centroid index of each row (vectorized)."""
+        return pairwise_distances(X, centroids, metric=self.metric).argmin(axis=1)
+
+    def _compute_centroids(self, X, labels, centroids=None):
+        """Return per-cluster means; an empty cluster keeps its row of ``centroids``."""
+        new_centroids = []
+        for i in range(self.n_clusters):
+            points = X[np.asarray(labels) == i]
+            empty = len(points) == 0 and centroids is not None  # mean would be NaN
+            new_centroids.append(centroids[i] if empty else points.mean(axis=0))
+        return np.array(new_centroids)
+
+    def fit(self, X):
+        """Cluster the rows of X (n_samples, n_features); return ``self``."""
+        if self.init not in ("random", "k-means++"):
+            raise ValueError(f"init must be 'random' or 'k-means++', got {self.init!r}")
+        X = np.asarray(X)  # X is never mutated, so no defensive copy is needed
+        best = None  # (inertia, centroids, labels) of the best restart so far
+        for _ in range(self.n_init):
+            if self.init == "random":
+                centroids = X[np.random.choice(len(X), self.n_clusters, replace=False)]
+            else:
+                centroids, _ = kmeans_plusplus(X, n_clusters=self.n_clusters)
+            for _ in range(self.max_iter):
+                labels = self._assign_clusters(X, centroids)
+                new_centroids = self._compute_centroids(X, labels, centroids)
+                if np.allclose(centroids, new_centroids, atol=1e-6):
+                    break
+                centroids = new_centroids
+            # Score against the final centroids so labels_ matches predict(X).
+            distances = pairwise_distances(X, centroids, metric=self.metric)
+            inertia = np.sum(distances.min(axis=1) ** 2)
+            if best is None or inertia < best[0]:
+                best = (inertia, centroids, distances.argmin(axis=1))
+        self.inertia_, self.cluster_centers_, self.labels_ = best
+        return self
+
+    def predict(self, X):
+        """Return the nearest fitted centroid index for each row of X."""
+        return self._assign_clusters(np.asarray(X), self.cluster_centers_)
+
+
+####################################################################################################
+
+
 # test
 def common_subj_lst(time_series_dict):
     SUBJECTs = None
diff --git a/pyproject.toml b/pyproject.toml
index e6ef28d..704e014 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,6 @@ dependencies = [
     'matplotlib',
     'networkx',
     'nilearn>=0.10.2,!=0.10.3',
-    'pyclustering',
     'pycwt',
     'seaborn',
     'statsmodels'

From 67f274a2829e7a5bf965b0223ae3498260489a26 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 22 May 2025 15:06:38 -0400
Subject: [PATCH 2/3] raise n_init of Manhattan KMeansCustom from 100 to 500

---
 pydfc/dfc_methods/sliding_window_clustr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydfc/dfc_methods/sliding_window_clustr.py b/pydfc/dfc_methods/sliding_window_clustr.py
index eae3f9a..8a7c1bb 100644
--- a/pydfc/dfc_methods/sliding_window_clustr.py
+++ b/pydfc/dfc_methods/sliding_window_clustr.py
@@ -116,7 +116,7 @@ def cluster_FC(self, FCS_raw, n_clusters, n_regions):
             ########### Manhattan Clustering ##############
             kmeans_ = KMeansCustom(
                 n_clusters=n_clusters,
-                n_init=100,
+                n_init=500,
                 init="k-means++",
                 metric="manhattan",
             ).fit(F)

From 093a1d825de205eb81876f87e6c1f36eb0d46782 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 22 May 2025 19:40:52 -0400
Subject: [PATCH 3/3] add clstr_distance to params instead of default of
 euclidean

---
 pydfc/dfc_methods/discrete_hmm.py          |  1 +
 pydfc/dfc_methods/sliding_window_clustr.py | 13 +++++++------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/pydfc/dfc_methods/discrete_hmm.py b/pydfc/dfc_methods/discrete_hmm.py
index e57de21..77e1a91 100644
--- a/pydfc/dfc_methods/discrete_hmm.py
+++ b/pydfc/dfc_methods/discrete_hmm.py
@@ -55,6 +55,7 @@ def __init__(self, **params):
             "measure_name",
             "is_state_based",
             "clstr_base_measure",
+            "clstr_distance",
             "sw_method",
             "tapered_window",
             "dhmm_obs_state_ratio",
diff --git a/pydfc/dfc_methods/sliding_window_clustr.py b/pydfc/dfc_methods/sliding_window_clustr.py
index 8a7c1bb..25d7386 100644
--- a/pydfc/dfc_methods/sliding_window_clustr.py
+++ b/pydfc/dfc_methods/sliding_window_clustr.py
@@ -38,12 +38,8 @@
 
 class SLIDING_WINDOW_CLUSTR(BaseDFCMethod):
 
-    def __init__(self, clstr_distance="euclidean", **params):
+    def __init__(self, **params):
 
-        assert (
-            clstr_distance == "euclidean" or clstr_distance == "manhattan"
-        ), "Clustering distance not recognized. It must be either \
-                euclidean or manhattan."
         self.logs_ = ""
         self.TPM = []
         self.FCS_ = []
@@ -85,7 +81,12 @@ def __init__(self, clstr_distance="euclidean", **params):
 
         self.params["measure_name"] = "Clustering"
         self.params["is_state_based"] = True
-        self.params["clstr_distance"] = clstr_distance
+
+        assert (
+            self.params["clstr_distance"] == "euclidean"
+            or self.params["clstr_distance"] == "manhattan"
+        ), "Clustering distance not recognized. It must be either \
+                euclidean or manhattan."
 
         assert (
             self.params["clstr_base_measure"] in self.base_methods_name_lst