-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathGaussianKluster_usingCarlesDF.py
More file actions
304 lines (238 loc) · 11.9 KB
/
GaussianKluster_usingCarlesDF.py
File metadata and controls
304 lines (238 loc) · 11.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 22 13:06:19 2016
@author: and_ma
"""
import itertools
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import metrics
from sklearn import mixture
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import loadingADHD_Data
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
# Load the ADHD patients dataframe and split it by experiment label (A/B/C).
patientsDF = loadingADHD_Data.load()

# Per-experiment subsets; the non-numeric bookkeeping columns are dropped so
# only the feature columns remain for clustering.
ApatientsDF = patientsDF[patientsDF['experiment']=='A']
dropApatientsDF = ApatientsDF.drop(['experiment', 'patientName'], axis=1)
BpatientsDF = patientsDF[patientsDF['experiment']=='B']
dropBpatientsDF = BpatientsDF.drop(['experiment', 'patientName'], axis=1)
CpatientsDF = patientsDF[patientsDF['experiment']=='C']
dropCpatientsDF = CpatientsDF.drop(['experiment', 'patientName'], axis=1)

# BUGFIX: the original called scaler.fit_transform(x_train) on an undefined
# variable (NameError on import) and the scaled result was never used
# downstream, so the broken call is removed.  The scaler instance is kept in
# case standardisation is reintroduced later.
scaler = StandardScaler()

# Full feature matrix (all experiments together) plus per-group sample
# counts, used below to slice the PCA projection back into the A/B/C groups.
dropPatientsDF = patientsDF.drop(['experiment','patientName'], axis=1)
numberA = len(patientsDF[patientsDF['experiment']=='A'])
numberB = len(patientsDF[patientsDF['experiment']=='B'])
numberC = len(patientsDF[patientsDF['experiment']=='C'])
### PCA decomposition: only 3 components are kept, purely for visualisation.
pca = PCA(n_components=3)
newdataPCA = pca.fit_transform(dropPatientsDF.values)  # rows = samples projected on the 3 PCs
pca.explained_variance_ratio_
# the sum of the variance ratios for three components of the PCA is 0.928521380016

# Slice boundaries and plot styling for the three experiment groups, in the
# row order in which the groups appear in the dataframe (A, then B, then C).
pcaGroups = [(0, numberA, 'o', 'blue', 'A'),
             (numberA, numberA + numberB, '^', 'red', 'B'),
             (numberA + numberB, len(newdataPCA), '+', 'green', 'C')]

# 3D scatter of the samples in PCA space (visual inspection only).
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')
plt.rcParams['legend.fontsize'] = 10
for lo, hi, marker, colour, grp in pcaGroups:
    ax.plot(newdataPCA[lo:hi, 0], newdataPCA[lo:hi, 1], newdataPCA[lo:hi, 2],
            marker, markersize=8, color=colour, alpha=0.5, label=grp)
plt.title('Samples for experiment A, B and C in the PCA space')
ax.legend(loc='upper right')
ax.set_xlabel('First PCA base vector')
ax.set_ylabel('Second PCA base vector')
ax.set_zlabel('Third PCA base vector')
plt.show()
# no conclusion from that: only visualization

### Same samples restricted to the first two principal components.
fig = plt.figure(figsize=(8,8))
plt.rcParams['legend.fontsize'] = 10
for lo, hi, marker, colour, grp in pcaGroups:
    plt.plot(newdataPCA[lo:hi, 0], newdataPCA[lo:hi, 1],
             marker, markersize=8, color=colour, alpha=0.5, label=grp)
plt.title('Samples for experiment A, B and C in the PCA space')
plt.legend(loc='upper right')
plt.show()
#### Gaussian Clustering : Gaussian mixture models
#GMM methods
#aic(X) Akaike information criterion for the current model fit
#bic(X) Bayesian information criterion for the current model fit
#fit(X[, y]) Estimate model parameters with the EM algorithm.
#fit_predict(X[, y]) Fit and then predict labels for data.
#get_params([deep]) Get parameters for this estimator.
#predict(X) Predict label for data.
#predict_proba(X) Predict posterior probability of data under each Gaussian in the model.
#sample([n_samples, random_state]) Generate random samples from the model.
#score(X[, y]) Compute the log probability under the model.
#score_samples(X) Return the per-sample likelihood of the data under the model.
#set_params(**params) Set the parameters of this estimator.
def getBestGMM(X, n_components, cv_types):
    '''
    Fit a Gaussian mixture for every (covariance type, component count)
    combination and keep the model with the lowest BIC.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Data to cluster.
    n_components : iterable of int
        Candidate numbers of mixture components to try.
    cv_types : iterable of str
        Covariance structures to try ('spherical', 'tied', 'diag', 'full').

    Returns
    -------
    best_gmm : the fitted model with the lowest BIC (None if nothing was fit)
    color_iter : itertools.cycle of colour codes for downstream bar plots
    bic : np.ndarray of BIC scores, cv_type-major, component count minor
    silhouette : list of silhouette scores, recorded for 'tied' models only
    '''
    lowest_bic = np.infty
    best_gmm = None  # BUGFIX: was left unbound if no fit improved lowest_bic
    bic = []
    silhouette = []
    for cv_type in cv_types:
        # BUGFIX: the loop previously iterated the *global* n_components_range
        # and shadowed the n_components parameter; it now uses the argument.
        for k in n_components:
            # Fit a mixture of Gaussians with EM
            gmm = mixture.GMM(n_components=k, covariance_type=cv_type)
            gmm.fit(X)
            bic.append(gmm.bic(X))
            Y_predicted = gmm.predict(X)
            if cv_type == 'tied':
                # Only 'tied' is tracked: a first run showed it is the best
                # covariance structure for this data.
                silhouette.append(metrics.silhouette_score(X, Y_predicted, metric='euclidean'))
            # (The original `if n_components >= 1` guard was always true for
            # the ranges used and has been removed as dead code.)
            if bic[-1] < lowest_bic:
                lowest_bic = bic[-1]
                best_gmm = gmm
    bic = np.array(bic)
    color_iter = itertools.cycle(['k', 'r', 'g', 'b', 'c', 'm', 'y'])
    return best_gmm, color_iter, bic, silhouette
def plotBIC_Scores(color_iter, n_components, cv_types, bic):
    '''
    Bar plot of the BIC score of every fitted model, grouped by covariance
    type, with a '*' marking the minimum-BIC model.

    Parameters
    ----------
    color_iter : iterable of colour codes, one per covariance type
    n_components : iterable of int, the component counts that were tried
    cv_types : iterable of str, the covariance structures that were tried
    bic : np.ndarray of BIC scores, cv_type-major (as built by getBestGMM)
    '''
    # BUGFIX: the function previously read the *global* n_components_range
    # everywhere and ignored its n_components parameter.
    n_components = list(n_components)
    spl = plt.subplot(1, 1, 1)
    bars = []
    for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
        # Offset each covariance type's bars so the groups sit side by side.
        xpos = np.array(n_components) + .2 * (i - 2)
        bars.append(plt.bar(xpos, bic[i * len(n_components):
                                      (i + 1) * len(n_components)],
                            width=.2, color=color))
    plt.xticks(n_components)
    plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
    plt.title('Bayesian Information Criteria score per model')
    # Place a '*' above the bar of the overall minimum-BIC model.
    xpos = np.mod(bic.argmin(), len(n_components)) + .65 +\
        .2 * np.floor(bic.argmin() / len(n_components))
    plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
    spl.set_xlabel('Number of components')
    spl.legend([b[0] for b in bars], cv_types)
### 1. Get best model for A patients
X_A = dropApatientsDF.values
# Search grid: candidate cluster counts and covariance structures.
n_components_range = range(2,15)
cv_types = ['spherical', 'tied', 'diag', 'full']
clf_A,color_iter,bic,silhouette_A = getBestGMM(X_A,n_components_range,cv_types)
### 2. Save the Clf winner results
# Hard cluster label for every experiment-A sample.
Y_A = clf_A.predict(X_A)
# Very interesting clf_A.predict_proba(X_A), it gives back the actual likelihood
# Value metrics silhouette score
best_A = metrics.silhouette_score(X_A, Y_A, metric='euclidean')
print ("Best sillhouette A patients ",best_A)
### 3. Plotting sillhouette for cluster A
plt.figure()
# One silhouette value per component count (recorded for 'tied' models only).
plt.plot(np.arange(2,15),np.array(silhouette_A),color='b')
plt.title("Group A Sillhouette Values for GMM Klustering, Best = %s"%(best_A))
plt.xlabel("K value: number of clusters")
plt.ylabel("Silhouette Score")
### 4. Ploting GMM clustering result using PCA Components
pca = PCA(n_components=3)
newdataPCA = pca.fit_transform(X_A) # concatenates vectors row by row
pca.explained_variance_ratio_
# Attach the first two PCA coordinates and the cluster label to each sample.
plotApatientsGMM = ApatientsDF.copy()
plotApatientsGMM['PCA_x'] = newdataPCA.T[0]
plotApatientsGMM['PCA_y'] = newdataPCA.T[1]
plotApatientsGMM['cluster'] = Y_A
plotApatientsGMM.sort_values(by='cluster', ascending= True, inplace=True) # Dataframe is sorted by cluster type: 0 or 1
### Indexes for ploting
# NOTE(review): number1/number2 assume exactly two clusters (labels 0 and 1);
# if the best model has more components the second plot lumps them together.
number1 = len(plotApatientsGMM[plotApatientsGMM['cluster']==0])
number2 = len(plotApatientsGMM[plotApatientsGMM['cluster']==1])
### Plot result clustering in two dimensions
fig = plt.figure(figsize=(8,8))
plt.rcParams['legend.fontsize'] = 10
plt.plot(plotApatientsGMM['PCA_x'].values[0:number1], plotApatientsGMM['PCA_y'].values[0:number1], 'o', markersize=8, color='blue', alpha=0.5, label='0')
plt.plot(plotApatientsGMM['PCA_x'].values[number1:], plotApatientsGMM['PCA_y'].values[number1:], '^', markersize=8, alpha=0.5, color='red', label='1')
plt.xlabel('PCA eingenvector 1')
plt.ylabel('PCA eingenvector 2')
plt.title('Samples for experiment A, GMM Clustering')
plt.legend(loc='upper right')
plt.show()
# Persist the per-sample cluster assignment for group A.
plotApatientsGMM.to_csv('gmmApatients.csv', sep=',', encoding='utf-8')
###################################### END Patients A
#### Gaussian Clustering: Experiment B
### 1. Get best model for B patients
X_B = dropBpatientsDF.values
# Search grid: candidate cluster counts and covariance structures.
n_components_range = range(2,15)
cv_types = ['spherical', 'tied', 'diag', 'full']
clf_B,color_iter,bic,silhouette_B = getBestGMM(X_B,n_components_range,cv_types)
### 2. Save the Clf winner results
# Hard cluster label for every experiment-B sample.
Y_B= clf_B.predict(X_B)
# Value metrics silhouette score
best_B = metrics.silhouette_score(X_B, Y_B, metric='euclidean')
print ("Best sillhouette B patients ",best_B)
### 3. Plotting sillhouette for cluster B
plt.figure()
# One silhouette value per component count (recorded for 'tied' models only).
plt.plot(np.arange(2,15),np.array(silhouette_B),color='b')
plt.title("Group B Sillhouette Values for GMM Klustering, Best = %s"%best_B)
plt.xlabel("K value: number of clusters")
plt.ylabel("Silhouette Score")
### 4. Ploting GMM clustering result using PCA Components
pca = PCA(n_components=3)
newdataPCA = pca.fit_transform(X_B) # concatenates vectors row by row
pca.explained_variance_ratio_
# Attach the first two PCA coordinates and the cluster label to each sample.
plotBpatientsGMM = BpatientsDF.copy()
plotBpatientsGMM['PCA_x'] = newdataPCA.T[0]
plotBpatientsGMM['PCA_y'] = newdataPCA.T[1]
plotBpatientsGMM['cluster'] = Y_B
plotBpatientsGMM.sort_values(by='cluster', ascending= True, inplace=True) # Dataframe is sorted by cluster type: 0 or 1
### Indexes for ploting
# NOTE(review): number1/number2 assume exactly two clusters (labels 0 and 1);
# if the best model has more components the second plot lumps them together.
number1 = len(plotBpatientsGMM[plotBpatientsGMM['cluster']==0])
number2 = len(plotBpatientsGMM[plotBpatientsGMM['cluster']==1])
### Plot result clustering in two dimensions
fig = plt.figure(figsize=(8,8))
plt.rcParams['legend.fontsize'] = 10
plt.plot(plotBpatientsGMM['PCA_x'].values[0:number1], plotBpatientsGMM['PCA_y'].values[0:number1], 'o', markersize=8, color='blue', alpha=0.5, label='0')
plt.plot(plotBpatientsGMM['PCA_x'].values[number1:], plotBpatientsGMM['PCA_y'].values[number1:], '^', markersize=8, alpha=0.5, color='red', label='1')
plt.xlabel('PCA eingenvector 1')
plt.ylabel('PCA eingenvector 2')
plt.title('Samples for experiment B, GMM Clustering')
plt.legend(loc='upper right')
plt.show()
# Persist the per-sample cluster assignment for group B.
plotBpatientsGMM.to_csv('gmmBpatients.csv', sep=',', encoding='utf-8')
###################################### END Patients B
#### Gaussian Clustering: Experiment C
### 1. Get best model for C patients
X_C = dropCpatientsDF.values
# Search grid: candidate cluster counts and covariance structures.
n_components_range = range(2,15)
cv_types = ['spherical', 'tied', 'diag', 'full']
clf_C,color_iter,bic,silhouette_C = getBestGMM(X_C,n_components_range,cv_types)
### 2. Save the Clf winner results
# Hard cluster label for every experiment-C sample.
Y_C = clf_C.predict(X_C)
# Value metrics silhouette score
best_C = metrics.silhouette_score(X_C, Y_C, metric='euclidean')
print ("Best sillhouette C patients ",best_C)
### 3. Plotting sillhouette for cluster C
plt.figure()
# One silhouette value per component count (recorded for 'tied' models only).
plt.plot(np.arange(2,15),np.array(silhouette_C),color='b')
plt.title("Group C Sillhouette Values for GMM Klustering, Best = %s"%best_C)
plt.xlabel("K value: number of clusters")
plt.ylabel("Silhouette Score")
### 4. Ploting GMM clustering result using PCA Components
pca = PCA(n_components=3)
newdataPCA = pca.fit_transform(X_C) # concatenates vectors row by row
pca.explained_variance_ratio_
# Attach the first two PCA coordinates and the cluster label to each sample.
plotCpatientsGMM = CpatientsDF.copy()
plotCpatientsGMM['PCA_x'] = newdataPCA.T[0]
plotCpatientsGMM['PCA_y'] = newdataPCA.T[1]
plotCpatientsGMM['cluster'] = Y_C
plotCpatientsGMM.sort_values(by='cluster', ascending= True, inplace=True) # Dataframe is sorted by cluster type: 0 or 1
### Indexes for ploting
# NOTE(review): number1/number2 assume exactly two clusters (labels 0 and 1);
# if the best model has more components the second plot lumps them together.
number1 = len(plotCpatientsGMM[plotCpatientsGMM['cluster']==0])
number2 = len(plotCpatientsGMM[plotCpatientsGMM['cluster']==1])
### Plot result clustering in two dimensions
fig = plt.figure(figsize=(8,8))
plt.rcParams['legend.fontsize'] = 10
plt.plot(plotCpatientsGMM['PCA_x'].values[0:number1], plotCpatientsGMM['PCA_y'].values[0:number1], 'o', markersize=8, color='blue', alpha=0.5, label='0')
plt.plot(plotCpatientsGMM['PCA_x'].values[number1:], plotCpatientsGMM['PCA_y'].values[number1:], '^', markersize=8, alpha=0.5, color='red', label='1')
plt.xlabel('PCA eingenvector 1')
plt.ylabel('PCA eingenvector 2')
plt.title('Samples for experiment C, GMM Clustering')
plt.legend(loc='upper right')
plt.show()
# Persist the per-sample cluster assignment for group C.
plotCpatientsGMM.to_csv('gmmCpatients.csv', sep=',', encoding='utf-8')
#imagePath_C = '/Users/and_ma/Documents/DataScience/UB_DataScience/DataScience_Project/TeamWork/imagesGMM/C_GMM_Clustering.png'
#plt.savefig(imagePath_C, format='png', dpi=900)