-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathisml_assignment_3_.py
More file actions
162 lines (120 loc) · 5.48 KB
/
isml_assignment_3_.py
File metadata and controls
162 lines (120 loc) · 5.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
# Open and read the data. The first value in each line is the label; the
# remaining values form that sample's feature vector.
images = []
labels = []
# The context manager guarantees the file is closed even if a row fails to
# parse; newline='' is the csv module's recommended way to open its input.
with open('mnist.csv', "r", newline='') as file:
    reader = csv.reader(file)
    for line in reader:
        if not line:
            # csv.reader yields an empty list for blank lines; skip them
            # instead of failing on line[0].
            continue
        labels.append(float(line[0]))
        images.append([float(data) for data in line[1:]])
# ---------------------Question 1---------------------------
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
# Reduce the data with sklearn's PCA, then print covariance statistics of the input and the transformed data
def question1_sklearn():
    """Run sklearn's PCA (10 components) on the module-level ``images``.

    Prints the covariance matrix of the transformed data, the shape of the
    transformed data, and the sums of the input and transformed covariance
    matrices. Returns nothing.
    """
    reducer = PCA(n_components=10)
    reduced = reducer.fit_transform(images)

    # Covariance between features (columns), before and after the projection.
    input_cov_total = np.sum(np.cov(images, rowvar=False))
    reduced_cov = np.cov(reduced, rowvar=False)

    print(reduced_cov)
    print("Shape of transformed data (using sklearn):", reduced.shape)
    print("Sum of input covariance matrix (using sklearn):", input_cov_total)
    print("Sum of transformed covariance matrix (using sklearn):", np.sum(reduced_cov))
def question1(images, n_components=10):
    """Manual PCA: project ``images`` onto its leading principal components.

    Prints the covariance matrix of the projected data, the shape of the
    projected data, and the sums of the input and projected covariance
    matrices.

    Args:
        images: 2-D array-like with one sample per row and one feature per
            column.
        n_components: Number of leading principal components to keep.

    Returns:
        None. Results are printed.
    """
    samples = np.asarray(images, dtype=float)

    # centralize the data
    centered_data = samples - samples.mean(axis=0)

    # calculate the covariance matrix between features (columns); it is
    # computed once and reused for the "input covariance" sum below
    cov_matrix = np.cov(centered_data, rowvar=False)

    # calculate the eigenvalues and eigenvectors (eigh: the covariance matrix
    # is symmetric)
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # sort the eigenvalues in descending order and keep the eigenvectors that
    # belong to the largest ones
    sorted_indices = np.argsort(eigenvalues)[::-1]
    top_indices = sorted_indices[:n_components]
    projection_matrix = eigenvectors[:, top_indices]

    # project the data onto the new low-dimensional space
    transformed_data = np.dot(centered_data, projection_matrix)
    final_cov_matrix = np.cov(transformed_data, rowvar=False)

    print(final_cov_matrix)
    print("Shape of transformed data (manual implementation):", transformed_data.shape)
    print("Sum of input covariance matrix (manual implementation):", np.sum(cov_matrix))
    print("Sum of transformed covariance matrix (manual implementation):", np.sum(final_cov_matrix))
# ---------------------End Question 1-------------------------
# ---------------------Question 2---------------------------
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
def question2_sklearn():
    """Cluster the module-level ``images`` into 10 groups with sklearn's KMeans.

    Prints the sum of each centroid's coordinates.

    Returns:
        A tuple ``(centroids, labels, centroid_sums)``: the fitted cluster
        centers, the cluster index assigned to each sample, and a list with
        the coordinate sum of every centroid.
    """
    # fit a 10-cluster model with a fixed seed
    model = KMeans(n_clusters=10, random_state=1)
    model.fit(images)

    centroids = model.cluster_centers_
    assignments = model.labels_

    # sum each centroid's coordinates, reporting as we go
    centroid_sums = []
    for index, centroid in enumerate(centroids):
        total = np.sum(centroid)
        centroid_sums.append(total)
        print(f"Sum of centroid {index}: {total}")

    return centroids, assignments, centroid_sums
def question2():
    """Placeholder for Question 2; does nothing and returns None."""
    return None
# ---------------------End Question 2-------------------------
# ---------------------Question 3---------------------------
def question3_sklearn(images, max_iters=300):
    """Plot the K-means loss as a function of the iteration cap.

    For every cap from 1 to ``max_iters`` a fresh 10-cluster KMeans model
    (random init, single run, fixed seed) is fitted with ``max_iter`` set to
    that cap, and its ``inertia_`` is recorded. The recorded values are then
    shown as a line plot.

    Args:
        images: Training data passed to ``KMeans.fit``.
        max_iters: Largest iteration cap to evaluate.
    """
    iteration_caps = range(1, max_iters + 1)

    # from sklearn kmean: inertia_ is the sum of squared distances of samples
    # to their closest cluster center, weighted by the sample weights if provided.
    losses = [
        KMeans(n_clusters=10, max_iter=cap, n_init=1, random_state=1, init='random')
        .fit(images)
        .inertia_
        for cap in iteration_caps
    ]

    # Plotting the loss curve
    plt.plot(iteration_caps, losses)
    plt.xlabel('Iterations')
    plt.ylabel('Loss')
    plt.title('K-means Loss Curve')
    plt.show()
# ---------------------End Question 3-------------------------
# ---------------------Question 4---------------------------
# train_data: the first `train_size` samples (4000 by default).
# validation_data: all remaining samples.
def split_data(images, train_size=4000):
    """Split ``images`` into a leading training part and a trailing validation part.

    Args:
        images: Sliceable sequence of samples.
        train_size: Number of leading samples that go into the training set.

    Returns:
        A tuple ``(train_data, validation_data)``.

    Raises:
        ValueError: If ``train_size`` is negative. A negative slice bound
            would count from the end and silently produce a wrong split.
    """
    if train_size < 0:
        raise ValueError("train_size must be non-negative")
    return images[:train_size], images[train_size:]
# Computes the validation loss given the centroids and the validation data
def compute_validation_loss(centroids, validation_data):
    """Return the K-means loss of ``validation_data`` for fixed ``centroids``.

    The loss is the sum, over all samples, of the squared Euclidean distance
    between the sample and its nearest centroid.

    Args:
        centroids: 2-D array-like, one centroid per row.
        validation_data: 2-D array-like, one sample per row, with the same
            number of columns as ``centroids``.

    Returns:
        The total loss as a NumPy float; 0.0 when there are no samples.

    Raises:
        ValueError: If ``centroids`` is empty.
    """
    centers = np.asarray(centroids, dtype=float)
    points = np.asarray(validation_data, dtype=float)
    if centers.size == 0:
        raise ValueError("at least one centroid is required")
    if points.size == 0:
        return np.float64(0.0)

    # Track each sample's smallest squared distance one centroid at a time.
    # This avoids materialising the full (centroids x samples x features)
    # difference array, and no square root is needed because the loss uses
    # squared distances anyway.
    nearest_sq_dist = np.full(points.shape[0], np.inf)
    for center in centers:
        sq_dist = np.sum((points - center) ** 2, axis=1)
        np.minimum(nearest_sq_dist, sq_dist, out=nearest_sq_dist)
    return nearest_sq_dist.sum()
def question4():
    """Pick the number of clusters with the lowest validation loss.

    Splits the module-level ``images`` with ``split_data``, fits a KMeans
    model on the training part for each candidate k, scores the resulting
    centroids on the validation part with ``compute_validation_loss``, and
    prints the k whose loss is smallest.
    """
    train_data, validation_data = split_data(images)

    # candidate numbers of clusters
    k_range = [5, 10, 15, 20, 25, 30]

    # one validation loss per candidate k, in the same order as k_range
    validation_losses = [
        compute_validation_loss(
            KMeans(n_clusters=k, random_state=1, n_init=10).fit(train_data).cluster_centers_,
            validation_data,
        )
        for k in k_range
    ]

    # the candidate with the smallest validation loss wins
    optimal_k = k_range[np.argmin(validation_losses)]
    print(f"Optimal k value: {optimal_k}")

    # # Plotting the validation loss curve
    # plt.plot(k_range, validation_losses)
    # plt.xlabel('k')
    # plt.ylabel('Validation Loss')
    # plt.title('Validation Loss vs. k')
    # plt.show()
# ---------------------End Question 4-------------------------
# ---------------------Question 5---------------------------
# ---------------------End Question 5-------------------------
# Run each question (uncomment the call you want to execute)
# question1_sklearn()
# question1(images, 10)
# question2_sklearn()
# question3_sklearn(images)
# question4()