Assignment3ISML/isml_assignment_3_.py at main · myshkin451/Assignment3ISML · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


# Open and read the data; the first value on each line is the label and the
# remaining values on that line are the sample's features.
images = []
labels = []

# A context manager guarantees the file handle is closed even when a row
# fails to parse; newline="" is the mode the csv module asks for on open().
with open('mnist.csv', "r", newline="") as file:
    reader = csv.reader(file)

    for line in reader:
        # csv.reader yields an empty list for blank lines; skip them instead
        # of failing on line[0].
        if not line:
            continue

        labels.append(float(line[0]))
        images.append([float(data) for data in line[1:]])


# ---------------------Question 1---------------------------

# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
# Use sklearn's PCA to project the data, then report the covariance matrices
def question1_sklearn():
    """Run sklearn's PCA (10 components) on the global ``images`` and print covariance statistics.

    Printed, in order: the covariance matrix of the projected data, the shape
    of the projected data, the sum of all entries of the input covariance
    matrix, and the sum of all entries of the projected covariance matrix.
    """
    reducer = PCA(n_components=10)
    projected = reducer.fit_transform(images)

    # rowvar=False: columns are the variables, rows are the observations.
    input_cov_total = np.sum(np.cov(images, rowvar=False))
    projected_cov = np.cov(projected, rowvar=False)

    print(projected_cov)
    print("Shape of transformed data (using sklearn):", projected.shape)
    print("Sum of input covariance matrix (using sklearn):", input_cov_total)
    print("Sum of transformed covariance matrix (using sklearn):", np.sum(projected_cov))

def question1(images, n_components=10):
    """Project ``images`` onto its leading principal components (manual PCA).

    The data is centered, the covariance matrix of the features is
    eigendecomposed, and the centered data is projected onto the eigenvectors
    with the largest eigenvalues. The projected covariance matrix, the shape
    of the projected data, and the entry sums of the input and projected
    covariance matrices are printed.

    Args:
        images: 2-D array-like with one sample per row and one feature per
            column.
        n_components: number of leading principal components to keep.

    Returns:
        The projected data as an array with one row per sample and
        ``n_components`` columns.
    """
    data = np.asarray(images, dtype=float)

    # centralize the data
    centered_data = data - data.mean(axis=0)

    # calculate the covariance matrix (once; it is reused for the printed sum)
    cov_matrix = np.cov(centered_data, rowvar=False)

    # calculate the eigenvalues and eigenvectors; eigh is the solver for
    # symmetric matrices, which a covariance matrix is
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # sort the eigenvalues in descending order and keep the leading ones
    sorted_indices = np.argsort(eigenvalues)[::-1]
    top_indices = sorted_indices[:n_components]
    projection_matrix = eigenvectors[:, top_indices]

    # project the data onto the new low-dimensional space
    transformed_data = centered_data @ projection_matrix
    final_cov_matrix = np.cov(transformed_data, rowvar=False)

    print(final_cov_matrix)
    print("Shape of transformed data (manual implementation):", transformed_data.shape)
    print("Sum of input covariance matrix (manual implementation):", np.sum(cov_matrix))
    print("Sum of transformed covariance matrix (manual implementation):", np.sum(final_cov_matrix))

    return transformed_data

# ---------------------End Question 1-------------------------


# ---------------------Question 2---------------------------

# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
def question2_sklearn():
    """Cluster the global ``images`` into 10 groups with sklearn's KMeans.

    Prints the sum of the coordinates of every centroid.

    Returns:
        A tuple ``(centroids, labels, centroid_sums)``: the fitted cluster
        centers, the cluster labels produced by the fit, and a list holding
        the coordinate sum of each centroid.
    """
    model = KMeans(n_clusters=10, random_state=1)
    model.fit(images)

    centers = model.cluster_centers_
    assignments = model.labels_

    # Sum every centroid first, then report them in centroid order.
    totals = []
    for center in centers:
        totals.append(np.sum(center))

    for index, total in enumerate(totals):
        print(f"Sum of centroid {index}: {total}")

    return centers, assignments, totals

def question2():
    """Placeholder for Question 2: no implementation yet, always returns None."""
    return None

# ---------------------End Question 2-------------------------

# ---------------------Question 3---------------------------
def question3_sklearn(images, max_iters=300):
    """Plot the k-means loss as a function of the allowed number of iterations.

    For every iteration cap from 1 to ``max_iters`` a fresh 10-cluster KMeans
    (random init, a single run, ``random_state=1``) is fitted on ``images``
    with ``max_iter`` set to that cap, and its ``inertia_`` is recorded. The
    recorded values are then shown as a line plot.

    Args:
        images: the samples to cluster, passed straight to ``KMeans.fit``.
        max_iters: the largest iteration cap to try.
    """
    iteration_caps = range(1, max_iters + 1)

    # inertia_ (sklearn): sum of squared distances of samples to their
    # closest cluster center, weighted by the sample weights if provided.
    losses = [
        KMeans(n_clusters=10, max_iter=cap, n_init=1, random_state=1, init='random')
        .fit(images)
        .inertia_
        for cap in iteration_caps
    ]

    # Draw the loss curve.
    plt.plot(iteration_caps, losses)
    plt.xlabel('Iterations')
    plt.ylabel('Loss')
    plt.title('K-means Loss Curve')
    plt.show()
# ---------------------End Question 3-------------------------

# ---------------------Question 4---------------------------
# train_data: the first 4000 samples.
# validation_data: the remaining samples (2000 when the dataset has 6000 rows).
def split_data(images, train_size=4000):
    """Split ``images`` into a leading training part and a trailing validation part.

    Args:
        images: a sliceable sequence of samples.
        train_size: number of leading samples that form the training part;
            defaults to 4000.

    Returns:
        A tuple ``(train_data, validation_data)`` where ``train_data`` is
        ``images[:train_size]`` and ``validation_data`` is everything after it.

    Raises:
        ValueError: if ``train_size`` is negative, because a negative slice
            bound would silently count from the end of the sequence.
    """
    if train_size < 0:
        raise ValueError("train_size must be non-negative")
    return images[:train_size], images[train_size:]

#  Computes the validation loss given the centroids and the validation data
def compute_validation_loss(centroids, validation_data):
    """Return the k-means loss of ``validation_data`` for fixed ``centroids``.

    Every sample is matched with its nearest centroid (Euclidean distance)
    and the squared distances to those nearest centroids are summed.

    Args:
        centroids: array-like of shape (k, d), one centroid per row.
        validation_data: array-like of shape (n, d), one sample per row.

    Returns:
        The summed squared distance as a float; 0.0 when there are no samples.
    """
    centers = np.asarray(centroids, dtype=float)
    samples = np.asarray(validation_data, dtype=float)
    if samples.size == 0:
        return 0.0

    # Track each sample's smallest squared distance one centroid at a time.
    # This avoids the (k, n, d) temporary of a full broadcast and the
    # sqrt-then-square round trip: the nearest centroid is the same whether
    # distances are compared squared or unsquared.
    min_sq_dist = np.full(samples.shape[0], np.inf)
    for center in centers:
        sq_dist = ((samples - center) ** 2).sum(axis=1)
        np.minimum(min_sq_dist, sq_dist, out=min_sq_dist)

    return float(min_sq_dist.sum())

def question4():
    """Choose the number of clusters that gives the lowest validation loss.

    The global ``images`` is split with ``split_data``; for each candidate k
    a KMeans model (``random_state=1``, ``n_init=10``) is fitted on the
    training part and scored on the validation part with
    ``compute_validation_loss``. The k with the smallest loss is printed.
    """
    # Candidate values of k.
    candidate_ks = [5, 10, 15, 20, 25, 30]
    train_data, validation_data = split_data(images)

    # For each k, fit the k-means model and compute its validation loss.
    validation_losses = []
    for k in candidate_ks:
        model = KMeans(n_clusters=k, random_state=1, n_init=10)
        model.fit(train_data)
        loss = compute_validation_loss(model.cluster_centers_, validation_data)
        validation_losses.append(loss)

    # Report the candidate with the smallest validation loss.
    best_position = np.argmin(validation_losses)
    optimal_k = candidate_ks[best_position]
    print(f"Optimal k value: {optimal_k}")

    # NOTE: plotting of validation_losses against candidate_ks is currently
    # disabled; no figure is produced by this function.


# ---------------------End Question 4-------------------------

# ---------------------Question 5---------------------------
# ---------------------End Question 5-------------------------

# Run each question (uncomment the calls below as needed)
# question1_sklearn()
# question1(images, 10)
# question2_sklearn()
# question3_sklearn(images)
# question4()