# Source: Renyi-Fair-Inference/FairKmeans.py at master · optimization-for-data-driven-science/Renyi-Fair-Inference · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import math
import csv
import numpy as np
from sklearn.preprocessing import normalize

# Fairness-penalty weights swept by the experiment loop below.
lambda_list = [0, 0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.5, 1, 10, 20]

# Only the first MAX_SAMPLES rows of the CSV take part in the experiment.
MAX_SAMPLES = 10000

# Each CSV row is read as three float features (columns 0-2) followed by an
# integer label (column 3) that is collected separately in `s`.
data = []
s = []
# newline='' lets the csv module handle line endings itself, as its
# documentation requires for file objects passed to csv.reader.
with open('bank.csv', newline='') as csv_file:
    for row in csv.reader(csv_file):
        data.append([float(row[0]), float(row[1]), float(row[2])])
        s.append(int(row[3]))

# Truncate features and labels together so that row i of `data` always
# corresponds to s[i] and both arrays have the same length.
data = data[:MAX_SAMPLES]
s = np.array(s[:MAX_SAMPLES])

# Rescale along axis 0 (per feature column) after truncation, so the
# scaling only depends on the rows that are actually used.
data = normalize(np.array(data), axis=0)

n, m = data.shape
print(n, m)

def _update_centers(points, labels, n_clusters):
    """Return the mean of the points assigned to each cluster.

    A cluster with no members receives a fresh uniform-random centre in
    [0, 1); non-empty clusters get the exact mean of their members.
    """
    # Start from random rows so empty clusters are re-seeded; every
    # non-empty cluster's row is overwritten (not accumulated into) below.
    new_centers = np.random.rand(n_clusters, points.shape[1])
    for cluster in range(n_clusters):
        members = points[labels == cluster]
        if len(members):
            new_centers[cluster] = members.mean(axis=0)
    return new_centers


def _fair_kmeans(points, sensitive, n_clusters, lam, max_iterations=100):
    """Cluster ``points`` with a fairness-penalised K-means variant.

    Each point i is assigned to the cluster j minimising
    ``||x_i - c_j||^2 - lam * (w_j - s_i)^2``, where ``w_j`` is the fraction
    of cluster j's members whose ``sensitive`` value equals 1. ``w`` is
    refreshed after every single assignment (not once per pass), and the
    centres are recomputed after each full pass over the points.

    Args:
        points: (n, m) array of feature rows.
        sensitive: sequence with at least n integer entries; entry i
            belongs to ``points[i]``.
        n_clusters: number of clusters; the first ``n_clusters`` rows of
            ``points`` are used as the initial centres.
        lam: weight of the fairness term in the assignment score.
        max_iterations: safety cap on the number of full passes.

    Returns:
        ``(labels, weights, centers)``: an (n,) int array of cluster
        indices, an (n_clusters,) float array of per-cluster group-1
        fractions, and an (n_clusters, m) array of centres.

    Raises:
        ValueError: if there are fewer points than clusters.
    """
    n_points = points.shape[0]
    if n_points < n_clusters:
        raise ValueError("need at least as many points as clusters")

    sensitive = np.asarray(sensitive)[:n_points]
    in_group1 = sensitive == 1

    # Every point starts in cluster 0 with all weights at zero.
    labels = np.zeros(n_points, dtype=int)
    weights = np.zeros(n_clusters)
    centers = points[:n_clusters].copy()

    # Running membership counts: they let the weights be refreshed in O(1)
    # after each assignment instead of rescanning every point per cluster.
    cluster_size = np.zeros(n_clusters, dtype=int)
    cluster_group1 = np.zeros(n_clusters, dtype=int)
    cluster_size[0] = n_points
    cluster_group1[0] = np.count_nonzero(in_group1)

    for _ in range(max_iterations):
        # Snapshot, not alias: `labels` is mutated in place below, so a
        # plain reference would always compare equal and stop after one pass.
        previous_labels = labels.copy()

        for i in range(n_points):
            sq_dist = np.sum((centers - points[i]) ** 2, axis=1)
            scores = sq_dist - lam * (weights - sensitive[i]) ** 2

            old = labels[i]
            new = int(np.argmin(scores))  # first minimum wins on ties
            labels[i] = new

            cluster_size[old] -= 1
            cluster_size[new] += 1
            if in_group1[i]:
                cluster_group1[old] -= 1
                cluster_group1[new] += 1

            # Only the two touched clusters can have a changed fraction.
            # A cluster that just became empty keeps its previous weight.
            for cluster in (old, new):
                if cluster_size[cluster] != 0:
                    weights[cluster] = (
                        cluster_group1[cluster] / cluster_size[cluster]
                    )

        centers = _update_centers(points, labels, n_clusters)

        # Converged once a full pass leaves every assignment unchanged.
        if np.array_equal(labels, previous_labels):
            break
    else:
        import warnings

        warnings.warn(
            "fair k-means stopped after %d passes without converging"
            % max_iterations
        )

    return labels, weights, centers


for lam in lambda_list:
    for k in range(5, 21):
        a, w, centers = _fair_kmeans(data, s, k, lam)

        print('################w##########################')
        print(k)
        print("Lambda: ", lam)
        for weight in w:
            print(weight)

        # Loss is the sum of (unsquared) Euclidean distances from each
        # point to the centre of its assigned cluster.
        loss = np.linalg.norm(data - centers[a], axis=1).sum()
        print(loss)