-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathk_means.py
More file actions
101 lines (80 loc) · 3.39 KB
/
k_means.py
File metadata and controls
101 lines (80 loc) · 3.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
from __future__ import division
__author__ = 'sagabhan, ctewani'
from collections import Counter, defaultdict
from Queue import PriorityQueue
from data_cache import Cache
from random import sample
import knn_kmeans
from numpy import array, linalg
class Node(object):
    """Pair a distance with the index of the centroid it was measured to.

    Instances compare by ``dist`` only, so they can be stored in a heap or
    ``PriorityQueue`` and retrieved smallest-distance first.

    Inherits from ``object`` so the class is new-style under Python 2 as
    well as Python 3.
    """

    def __init__(self, dist, centeriodIndex):
        # dist: distance from a sample to the centroid
        self.dist = dist
        # centeriodIndex: position of that centroid in the centroid list
        self.centeriodIndex = centeriodIndex

    def __lt__(self, other):
        # Ordering ignores the centroid index; only the distance matters.
        return self.dist < other.dist

    def __repr__(self):
        return "Node(dist=%r, centeriodIndex=%r)" % (
            self.dist, self.centeriodIndex)
def _nearest_centeriod_index(vector, centeriods, isNumpy):
    """Return the index of the centroid closest to ``vector`` (Manhattan distance).

    When ``isNumpy`` is true, ``centeriods`` must already be numpy arrays and
    ``vector`` is converted once here; otherwise both are plain sequences.
    On ties the lowest index wins. Raises ``ValueError`` if ``centeriods``
    is empty.
    """
    if isNumpy:
        point = array(vector)
        distances = [sum(abs(point - centeriod)) for centeriod in centeriods]
    else:
        # assumes every feature vector has the same length as the centroids
        distances = [
            sum(abs(a - b) for a, b in zip(vector, centeriod))
            for centeriod in centeriods
        ]
    # min() returns the first minimal element, so ties go to the lowest index.
    return min(range(len(distances)), key=distances.__getitem__)


def kmeans(l, k, isNumpy, max_iterations=10):
    """Cluster ``Cache.train`` with k-means, then delegate to ``knn_kmeans``.

    Each training sample is expected to carry its feature vector at index 2.
    Centroids start as the feature vectors of ``l`` randomly sampled training
    items; every iteration assigns each sample to its nearest centroid
    (Manhattan distance) and replaces each centroid with the mean of its
    members.

    Args:
        l: number of clusters to build (must be >= 1).
        k: forwarded unchanged to ``knn_kmeans.knn_means`` (presumably the
            neighbour count for the k-NN step -- confirm against that module).
        isNumpy: when true, distances are computed on numpy arrays.
        max_iterations: number of assignment/update rounds to run.

    Returns:
        Whatever ``knn_kmeans.knn_means`` returns for the final centroids and
        the per-cluster lists of training samples.

    Raises:
        ValueError: if ``l`` is smaller than 1, or from ``random.sample`` when
            ``l`` exceeds the number of training samples.
    """
    if l < 1:
        # Without any centroid there is nothing to assign samples to.
        raise ValueError("kmeans needs at least one cluster (l >= 1)")

    # randomly initialize clusters
    cluster_centeriods = [x[2] for x in sample(Cache.train, l)]
    cluster_nodes_acc = defaultdict(list)

    for iterations in range(max_iterations):
        cluster_nodes_acc = defaultdict(list)
        cluster_node_count = Counter()

        # per-cluster running sums, one slot per feature dimension
        dimensions = len(cluster_centeriods[0])
        cluster_sums = [[0] * dimensions for _ in cluster_centeriods]

        if isNumpy:
            # convert the centroids once per round instead of once per sample
            targets = [array(centeriod) for centeriod in cluster_centeriods]
        else:
            targets = cluster_centeriods

        # assign each sample in training set to its nearest centroid
        for train in Cache.train:
            nearest = _nearest_centeriod_index(train[2], targets, isNumpy)
            sums = cluster_sums[nearest]
            for vectorIndex in range(dimensions):
                sums[vectorIndex] += train[2][vectorIndex]
            cluster_nodes_acc[nearest].append(train)
            cluster_node_count[nearest] += 1

        # mean of cluster; a cluster that attracted no samples keeps its old
        # centroid instead of degenerating into an empty vector.
        # (true division relies on the file's ``from __future__ import division``)
        cluster_centeriods = [
            [total / cluster_node_count[i] for total in cluster_sums[i]]
            if cluster_node_count[i] else cluster_centeriods[i]
            for i in range(len(cluster_centeriods))
        ]

        # single formatted string so the output is identical on Python 2 and 3
        print("iterations %d" % iterations)

    return knn_kmeans.knn_means(k, cluster_centeriods, cluster_nodes_acc, isNumpy)