-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathhierarchicalClusterer.py
More file actions
249 lines (206 loc) · 8.39 KB
/
hierarchicalClusterer.py
File metadata and controls
249 lines (206 loc) · 8.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
from queue import PriorityQueue
import math
"""
Example code for hierarchical clustering
"""
def getMedian(alist):
"""get median value of list alist"""
tmp = list(alist)
tmp.sort()
alen = len(tmp)
if (alen % 2) == 1:
return tmp[alen // 2]
else:
return (tmp[alen // 2] + tmp[(alen // 2) - 1]) / 2
def normalizeColumn(column):
"""Normalize column using Modified Standard Score"""
median = getMedian(column)
asd = sum([abs(x - median) for x in column]) / len(column)
result = [(x - median) / asd for x in column]
return result
class hClusterer:
""" this clusterer assumes that the first column of the data is a label
not used in the clustering. The other columns contain numeric data"""
def __init__(self, filename):
file = open(filename)
self.data = {}
self.counter = 0
self.queue = PriorityQueue()
lines = file.readlines()
file.close()
header = lines[0].split(',')
self.cols = len(header)
self.data = [[] for i in range(len(header))]
for line in lines[1:]:
cells = line.split(',')
toggle = 0
for cell in range(self.cols):
if toggle == 0:
self.data[cell].append(cells[cell])
toggle = 1
else:
self.data[cell].append(float(cells[cell]))
# now normalize number columns (that is, skip the first column)
for i in range(1, self.cols):
self.data[i] = normalizeColumn(self.data[i])
###
### I have read in the data and normalized the
### columns. Now for each element i in the data, I am going to
### 1. compute the Euclidean Distance from element i to all the
### other elements. This data will be placed in neighbors,
### which is a Python dictionary. Let's say i = 1, and I am
### computing the distance to the neighbor j and let's say j
### is 2. The neighbors dictionary for i will look like
### {2: ((1,2), 1.23), 3: ((1, 3), 2.3)... }
###
### 2. find the closest neighbor
###
### 3. place the element on a priority queue, called simply queue,
### based on the distance to the nearest neighbor (and a counter
### used to break ties.
# now push distances on queue
rows = len(self.data[0])
for i in range(rows):
minDistance = 99999
nearestNeighbor = 0
neighbors = {}
for j in range(rows):
if i != j:
dist = self.distance(i, j)
if i < j:
pair = (i,j)
else:
pair = (j,i)
neighbors[j] = (pair, dist)
if dist < minDistance:
minDistance = dist
nearestNeighbor = j
nearestNum = j
# create nearest Pair
if i < nearestNeighbor:
nearestPair = (i, nearestNeighbor)
else:
nearestPair = (nearestNeighbor, i)
# put instance on priority queue
self.queue.put((minDistance, self.counter,
[[self.data[0][i]], nearestPair, neighbors]))
self.counter += 1
def distance(self, i, j):
sumSquares = 0
for k in range(1, self.cols):
sumSquares += (self.data[k][i] - self.data[k][j])**2
return math.sqrt(sumSquares)
def cluster(self):
done = False
while not done:
topOne = self.queue.get()
nearestPair = topOne[2][1]
if not self.queue.empty():
nextOne = self.queue.get()
nearPair = nextOne[2][1]
tmp = []
##
## I have just popped two elements off the queue,
## topOne and nextOne. I need to check whether nextOne
## is topOne's nearest neighbor and vice versa.
## If not, I will pop another element off the queue
## until I find topOne's nearest neighbor. That is what
## this while loop does.
##
while nearPair != nearestPair:
tmp.append((nextOne[0], self.counter, nextOne[2]))
self.counter += 1
nextOne = self.queue.get()
nearPair = nextOne[2][1]
##
## this for loop pushes the elements I popped off in the
## above while loop.
##
for item in tmp:
self.queue.put(item)
if len(topOne[2][0]) == 1:
item1 = topOne[2][0][0]
else:
item1 = topOne[2][0]
if len(nextOne[2][0]) == 1:
item2 = nextOne[2][0][0]
else:
item2 = nextOne[2][0]
## curCluster is, perhaps obviously, the new cluster
## which combines cluster item1 with cluster item2.
curCluster = (item1, item2)
## Now I am doing two things. First, finding the nearest
## neighbor to this new cluster. Second, building a new
## neighbors list by merging the neighbors lists of item1
## and item2. If the distance between item1 and element 23
## is 2 and the distance betweeen item2 and element 23 is 4
## the distance between element 23 and the new cluster will
## be 2 (i.e., the shortest distance).
##
minDistance = 99999
nearestPair = ()
nearestNeighbor = ''
merged = {}
nNeighbors = nextOne[2][2]
for (key, value) in topOne[2][2].items():
if key in nNeighbors:
if nNeighbors[key][1] < value[1]:
dist = nNeighbors[key]
else:
dist = value
if dist[1] < minDistance:
minDistance = dist[1]
nearestPair = dist[0]
nearestNeighbor = key
merged[key] = dist
if merged == {}:
return curCluster
else:
self.queue.put( (minDistance, self.counter,
[curCluster, nearestPair, merged]))
self.counter += 1
def printDendrogram(T, sep=3):
"""Print dendrogram of a binary tree. Each tree node is represented by a
length-2 tuple. printDendrogram is written and provided by David Eppstein
2002. Accessed on 14 April 2014:
http://code.activestate.com/recipes/139422-dendrogram-drawing/ """
def isPair(T):
return type(T) == tuple and len(T) == 2
def maxHeight(T):
if isPair(T):
h = max(maxHeight(T[0]), maxHeight(T[1]))
else:
h = len(str(T))
return h + sep
activeLevels = {}
def traverse(T, h, isFirst):
if isPair(T):
traverse(T[0], h-sep, 1)
s = [' ']*(h-sep)
s.append('|')
else:
s = list(str(T))
s.append(' ')
while len(s) < h:
s.append('-')
if (isFirst >= 0):
s.append('+')
if isFirst:
activeLevels[h] = 1
else:
del activeLevels[h]
A = list(activeLevels)
A.sort()
for L in A:
if len(s) < L:
while len(s) < L:
s.append(' ')
s.append('|')
print (''.join(s))
if isPair(T):
traverse(T[1], h-sep, 0)
traverse(T, maxHeight(T), -1)
filename = '//Users/raz/Dropbox/guide/data/dogs.csv'
hg = hClusterer(filename)
cluster = hg.cluster()
printDendrogram(cluster)