-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathCURE.py
More file actions
130 lines (114 loc) · 5.11 KB
/
CURE.py
File metadata and controls
130 lines (114 loc) · 5.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# -*- coding: utf-8 -*-
###########################################################################################
# Implementation of CURE (Clustering Using Representatives) Clustering Algorithm
# Author for codes: Chu Kun(kun_chu@outlook.com)
# Paper: https://www.sciencedirect.com/science/article/pii/S0306437901000084
# Reference: https://github.com/Kchu/CURE-cluster-python
###########################################################################################
import numpy as np
import scipy.spatial.distance as distance
import sys
# Returns the distance between two vectors
def dist(vecA, vecB):
return np.sqrt(np.power(vecA - vecB, 2).sum())
# This class describes the data structure and method of operation for CURE clustering.
class CureCluster:
def __init__(self, id__, center__):
self.points = center__
self.repPoints = center__
self.center = center__
self.index = [id__]
def __repr__(self):
return "Cluster " + " Size: " + str(len(self.points))
# Computes and stores the centroid of this cluster, based on its points
def computeCentroid(self, clust):
totalPoints_1 = len(self.index)
totalPoints_2 = len(clust.index)
self.center = (self.center*totalPoints_1 + clust.center*totalPoints_2) / (totalPoints_1 + totalPoints_2)
# Computes and stores representative points for this cluster
def generateRepPoints(self, numRepPoints, alpha):
tempSet = None
for i in range(1, numRepPoints+1):
maxDist = 0
maxPoint = None
for p in range(0, len(self.index)):
if i == 1:
minDist = dist(self.points[p,:], self.center)
else:
X = np.vstack([tempSet, self.points[p, :]])
tmpDist = distance.pdist(X)
minDist = tmpDist.min()
if minDist >= maxDist:
maxDist = minDist
maxPoint = self.points[p,:]
if tempSet is None:
tempSet = maxPoint
else:
tempSet = np.vstack((tempSet, maxPoint))
for j in range(len(tempSet)):
if self.repPoints is None:
self.repPoints = tempSet[j,:] + alpha * (self.center - tempSet[j,:])
else:
self.repPoints = np.vstack((self.repPoints, tempSet[j,:] + alpha * (self.center - tempSet[j,:])))
# Computes and stores distance between this cluster and the other one.
def distRep(self, clust):
distRep = float('inf')
for repA in self.repPoints:
if type(clust.repPoints[0]) != list:
repB = clust.repPoints
distTemp = dist(repA, repB)
if distTemp < distRep:
distRep = distTemp
else:
for repB in clust.repPoints:
distTemp = dist(repA, repB)
if distTemp < distRep:
distRep = distTemp
return distRep
# Merges this cluster with the given cluster, recomputing the centroid and the representative points.
def mergeWithCluster(self, clust, numRepPoints, alpha):
self.computeCentroid(clust)
self.points = np.vstack((self.points, clust.points))
self.index = np.append(self.index, clust.index)
self.repPoints = None
self.generateRepPoints(numRepPoints, alpha)
# Describe the process of the CURE algorithm
def runCURE(data, numRepPoints, alpha, numDesCluster):
# Initialization
Clusters = []
numCluster = len(data)
numPts = len(data)
distCluster = np.ones([len(data), len(data)])
distCluster = distCluster * float('inf')
for idPoint in range(len(data)):
newClust = CureCluster(idPoint, data[idPoint,:])
Clusters.append(newClust)
for row in range(0, numPts):
for col in range(0, row):
distCluster[row][col] = dist(Clusters[row].center, Clusters[col].center)
while numCluster > numDesCluster:
if np.mod(numCluster, 50) == 0:
print('Cluster count:', numCluster)
# Find a pair of closet clusters
minIndex = np.where(distCluster == np.min(distCluster))
minIndex1 = minIndex[0][0]
minIndex2 = minIndex[1][0]
# Merge
Clusters[minIndex1].mergeWithCluster(Clusters[minIndex2], numRepPoints, alpha)
# Update the distCluster matrix
for i in range(0, minIndex1):
distCluster[minIndex1, i] = Clusters[minIndex1].distRep(Clusters[i])
for i in range(minIndex1+1, numCluster):
distCluster[i, minIndex1] = Clusters[minIndex1].distRep(Clusters[i])
# Delete the merged cluster and its disCluster vector.
distCluster = np.delete(distCluster, minIndex2, axis=0)
distCluster = np.delete(distCluster, minIndex2, axis=1)
del Clusters[minIndex2]
numCluster = numCluster - 1
print('Cluster count:', numCluster)
# Generate sample labels
Label = [0] * numPts
for i in range(0, len(Clusters)):
for j in range(0, len(Clusters[i].index)):
Label[Clusters[i].index[j]] = i + 1
return Label