Skip to content
6 changes: 5 additions & 1 deletion AUTHORS
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
Michel Albert (exhuma@users.sourceforge.net)
Sam Sandberg (@LoisaidaSam)
Sam Sandberg (@LoisaidaSam)

high dimensionality functionalities:
Jose J. GarciaAranda (@jjaranda13)
Juan Ramos Diaz (@juanrd0088)
131 changes: 131 additions & 0 deletions HDexample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# -*- coding: cp1252 -*-
###############################################################################
# High Dimensionality problem example
# Authors:
# 2015 Jose Javier Garcia Aranda , Juan Ramos Diaz
#
###############################################################################
# This High Dimensionality example creates N items (which are "users").
# Each user is defined by his profile.
# A profile is a tuple of 10 pairs of keyword and weight ( 20 fields in total)
# weights are floating numbers and belong to 0..1
# The summation of weights of a profile is normalized to 1
# we consider 1000 different keywords
# A profile takes 8 keywords from the first 300 keywords (the "popular" keywords)
# and 2 keywords from the remaining 700 keywords
# Each keyword is a dimension. Therefore there are 1000 possible dimensions
# A single user only has 10 dimensions
# Different users can have different dimensions.
# A new distance and equality function are defined for this use case
#
# cl = KMeansClustering(users,HDdistItems,HDequals);
#
# Additionally, the number of iterations can now be limited in order to save time
# Experimentally, we have concluded that 10 iterations are accurate enough for most cases.
# The new HDgetclusters() function is linear: it avoids the recalculation of centroids,
# whereas the original function getclusters() has N*N complexity, because it recalculates the
# centroid whenever an item is moved from one cluster to another.
# This new function can be used for low and high dimensionality problems, increasing
# performance in both cases
#
# solution = cl.HDgetclusters(numclusters,max_iterations);
#
# Another optimization available inside the HDcentroid() function is the use of the mean instead of the median for centroid calculation.
# The median is more accurate but involves more computations when N is huge.
# The function HDcentroid() is invoked internally by HDgetclusters()
#
# The optional invocation of HDcomputeSSE() assists the computation of the optimal number of clusters.
#
#
from cluster import KMeansClustering
from cluster import ClusteringError
from cluster import util
from cluster.util import HDcentroid
from cluster.HDdistances import HDdistItems, HDequals, HDcomputeSSE
import time
import datetime

import random

def createProfile():
    """Build one random user profile for the high-dimensionality example.

    A profile is a flat tuple of 10 (keyword, weight) pairs, i.e. 20 fields:
    ``(keyword_0, weight_0, keyword_1, weight_1, ...)``.

    * The first 8 keywords are drawn, without repetition, from the
      "popular" keywords ``0..299``.
    * The last 2 keywords are drawn, without repetition, from the
      remaining keywords ``300..999``.
    * Keywords are stored as strings, weights as floats.
    * Weights are divided by their total so that they sum to 1.

    :return: the profile as a tuple of 20 fields.
    """
    # random.sample guarantees distinct keywords inside each range; the two
    # ranges are disjoint, so the whole profile has no repeated keyword.
    popular_words = random.sample(range(0, 300), 8)
    rare_words = random.sample(range(300, 1000), 2)
    words = popular_words + rare_words

    weights = [random.uniform(0, 1) for _ in words]
    total_weight = sum(weights)

    # Normalise every pair (not just the first few) so that the weights of
    # the returned profile really sum to 1.
    returned_profile = ()
    for word, weight in zip(words, weights):
        returned_profile += (str(word), weight / total_weight)
    return returned_profile

####################################################
# MAIN #
####################################################
sses=[0]*10 #stores the sse metric for each number of clusters from 5 to 50
num_users=100
numsse=0
numclusters=5 # starts at 5
max_iteraciones=10
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to rename this to max_iterations to keep the code in English, so other readers have an easier time reading this.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

of course, i will rename it

ts = time.time()
start_time=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can simplify these two lines by simply using:

start_time = datetime.now()

There is also no need to use time.time() first an there is also no need to run strftime. Python takes care of that when printing (but the format will be slightly different).

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you are right

while numclusters<=50: # compute SSE from num_clusters=5 to 50
supersol=0#supersolucion, distancias entre el clusters y los usuarios.
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please translate this comment to English.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

of course, i will do it

users=[] # users are the items of this example
for i in range(num_users):#en el range el numero de usuarios
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please translate this comment to English

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

of course, i will do it

user = createProfile()
users.append(user)
#x=0;
print " inicializing kmeans..."
cl = KMeansClustering(users,HDdistItems,HDequals);
print " executing...",numclusters
ts = time.time()
st=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as with start_time above.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

print st
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not Python3 compatible (print is a function in Python 3). You might want to add this to the beginning of the module (if you are using Python 2):

from __future__ import print_function

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks, i will do

numclusters=numclusters
solution = cl.HDgetclusters(numclusters,max_iteraciones);
for i in range(numclusters):
a = solution[i]
print util.HDcentroid(a),","
ts = time.time()
st=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as with start_time above.

I get the feeling that you are printing this to get some kind of "progress" on your screen and to see how long things take.

While this is only an example file, I suggest having a look into the logging package. It allows you to easily emit status messages of a running process. By default it will not display the timestamp though, you need to configure it, or use my package https://github.com/exhuma/gouge

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok


sses[numsse]=HDcomputeSSE(solution,numclusters)
numsse+=1
numclusters+=5
ts = time.time()
end_time=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as with start_time above.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

print "start_time:",start_time
print "end_time:",end_time
print "sses:",sses

48 changes: 48 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,51 @@ The parameter passed to getclusters is the count of clusters generated.
.. image:: https://readthedocs.org/projects/python-cluster/badge/?version=latest
:target: http://python-cluster.readthedocs.org
:alt: Documentation Status



2015/07/20 NEW FUNCTIONALITIES FOR HIGH AND LOW DIMENSIONALITY PROBLEMS
=======================================================================
Authors of new added functionalities:
- Garcia Aranda, Jose Javier jose_javier.garcia_aranda@alcatel-lucent.com
- Ramos Diaz, Juan juanrd0088@gmail.com

Acknowledgements:
Authors want to thank the Spanish Economy & competitiveness Ministry which funds this research
through "INNPACTO" innovation program IPT-2012-0839-430000.


High dimensionality (HD) problems are those which have items with a high number of dimensions.
There are two types of HD problems::
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The :: at the end of the line marks the beginning of a "plain-text" block. I would rather format this like the following:

[...]
There are two type of HD problems:

a) Set of items [...]
b) Set of items [...]::

      items1 = (X=2 [...]
      items2 = (x=6 [...] 

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, you are right

a)set of items with large number of dimensions.
b)set of items with a limited number of dimensions from a large available number of dimensions
For example considering dimensions X, Y, Z, K, L, M and the items:
item1=(X=2, Z=5, L=7)
item2=(X=6, Y=5, M=7)

HD problems involve a high computational cost because distance functions in this case take more
operations than in low dimensionality problems.

For case "b" (valid also for "a"), a new distance for HD problems is available: HDdistItems() ,HDequals()
This distance function compares dimensions between 2 items.
Each dimension of item1 is searched in item2, and if it is found, then the distance takes into account the difference (manhattan style). If the dimension does not exist in item2, a maximum value is added to the total distance between item1 and item2

There is no difference with current usage::

>>>cl = KMeansClustering(users,HDdistItems,HDequals);


Additionally, the number of iterations can now be limited in order to save time.
Experimentally, we have concluded that 10 iterations are accurate enough for most cases.
The new HDgetclusters() function is linear: it avoids the recalculation of centroids,
whereas the original function getclusters() has N*N complexity, because it recalculates the
Comment thread
exhuma marked this conversation as resolved.
centroid whenever an item is moved from one cluster to another.
This new function can be used for low and high dimensionality problems, increasing
performance in both cases::

>>>solution = cl.HDgetclusters(numclusters,max_iterations)

Another optimization available inside the HDcentroid() function is the use of the mean instead of the median for centroid calculation.
The median is more accurate but involves more computations when N is huge.
The function HDcentroid() is invoked internally by HDgetclusters()

71 changes: 71 additions & 0 deletions cluster/HDdistances.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@

# This file provides functionalities for High dimensionality problems but also for low dimensionality problems
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of a comment, this should preferably be a docstring, making is available to documentation tools like Sphinx.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fully agree. i will do it

# - New Distance computation
# - SSE metric computation to assist the computation of the optimal number of clusters
#
# Authors:
# Jose Javier Garcia Aranda
# Juan Ramos Diaz



#from cluster import KMeansClustering
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove these "disabled" code-lines.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

#import KMeansClustering
#import ClusteringError
import util
import time
import datetime

import random

def HDdistItems(profile1,profile2):
#Distance function, this distance between two profiles is based on:
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be a docstring

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, you are right. I will change it

#For each keyword of user A, if the keyword is not present in user B , then the distance for this keyword is the weight in the user A.
#If the keyword exists in both users, the weights are compared and the distance is the absolute difference
len1=len(profile1)/2
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Has this been tested in both Python 2 & 3? Division behaves a bit differently in both 2 & 3. In Python 2 this will always return an int (rounding down), whereas in Python 3 this might return a float on uneven lengths!

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we will make tests in python2 and 3 . I dont remember now

len2=len(profile2)/2
total_len=len1+len2 #this value usually is 20
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you look at this function in isolation (without any other context), then this comment is not correct and should be removed.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i agree

factor_len=20.0/total_len #this only work if the profile has less than 10 keys
distance = 0.0
marked=[0]*(total_len*2);
for i in range(len1):
found=False
for j in range(len2):
if profile1[i*2]==profile2[j*2]:
distance+=abs(profile1[i*2+1]-profile2[j*2+1]);
found=True;
marked[j*2]=1;
break;
if found==False:
distance+=profile1[i*2+1];

for i in range(len2):
if marked[i*2]==1:
continue;
distance+=profile2[i*2+1]

distance=distance*factor_len
return distance

def HDequals(profile1,profile2):
for i in range(10):
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this hardcoded to 10?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is designed for profiles of length=10 elements, although the number of possible dimensions may be huge. We can make it configurable, give us some weeks. In June will be ready

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. No problems. I'm not in a hurry ;) I will leave this PR open until I hear from you again.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

everything solved!

for j in range(10):
if profile1[i*2]!=profile2[j*2]:
return False
elif profile1[i*2+1]!=profile2[j*2+1]:
return False
return True
#return True;

def HDcomputeSSE(solution,numclusters):
#This metric measures the cohesion of users within a cluster and the separation among clusters at the same time
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be a docstring.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fully agree

partial_solution=0
total_solution=0
dist=0
for i in range(numclusters):
partial_solution=0
for j in solution[i]:
dist=HDdistItems(util.HDcentroid(solution[i]),j)
partial_solution+=dist*dist
total_solution+=partial_solution
return total_solution
107 changes: 105 additions & 2 deletions cluster/method/kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@
# along with this library; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# new functions: HDgetCluster() and HDassignItem by:
# 2015 Jose Javier Garcia Aranda, Juan Ramos Diaz


from cluster.util import ClusteringError, centroid, minkowski_distance
from cluster.util import ClusteringError, centroid, minkowski_distance, HDcentroid
import time
import datetime


class KMeansClustering(object):
Expand Down Expand Up @@ -166,3 +169,103 @@ def initialise_clusters(self, input_, clustercount):
for item in input_:
self.__clusters[count % clustercount].append(item)
count += 1


def HDgetclusters(self, count, max_iterations):
"""
Generates *count* clusters.

:param count: The amount of clusters that should be generated. count
must be greater than ``1``.
:raises ClusteringError: if *count* is out of bounds.
"""

# only proceed if we got sensible input
if count <= 1:
raise ClusteringError("When clustering, you need to ask for at "
"least two clusters! "
"You asked for %d" % count)

# return the data straight away if there is nothing to cluster
if (self.__data == [] or len(self.__data) == 1 or
count == self.__initial_length):
return self.__data

# It makes no sense to ask for more clusters than data-items available
if count > self.__initial_length:
raise ClusteringError(
"Unable to generate more clusters than "
"items available. You supplied %d items, and asked for "
"%d clusters." % (self.__initial_length, count))

self.initialise_clusters(self.__data, count)

items_moved = True # tells us if any item moved between the clusters,
# as we initialised the clusters, we assume that
# is the case

iteration=0
#do not force a fixed number of iterations: iterate as the k-means algorithm dictates,
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please translate this to English

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

of course, i will do

#stop once max_iterations is reached; finishing before that limit is even better
while items_moved is True:
items_moved = False
print "iterating",iteration
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be replaced with a logging statement. Otherwise it will cause unwanted "noise" on stdout for everyone using this method which is even impossible to silence without modifying the code.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, you are right

ts = time.time()
st=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be simplified with (see also my comments on the "example" script for more detail):

st = datetime.now()

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

print st
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either remove this, or replace it with a logging statement

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

iteration=iteration+1

#computation of centroids
my_centroids={} # new!!
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These "new" comments don't add any useful information to the code and should be removed.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i agree

for cluster in self.__clusters:# new!!
one_centroid=HDcentroid(cluster)# new!!
my_centroids[one_centroid]=cluster # new!!




#this few lines are new:
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These "disabled" lines should be removed.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

#print centroids . it works, for debug purposes only!!
#for i in my_centroids.keys():
# print "key:",i # print the centroid!!
# print "value:",my_centroids[i] # print all elements of the cluster!!
#print my_centroids.keys()[0] # imprime el primer centroide. es una prueba

#now we scan the N items without recalculation of centroids. Therefore, it is linear
for cluster in self.__clusters:
for centroid_aux, cluster_aux in my_centroids.iteritems():
if cluster_aux == cluster:
centroid_cluster=centroid_aux
break;
for item in cluster:
res = self.HDassign_item(item, cluster,centroid_cluster,my_centroids)#modified!!
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This "modified" comment should be removed.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

if items_moved is False:
items_moved = res

if (iteration == max_iterations):
items_moved = False
return self.__clusters


def HDassign_item(self, item, origin, origin_centroid, my_centroids):
"""
Assigns an item from a given cluster to the closest located cluster.

:param item: the item to be moved.
:param origin: the originating cluster.
:param origin_centroid: centroid of the originating cluster
:my_centroids: dictionary of centroid,cluster
"""
closest_cluster=origin #my_centroids[closest_centroid]=closest_cluster
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment should be removed.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

closest_centroid=origin_centroid
#for cluster in self.__clusters:
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment should be removed

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

for centro in my_centroids.keys():
if self.distance(item, centro) < self.distance(
item, closest_centroid):
closest_cluster = my_centroids[centro]

if id(closest_cluster) != id(origin):
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a specific reason why you used id( here instead of using the is operator? If not, this might be a bit more pythonic by writing:

if closest_cluster is origin:
    ...

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you are right

self.move_item(item, origin, closest_cluster)
return True
else:
return False
Loading