11module ParallelKMeans
2- # #######################################
3- # Based on discourse discussion
4- # https://discourse.julialang.org/t/optimization-tips-for-my-julia-code-can-i-make-it-even-faster-and-or-memory-efficient/34614/20
2+
53
64using StatsBase
75import Base. Threads: @spawn , @threads
86
97export kmeans
108
"""
    divider(n, k)

Partition the index range `1:n` into `k` contiguous ranges.

The first `k - 1` ranges each hold `div(n, k)` indices. The final range always ends
at `n`, so it also absorbs whatever remainder is left over. When `k > n` the leading
ranges are empty (`1:0`) and the last one is `1:n`.

# Examples
```julia
divider(10, 3)  # => [1:3, 4:6, 7:10]
```
"""
function divider(n, k)
    chunk = div(n, k)
    # Boundaries between consecutive ranges: [0, chunk, 2chunk, ..., (k-1)chunk, n].
    edges = vcat((0:k-1) * chunk, n)
    # Range i starts one past its left boundary and stops at its right boundary.
    return [(edges[i] + 1):edges[i + 1] for i in 1:(length(edges) - 1)]
end
1617
18+ """
19+ TODO: Document function
20+ """
1721function pl_pairwise! (target, x, y, nth = Threads. nthreads ())
1822 ncol = size (x, 2 )
1923 nrow = size (x, 1 )
@@ -30,6 +34,9 @@ function pl_pairwise!(target, x, y, nth = Threads.nthreads())
3034 target
3135end
3236
37+ """
38+ TODO: Document function
39+ """
3340function inner_pairwise! (target, x, y, r)
3441 ncol = size (x, 2 )
3542 @inbounds for k in axes (y, 1 )
@@ -46,6 +53,9 @@ function inner_pairwise!(target, x, y, r)
4653 target
4754end
4855
56+ """
57+ TODO: Document function
58+ """
4959function pairwise! (target, x, y)
5060 ncol = size (x, 2 )
5161 @inbounds for k in axes (y, 1 )
@@ -62,11 +72,18 @@ function pairwise!(target, x, y)
6272 target
6373end
6474
75+
6576"""
6677 smart_init(X, k; init="k-means++")
6778
6879 This function handles the random initialisation of the centroids from the
6980 design matrix (X) and desired groups (k) that a user supplies.
81+
82+ The `k-means++` algorithm is used by default; if any other string is supplied,
83+ the centroids are instead selected at random from X.
84+
85+ A tuple representing the centroids, number of rows, and number of columns respectively
86+ is returned.
7087"""
7188function smart_init (X:: Array{Float64, 2} , k:: Int ; init:: String = " k-means++" )
7289 n_row, n_col = size (X)
@@ -121,10 +138,14 @@ function smart_init(X::Array{Float64, 2}, k::Int; init::String="k-means++")
121138end
122139
123140
141+
124142"""
125143 sum_of_squares(x, labels, centre)
126144
127- This function computes the total sum of squares
145+ This function computes the total sum of squares based on the assigned labels (labels),
146+ the design matrix (x), and the centroids (centre).
147+
148+ A Float type representing the computed metric is returned.
128149"""
129150function sum_of_squares (x:: Array{Float64,2} , labels:: Array{Int64,1} , centre:: Array )
130151 s = 0.0
@@ -138,42 +159,26 @@ function sum_of_squares(x::Array{Float64,2}, labels::Array{Int64,1}, centre::Arr
138159 return s
139160end
140161
141- # function sum_of_squares(x::Array{Float64,2}, labels::Array{Int64,1}, centre::Array, nth = Base.Threads.nthreads())
142- # s = 0.0
143- #
144- # @inbounds for j in axes(x, 2)
145- # for i in axes(x, 1)
146- # s += (x[i, j] - centre[labels[i], j])^2
147- # end
148- # end
149- #
150- # return s
151- # end
152- #
153- #
154- # function inner_sum_of_squares(x::Array{Float64,2}, labels::Array{Int64,1}, centre::Array, r)
155- # s = 0.0
156- #
157- # @inbounds for j in axes(x, 2)
158- # for i in r
159- # s += (x[i, j] - centre[labels[i], j])^2
160- # end
161- # end
162- #
163- # return s
164- # end
165162
166163"""
167164 kmeans(design_matrix, k; k_init="k-means++", max_iters=300, tol=1e-4, verbose=true)
168165
169- This main function employs the K-means algorithm to cluster all examples
170- in the training data (design_matrix) into k groups using either the
171- `k-means++` or random initialisation.
166+ This main function employs the K-means algorithm to cluster all examples
167+ in the training data (design_matrix) into k groups using either the
168+ `k-means++` or random initialisation technique for selecting the initial
169+ centroids.
170+
171+ At the end of the number of iterations specified (max_iters), convergence is
172+ achieved if the difference between the current and last cost objective is
173+ less than the tolerance level (tol). An error is thrown if convergence fails.
174+
175+ Details of operations can be either printed or not by setting verbose accordingly.
176+
177+ A tuple representing labels, centroids, and sum_squares respectively is returned.
172178
173- design_matrix should have the form (number of points x point dimensionality).
174179"""
175180function kmeans (design_matrix:: Array{Float64, 2} , k:: Int ; k_init:: String = " k-means++" ,
176- max_iters:: Int = 300 , tol = 1e-4 , verbose:: Bool = true )
181+ max_iters:: Int = 300 , tol:: Float64 = 1e-4 , verbose:: Bool = true )
177182
178183 centroids, n_row, n_col = smart_init (design_matrix, k, init= k_init)
179184
0 commit comments