Skip to content

Commit 522ebb0

Browse files
author
Andrey Oskin
committed
refactoring with multiple dispatch
1 parent 0db1ef9 commit 522ebb0

File tree

3 files changed

+107
-63
lines changed

3 files changed

+107
-63
lines changed

src/ParallelKMeans.jl

Lines changed: 80 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,52 @@
11
module ParallelKMeans
22

3-
43
using StatsBase
54
import Base.Threads: @spawn, @threads
65

76
export kmeans
87

8+
abstract type CalculationMode end
9+
10+
struct SingleThread <: CalculationMode end
11+
12+
struct MultiThread <: CalculationMode
13+
n::Int
14+
end
15+
MultiThread() = MultiThread(Threads.nthreads())
16+
17+
"""
18+
pairwise!(target, x, y, mode)
19+
20+
Let X and Y respectively have m and n columns. Then the `pairwise!` function
21+
computes distances between each pair of columns in X and Y and stores the result
22+
in `target` array. Argument `mode` defines calculation mode, currently
23+
the following modes are supported:
24+
- SingleThread()
25+
- MultiThread()
26+
"""
27+
pairwise!(target, x, y) = pairwise!(target, x, y, SingleThread())
28+
29+
function pairwise!(target, x, y, mode::SingleThread)
30+
ncol = size(x, 2)
31+
32+
@inbounds for k in axes(y, 1)
33+
for i in axes(x, 1)
34+
target[i, k] = (x[i, 1] - y[k, 1])^2
35+
end
36+
37+
for j in 2:ncol
38+
for i in axes(x, 1)
39+
target[i, k] += (x[i, j] - y[k, j])^2
40+
end
41+
end
42+
end
43+
target
44+
end
45+
946
"""
10-
TODO 1: Document function
47+
divider(n, k)
48+
49+
Utility function; splits the `1:n` sequence into `k` chunks of approximately the same size.
1150
"""
1251
function divider(n, k)
1352
d = div(n, k)
@@ -16,14 +55,11 @@ function divider(n, k)
1655
end
1756

1857

19-
"""
20-
TODO 2: Document function
21-
"""
22-
function pl_pairwise!(target, x, y, nth = Threads.nthreads())
58+
function pairwise!(target, x, y, mode::MultiThread)
2359
ncol = size(x, 2)
2460
nrow = size(x, 1)
2561

26-
ranges = divider(nrow, nth)
62+
ranges = divider(nrow, mode.n)
2763
waiting_list = Task[]
2864

2965
for i in 1:length(ranges) - 1
@@ -41,7 +77,11 @@ end
4177

4278

4379
"""
44-
TODO 3: Document function
80+
inner_pairwise!(target, x, y, r)
81+
82+
Utility function for calculation of [pairwise!(target, x, y, mode)](@ref) function.
83+
UnitRange argument `r` select subarray of original design matrix `x` that is going
84+
to be processed.
4585
"""
4686
function inner_pairwise!(target, x, y, r)
4787
ncol = size(x, 2)
@@ -61,40 +101,21 @@ function inner_pairwise!(target, x, y, r)
61101
end
62102

63103

64-
"""
65-
TODO 4: Document function
66-
"""
67-
function pairwise!(target, x, y)
68-
ncol = size(x, 2)
69-
70-
@inbounds for k in axes(y, 1)
71-
for i in axes(x, 1)
72-
target[i, k] = (x[i, 1] - y[k, 1])^2
73-
end
74-
75-
for j in 2:ncol
76-
for i in axes(x, 1)
77-
target[i, k] += (x[i, j] - y[k, j])^2
78-
end
79-
end
80-
end
81-
target
82-
end
83-
84-
85104
"""
86105
smart_init(X, k; init="k-means++")
87106
88-
This function handles the random initialisation of the centroids from the
89-
design matrix (X) and desired groups (k) that a user supplies.
107+
This function handles the random initialisation of the centroids from the
108+
design matrix (X) and desired groups (k) that a user supplies.
90109
91-
`k-means++` algorithm is used by default with the normal random selection
92-
of centroids from X used if any other string is attempted.
110+
`k-means++` algorithm is used by default with the normal random selection
111+
of centroids from X used if any other string is attempted.
93112
94-
A tuple representing the centroids, number of rows, & columns respectively
95-
is returned.
113+
A tuple representing the centroids, number of rows, & columns respectively
114+
is returned.
96115
"""
97-
function smart_init(X::Array{Float64, 2}, k::Int; init::String="k-means++")
116+
function smart_init(X::Array{Float64, 2}, k::Int, mode::T = SingleThread();
117+
init::String="k-means++") where {T <: CalculationMode}
118+
98119
n_row, n_col = size(X)
99120

100121
if init == "k-means++"
@@ -111,7 +132,7 @@ function smart_init(X::Array{Float64, 2}, k::Int; init::String="k-means++")
111132

112133
# flatten distances
113134
# distances = vec(pairwise(SqEuclidean(), X, first_centroid_matrix, dims = 1))
114-
pairwise!(distances, X, first_centroid_matrix)
135+
pairwise!(distances, X, first_centroid_matrix, mode)
115136

116137
for i = 2:k
117138
# choose the next centroid, the probability for each data point to be chosen
@@ -127,7 +148,7 @@ function smart_init(X::Array{Float64, 2}, k::Int; init::String="k-means++")
127148
# compute distances from the centroids to all data points
128149
current_centroid_matrix = convert(Matrix, centroids[i, :]')
129150
# new_distances = vec(pairwise(SqEuclidean(), X, current_centroid_matrix, dims = 1))
130-
pairwise!(new_distances, X, first_centroid_matrix)
151+
pairwise!(new_distances, X, first_centroid_matrix, mode)
131152

132153
# and update the squared distance as the minimum distance to all centroid
133154
# distances = minimum([distances, new_distances])
@@ -140,7 +161,6 @@ function smart_init(X::Array{Float64, 2}, k::Int; init::String="k-means++")
140161
# randomly select points from the design matrix as the initial centroids
141162
rand_indices = rand(1:n_row, k)
142163
centroids = X[rand_indices, :]
143-
144164
end
145165

146166
return centroids, n_row, n_col
@@ -150,10 +170,10 @@ end
150170
"""
151171
sum_of_squares(x, labels, centre, k)
152172
153-
This function computes the total sum of squares based on the assigned (labels)
154-
design matrix (x), centroids (centre), and the number of desired groups (k).
173+
This function computes the total sum of squares based on the assigned (labels)
174+
design matrix (x), centroids (centre), and the number of desired groups (k).
155175
156-
A Float type representing the computed metric is returned.
176+
A Float type representing the computed metric is returned.
157177
"""
158178
function sum_of_squares(x::Array{Float64,2}, labels::Array{Int64,1}, centre::Array)
159179
s = 0.0
@@ -171,24 +191,24 @@ end
171191
"""
172192
Kmeans(design_matrix, k; k_init="k-means++", max_iters=300, tol=1e-4, verbose=true)
173193
174-
This main function employs the K-means algorithm to cluster all examples
175-
in the training data (design_matrix) into k groups using either the
176-
`k-means++` or random initialisation technique for selecting the initial
177-
centroids.
178-
179-
At the end of the number of iterations specified (max_iters), convergence is
180-
achieved if difference between the current and last cost objective is
181-
less than the tolerance level (tol). An error is thrown if convergence fails.
194+
This main function employs the K-means algorithm to cluster all examples
195+
in the training data (design_matrix) into k groups using either the
196+
`k-means++` or random initialisation technique for selecting the initial
197+
centroids.
182198
183-
Details of operations can be either printed or not by setting verbose accordingly.
199+
At the end of the number of iterations specified (max_iters), convergence is
200+
achieved if difference between the current and last cost objective is
201+
less than the tolerance level (tol). An error is thrown if convergence fails.
184202
185-
A tuple representing labels, centroids, and sum_squares respectively is returned.
203+
Details of operations can be either printed or not by setting verbose accordingly.
186204
205+
A tuple representing labels, centroids, and sum_squares respectively is returned.
187206
"""
188-
function kmeans(design_matrix::Array{Float64, 2}, k::Int; k_init::String = "k-means++",
189-
max_iters::Int = 300, tol = 1e-4, verbose::Bool = true)
207+
function kmeans(design_matrix::Array{Float64, 2}, k::Int, mode::T = SingleThread();
208+
k_init::String = "k-means++", max_iters::Int = 300, tol = 1e-4,
209+
verbose::Bool = true) where {T <: CalculationMode}
190210

191-
centroids, n_row, n_col = smart_init(design_matrix, k, init=k_init)
211+
centroids, n_row, n_col = smart_init(design_matrix, k, mode, init=k_init)
192212

193213
labels = Vector{Int}(undef, n_row)
194214
distances = Vector{Float64}(undef, n_row)
@@ -199,7 +219,7 @@ function kmeans(design_matrix::Array{Float64, 2}, k::Int; k_init::String = "k-me
199219
nearest_neighbour = Array{Float64, 2}(undef, size(design_matrix, 1), size(centroids, 1))
200220
# Update centroids & labels with closest members until convergence
201221
for iter = 1:max_iters
202-
pairwise!(nearest_neighbour, design_matrix, centroids)
222+
pairwise!(nearest_neighbour, design_matrix, centroids, mode)
203223

204224
@inbounds for i in axes(nearest_neighbour, 1)
205225
labels[i] = 1
@@ -230,11 +250,11 @@ function kmeans(design_matrix::Array{Float64, 2}, k::Int; k_init::String = "k-me
230250

231251
if verbose
232252
# Show progress and terminate if J stopped decreasing.
233-
println("Iteration ", iter, ": Jclust = ", J, ".")
253+
println("Iteration $iter: Jclust = $J.")
234254
end
235255

236256
# Final Step: Check for convergence
237-
if iter > 1 && abs(J - J_previous) < (tol * J)
257+
if (iter > 1) & (abs(J - J_previous) < (tol * J))
238258

239259
sum_squares = sum_of_squares(design_matrix, labels, centroids)
240260

@@ -243,16 +263,15 @@ function kmeans(design_matrix::Array{Float64, 2}, k::Int; k_init::String = "k-me
243263
println("Successfully terminated with convergence.")
244264
end
245265

246-
return labels, centroids, sum_squares
266+
return (labels=labels, centroids=centroids, sum_squares=sum_squares)
247267

248-
elseif iter == max_iters && abs(J - J_previous) > (tol * J)
268+
elseif (iter == max_iters) & (abs(J - J_previous) > (tol * J))
249269
throw(error("Failed to converge. Check data and/or implementation or increase max_iter."))
250270

251271
end
252272

253273
J_previous = J
254274
end
255-
256275
end
257276

258277
end # module

test/test01_distance.jl

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
module TestDistance
2-
using ParallelKMeans: pairwise!, pl_pairwise!
2+
using ParallelKMeans: pairwise!, SingleThread, MultiThread
33
using Test
44

55
@testset "naive singlethread pairwise" begin
@@ -11,4 +11,14 @@ using Test
1111
@test all(r .≈ [0.0, 13.0, 25.0])
1212
end
1313

14+
@testset "multithread pairwise" begin
15+
X = [1.0 2.0; 3.0 5.0; 4.0 6.0]
16+
y = [1.0 2.0; ]
17+
r = Array{Float64, 2}(undef, 3, 1)
18+
19+
pairwise!(r, X, y, MultiThread())
20+
@test all(r .≈ [0.0, 13.0, 25.0])
21+
end
22+
23+
1424
end # module

test/test02_kmeans.jl

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
module TestKMeans
2+
23
using ParallelKMeans
4+
using ParallelKMeans: MultiThread
35
using Test
46
using Random
57

6-
@testset "linear separation" begin
8+
@testset "singlethread linear separation" begin
79
Random.seed!(2020)
810

911
X = rand(100, 3)
@@ -15,4 +17,17 @@ using Random
1517
@test sum_squares ≈ 15.314823028363763
1618
end
1719

20+
21+
@testset "multithread linear separation" begin
22+
Random.seed!(2020)
23+
24+
X = rand(100, 3)
25+
labels, centroids, sum_squares = kmeans(X, 3, MultiThread(); tol = 1e-10, verbose = false)
26+
27+
# for future reference: Clustering shows here 14.964882850452984
28+
# guess they use better initialisation. For now we will use own
29+
# value
30+
@test sum_squares ≈ 15.314823028363763
31+
end
32+
1833
end # module

0 commit comments

Comments
 (0)