MiniBatch algorithm draft

PyDataBlog · PyDataBlog · commit 121573cfe3dd · 2021-04-02T13:24:47.000+02:00
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "ParallelKMeans"
 uuid = "42b8e9d4-006b-409a-8472-7f34b3fb58af"
 authors = ["Bernard Brenyah", "Andrey Oskin"]
-version = "0.2.0"
+version = "0.2.1"
 
 [deps]
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -213,6 +213,8 @@ ________________________________________________________________________________
 - 0.1.7 Added `Yinyang` and `Coreset` support in MLJ interface; added `weights` support in MLJ; added RNG seed support in MLJ interface and through all algorithms; added metric support.
 - 0.1.8 Minor cleanup
 - 0.1.9 Added travis support for Julia 1.5
+- 0.2.0 Updated MLJ Interface
+- 0.2.1 Mini-batch implementation
 
 ## Contributing
 
diff --git a/src/ParallelKMeans.jl b/src/ParallelKMeans.jl
@@ -15,9 +15,10 @@ include("hamerly.jl")
 include("elkan.jl")
 include("yinyang.jl")
 include("coreset.jl")
+include("mini_batch.jl")
 include("mlj_interface.jl")
 
 export kmeans
-export Lloyd, Hamerly, Elkan, Yinyang, 阴阳, Coreset
+export Lloyd, Hamerly, Elkan, Yinyang, 阴阳, Coreset, MiniBatch
 
 end # module
diff --git a/src/kmeans.jl b/src/kmeans.jl
@@ -115,7 +115,7 @@ Allocationless calculation of square eucledean distance between vectors X1[:, i1
 @inline function distance(metric::Euclidean, X1, X2, i1, i2)
     # here goes my definition
     d = zero(eltype(X1))
-    # TODO: break of the loop if d is larger than threshold (known minimum disatnce)
+    # TODO: break out of the loop if d is larger than threshold (known minimum distance)
     @inbounds @simd for i in axes(X1, 1)
         d += (X1[i, i1] - X2[i, i2])^2
     end
diff --git a/src/mini_batch.jl b/src/mini_batch.jl
@@ -0,0 +1,111 @@
+"""
+    MiniBatch(b::Int = 100)
+
+Mini-batch k-means (Sculley, 2010) with batch size `b`; `b < 1` throws `ArgumentError`.
+"""
+struct MiniBatch <: AbstractKMeansAlg
+    b::Int  # batch size
+    MiniBatch(b) = b >= 1 ? new(b) : throw(ArgumentError("batch size must be positive, got $b"))
+end
+
+MiniBatch() = MiniBatch(100)
+
+"""
+    kmeans!(alg::MiniBatch, X, k; kwargs...)
+
+Mini-batch k-means (draft). Each iteration samples `alg.b` columns of `X`, labels them
+with their nearest centroid under `metric`, and moves those centroids towards them.
+
+# Keywords
+- `weights`: optional per-column weights, used for sampling and for the counts `N`.
+- `k_init`, `init`, `n_threads`, `rng`: seeding; a given `init` is copied and used as is.
+- `max_iters`: maximum number of mini-batch iterations.
+- `tol`, `max_no_improvement`: stop (converged) once the batch cost `J` moved by at most
+  `tol * J` in `max_no_improvement` consecutive iterations.
+- `verbose`: print the batch cost at every iteration.
+
+Returns `(centroids, niters, converged, labels, J)`; `labels`/`J` describe the last batch.
+"""
+function kmeans!(alg::MiniBatch, X, k;
+                 weights = nothing, metric = Euclidean(), n_threads = Threads.nthreads(),
+                 k_init = "k-means++", init = nothing, max_iters = 300,
+                 tol = 0, max_no_improvement = 10, verbose = false, rng = Random.GLOBAL_RNG)
+    T = eltype(X)
+    ncol = size(X, 2)
+
+    # Initiate cluster centers - (Step 2) in paper
+    centroids = isnothing(init) ? smart_init(X, k, n_threads, weights, rng, init = k_init).centroids : copy(init)
+
+    # Per-centre (weighted) sample counts - (Step 3) in paper
+    N = zeros(T, k)
+
+    # Nearest-centre label of every sample in the current batch
+    labels = Vector{Int}(undef, alg.b)
+
+    converged = false
+    niters = 0
+    counter = 0  # consecutive iterations without improvement; must survive across iterations
+    J_previous = zero(T)
+    J = zero(T)
+
+    while niters < max_iters  # `<`, not `<=`: run at most `max_iters` iterations
+        niters += 1
+
+        # b examples picked randomly from X (Step 5 in paper)
+        batch_rand_idx = isnothing(weights) ? rand(rng, 1:ncol, alg.b) : wsample(rng, 1:ncol, weights, alg.b)
+        batch_sample = X[:, batch_rand_idx]
+
+        # Cache/label the batch samples nearest to the centers (Step 6 & 7)
+        @inbounds for i in axes(batch_sample, 2)
+            min_dist = distance(metric, batch_sample, centroids, i, 1)
+            label = 1
+            for j in 2:size(centroids, 2)
+                dist = distance(metric, batch_sample, centroids, i, j)
+                if dist < min_dist
+                    min_dist, label = dist, j
+                end
+            end
+            labels[i] = label
+        end
+
+        # Batch gradient step (Steps 9-13 in paper)
+        for j in axes(batch_sample, 2)
+            label = labels[j]
+            # `j` indexes the batch, so map it back to a column of `X` to find its weight
+            N[label] += isnothing(weights) ? 1 : weights[batch_rand_idx[j]]
+            lr = 1 / N[label]  # per-center learning rate
+            @views centroids[:, label] .= (1 - lr) .* centroids[:, label] .+ lr .* batch_sample[:, j]
+        end
+
+        # Placeholder cost: squared Euclidean distance on the batch, whatever `metric` is
+        J = sum_of_squares(batch_sample, labels, centroids)
+        verbose && println("Iteration $niters: Jclust = $J")
+
+        # Early stopping: count consecutive iterations whose cost did not move beyond `tol`
+        if niters > 1 && abs(J - J_previous) <= tol * J
+            counter += 1
+            if counter >= max_no_improvement
+                converged = true
+                break
+            end
+        else
+            counter = 0
+        end
+
+        J_previous = J
+    end
+
+    return centroids, niters, converged, labels, J  # TODO: push learned artifacts to KmeansResult
+end
+
+# TODO: placeholder cost used only to exercise the draft implementation; remove afterwards.
+# Returns the sum over all entries of `(x[j, i] - centre[j, labels[i]])^2`, accumulated
+# in a `Float64`-initialised total; column `i` of `x` is compared with centre `labels[i]`.
+function sum_of_squares(x, labels, centre)
+    total = 0.0
+    for col in axes(x, 2), row in axes(x, 1)
+        delta = x[row, col] - centre[row, labels[col]]
+        total += delta^2
+    end
+    return total
+end
diff --git a/src/mlj_interface.jl b/src/mlj_interface.jl
@@ -15,7 +15,10 @@ const MLJDICT = Dict(:Lloyd => Lloyd(),
 ####
 #### MODEL DEFINITION
 ####
-
+"""
+    ParallelKMeans model constructed by the user.
+    See also the [package documentation](https://pydatablog.github.io/ParallelKMeans.jl/stable).
+"""
 mutable struct KMeans <: MMI.Unsupervised
     algo::Union{Symbol, AbstractKMeansAlg}
     k_init::String
@@ -80,7 +83,7 @@ end
 #### FIT FUNCTION
 ####
 """
-    Fit the specified ParaKMeans model constructed by the user.
+    Fit the specified ParallelKMeans model constructed by the user.
 
     See also the [package documentation](https://pydatablog.github.io/ParallelKMeans.jl/stable).
 """
@@ -187,21 +190,21 @@ end
 #### METADATA
 ####
 
-# TODO 4: metadata for the package and for each of the model interfaces
+# Metadata for the package and for each of the model interfaces
 MMI.metadata_pkg.(KMeans,
-    name = "ParallelKMeans",
-    uuid = "42b8e9d4-006b-409a-8472-7f34b3fb58af",
-    url  = "https://github.com/PyDataBlog/ParallelKMeans.jl",
-    julia = true,
-    license = "MIT",
-    is_wrapper = false)
+    name        = "ParallelKMeans",
+    uuid        = "42b8e9d4-006b-409a-8472-7f34b3fb58af",
+    url         = "https://github.com/PyDataBlog/ParallelKMeans.jl",
+    julia       = true,
+    license     = "MIT",
+    is_wrapper  = false)
 
 
 # Metadata for ParaKMeans model interface
 MMI.metadata_model(KMeans,
     input   = MMI.Table(MMI.Continuous),
     output  = MMI.Table(MMI.Continuous),
-    target =  AbstractArray{<:MMI.Multiclass},
+    target  =  AbstractArray{<:MMI.Multiclass},
     weights = false,
     descr   = ParallelKMeans_Desc,
 	path	= "ParallelKMeans.KMeans")
diff --git a/test/test90_minibatch.jl b/test/test90_minibatch.jl
@@ -0,0 +1,49 @@
+module TestMiniBatch
+
+using ParallelKMeans
+using Test
+using StableRNGs
+using StatsBase
+using Distances
+
+# Checks the `MiniBatch` constructor default and compares `MiniBatch` with the `Lloyd`
+# baseline on a 2 x 12 data set, with and without an explicit `Cityblock` metric.
+
+
+# The zero-argument constructor must equal an explicit batch size of 100.
+@testset "MiniBatch default batch size" begin
+    @test MiniBatch() == MiniBatch(100)
+end
+
+
+# Both algorithms start from identically seeded RNGs and must reach the same total cost.
+@testset "MiniBatch convergence" begin
+    points = [1 1 1 4 4 4 4 0 2 3 5 1; 2 4 0 2 0 4 5 1 2 2 5 -1.]
+    seed = 2020
+
+    # A freshly seeded generator per run, so neither call consumes the other's state.
+    lloyd_result = kmeans(Lloyd(), points, 2, rng = StableRNG(seed))
+    minibatch_result = kmeans(MiniBatch(6), points, 2, rng = StableRNG(seed))
+
+    @test lloyd_result.totalcost ≈ minibatch_result.totalcost
+end
+
+
+# Same comparison under the Cityblock metric, additionally checking the `converged` flag.
+@testset "MiniBatch metric support" begin
+    points = [1 1 1 4 4 4 4 0 2 3 5 1; 2 4 0 2 0 4 5 1 2 2 5 -1.]
+    seed = 2020
+    options = (tol = 1e-16, metric = Cityblock())
+
+    lloyd_result = kmeans(Lloyd(), points, 2;
+                          options..., rng = StableRNG(seed))
+    minibatch_result = kmeans(MiniBatch(6), points, 2;
+                              options..., rng = StableRNG(seed))
+
+    @test minibatch_result.totalcost ≈ lloyd_result.totalcost
+    @test minibatch_result.converged == lloyd_result.converged
+end
+
+
+
+end # module