additional environment setup

Andrey Oskin · Andrey Oskin · commit 87d75b805832 · 2020-02-17T21:24:50.000+02:00
diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
@@ -0,0 +1,29 @@
+name: CompatHelper
+# Runs CompatHelper once a day and whenever the trigger branch below is pushed.
+on:
+  schedule:
+    - cron: '00 00 * * *'  # every day at 00:00 (GitHub evaluates schedules in UTC)
+  push:
+    branches:
+      - actions/trigger/CompatHelper  # pushing to this branch starts a run manually
+
+jobs:
+  CompatHelper:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: julia-actions/setup-julia@latest
+        with:
+          version: '1.3'  # quoted so YAML keeps it a string instead of parsing a float
+      - name: Pkg.add("CompatHelper")
+        run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
+      - name: CompatHelper.main()
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: >-
+          julia -e '
+          using CompatHelper;
+          CompatHelper.main() do;
+              run(`julia --project=test/environments/main -e "import Pkg; Pkg.instantiate(); Pkg.update()"`);
+              run(`julia --project=docs -e "import Pkg; Pkg.instantiate(); Pkg.update()"`);
+          end
+          '
diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml
@@ -0,0 +1,11 @@
+name: TagBot
+on:
+  schedule:
+    - cron: 0 * * * *  # at minute 0 of every hour
+jobs:
+  TagBot:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: JuliaRegistries/TagBot@v1
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}  # passed to the action as its `token` input
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
@@ -0,0 +1,21 @@
+name: Run benchmarks
+# Runs the benchmark suite on every pull request and posts the judgement back.
+on:
+  pull_request:
+
+jobs:
+  Benchmark:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: julia-actions/setup-julia@latest
+        with:
+          version: '1.3'  # quoted so YAML keeps it a string instead of parsing a float
+      - name: Install dependencies
+        run: julia -e 'using Pkg; pkg"add PkgBenchmark BenchmarkCI@0.1"'
+      - name: Run benchmarks
+        run: julia -e 'using PkgBenchmark, BenchmarkCI; BenchmarkCI.judge();'
+      - name: Post results
+        run: julia -e "using BenchmarkCI; BenchmarkCI.postjudge()"
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}  # only this step receives the token
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,5 @@
 /dev/
 /docs/build/
 /docs/site/
+/benchmark/tune.json
+.benchmarkci/
diff --git a/Project.toml b/Project.toml
@@ -1,15 +1,17 @@
 name = "ParallelKMeans"
 uuid = "42b8e9d4-006b-409a-8472-7f34b3fb58af"
-authors = ["Andrey Oskin"]
+authors = ["Bernard Brenyah", "Andrey Oskin"]
 version = "0.1.0"
 
 [deps]
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
 julia = "1.3"
 
 [extras]
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test"]
+test = ["Test", "Random"]
diff --git a/benchmark/bench01_distance.jl b/benchmark/bench01_distance.jl
@@ -0,0 +1,26 @@
+module BenchDistance
+using BenchmarkTools
+using ParallelKMeans
+using Distances
+using Random
+# Benchmark group filled in below; the last line of this file evaluates to it.
+suite = BenchmarkGroup()
+# Seed the global RNG once so all random inputs below are reproducible between runs.
+Random.seed!(2020)
+X = rand(100_000, 3)      # 100_000 x 3 input matrix
+centroids = rand(2, 3)    # 2 x 3 matrix, same column count as X
+d = rand(100_000, 2)      # 100_000 x 2 buffer handed to pairwise! as its first argument
+suite["100kx3"] = @benchmarkable ParallelKMeans.pairwise!($d, $X, $centroids)
+# Same benchmark with 10 columns; `$` interpolation above already captured the old values.
+X = rand(100_000, 10)
+centroids = rand(2, 10)
+d = rand(100_000, 2)
+suite["100kx10"] = @benchmarkable ParallelKMeans.pairwise!($d, $X, $centroids)
+
+# Reference point: Distances.pairwise! with SqEuclidean on the same 100kx10 inputs.
+metric = SqEuclidean()
+suite["100kx10_distances"] = @benchmarkable Distances.pairwise!($d, $metric, $X, $centroids, dims = 1)
+
+end # module
+
+BenchDistance.suite
diff --git a/src/ParallelKMeans.jl b/src/ParallelKMeans.jl
@@ -3,6 +3,7 @@ module ParallelKMeans
 # Based on discourse discussion
 # https://discourse.julialang.org/t/optimization-tips-for-my-julia-code-can-i-make-it-even-faster-and-or-memory-efficient/34614/20
 
+using StatsBase
 import Base.Threads: @spawn, @threads
 
 export kmeans
@@ -137,30 +138,30 @@ function sum_of_squares(x::Array{Float64,2}, labels::Array{Int64,1}, centre::Arr
     return s
 end
 
-function sum_of_squares(x::Array{Float64,2}, labels::Array{Int64,1}, centre::Array, nth = Base.Threads.nthreads())
-    s = 0.0
-
-    @inbounds for j in axes(x, 2)
-        for i in axes(x, 1)
-            s += (x[i, j] - centre[labels[i], j])^2
-        end
-    end
-
-    return s
-end
-
-
-function inner_sum_of_squares(x::Array{Float64,2}, labels::Array{Int64,1}, centre::Array, r)
-    s = 0.0
-
-    @inbounds for j in axes(x, 2)
-        for i in r
-            s += (x[i, j] - centre[labels[i], j])^2
-        end
-    end
-
-    return s
-end
+# function sum_of_squares(x::Array{Float64,2}, labels::Array{Int64,1}, centre::Array, nth = Base.Threads.nthreads())
+#     s = 0.0
+#
+#     @inbounds for j in axes(x, 2)
+#         for i in axes(x, 1)
+#             s += (x[i, j] - centre[labels[i], j])^2
+#         end
+#     end
+#
+#     return s
+# end
+#
+#
+# function inner_sum_of_squares(x::Array{Float64,2}, labels::Array{Int64,1}, centre::Array, r)
+#     s = 0.0
+#
+#     @inbounds for j in axes(x, 2)
+#         for i in r
+#             s += (x[i, j] - centre[labels[i], j])^2
+#         end
+#     end
+#
+#     return s
+# end
 
 """
     Kmeans(design_matrix, k; k_init="k-means++", max_iters=300, tol=1e-4, verbose=true)
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,6 +1,16 @@
-using ParallelKMeans
+module TestParallelKMeans
 using Test
 
-@testset "ParallelKMeans.jl" begin
-    # Write your own tests here.
+for file in sort([file for file in readdir(@__DIR__) if
+                                   occursin(r"^test[_0-9]+.*\.jl$", file)])
+    # Testset name = file name minus its `test<digits/underscores>` prefix and `.jl` suffix.
+    m = match(r"^test[_0-9]+(.*)\.jl$", file)  # anchored, literal dot: agrees with the filter
+    @testset "$(m[1])" begin
+        # Individual test files can be excluded here before they are included, e.g.
+        # VERSION < v"1.1" && file == "test_xxx.jl" && continue
+
+        include(file)
+    end
 end
+
+end  # module
diff --git a/test/test01_distance.jl b/test/test01_distance.jl
@@ -0,0 +1,14 @@
+module TestDistance
+using ParallelKMeans: pairwise!, pl_pairwise!
+using Test
+
+@testset "naive singlethread pairwise" begin
+    # Three 2-column rows against one centre row; expected results are 0, 13 and 25.
+    points = [1.0 2.0; 3.0 5.0; 4.0 6.0]
+    centre = [1.0 2.0]
+    dist = Matrix{Float64}(undef, 3, 1)
+    pairwise!(dist, points, centre)
+    @test all(dist .≈ [0.0, 13.0, 25.0])
+end
+
+end # module
diff --git a/test/test02_kmeans.jl b/test/test02_kmeans.jl
@@ -0,0 +1,18 @@
+module TestKMeans
+using ParallelKMeans
+using Test
+using Random
+
+@testset "linear separation" begin
+    # Fixed seed: the pinned value below is only valid for this exact random input.
+    Random.seed!(2020)
+    data = rand(100, 3)
+    _, _, total_cost = kmeans(data, 3; tol = 1e-10, verbose = false)
+
+    # For future reference: Clustering reports 14.964882850452984 on this input,
+    # presumably thanks to a better initialisation. Until that is matched, the
+    # value produced by this package is pinned here.
+    @test total_cost ≈ 15.314823028363763
+end
+
+end # module