link clean-up

PyDataBlog · PyDataBlog · commit 1dd5f1c4acaa · 2020-02-18T16:12:14.000+01:00
diff --git a/.gitignore b/.gitignore
@@ -8,3 +8,4 @@
 /docs/site/
 /benchmark/tune.json
 .benchmarkci/
+.idea/*
diff --git a/LICENSE b/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2020 Andrey Oskin
+Copyright (c) 2020 Bernard Brenyah & Andrey Oskin
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # ParallelKMeans
 
-[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://Arkoniak.github.io/ParallelKMeans.jl/stable)
-[![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://Arkoniak.github.io/ParallelKMeans.jl/dev)
-[![Build Status](https://travis-ci.com/Arkoniak/ParallelKMeans.jl.svg?branch=master)](https://travis-ci.com/Arkoniak/ParallelKMeans.jl)
-[![Coveralls](https://coveralls.io/repos/github/Arkoniak/ParallelKMeans.jl/badge.svg?branch=master)](https://coveralls.io/github/Arkoniak/ParallelKMeans.jl?branch=master)
+[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://PyDataBlog.github.io/ParallelKMeans.jl/stable)
+[![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://PyDataBlog.github.io/ParallelKMeans.jl/dev)
+[![Build Status](https://www.travis-ci.org/PyDataBlog/ParallelKMeans.jl.svg?branch=master)](https://www.travis-ci.org/PyDataBlog/ParallelKMeans.jl)
+[![Coveralls](https://coveralls.io/repos/github/PyDataBlog/ParallelKMeans.jl/badge.svg?branch=master)](https://coveralls.io/github/PyDataBlog/ParallelKMeans.jl?branch=master)
diff --git a/docs/make.jl b/docs/make.jl
@@ -6,12 +6,12 @@ makedocs(;
     pages=[
         "Home" => "index.md",
     ],
-    repo="https://github.com/Arkoniak/ParallelKMeans.jl/blob/{commit}{path}#L{line}",
+    repo="https://github.com/PyDataBlog/ParallelKMeans.jl/blob/{commit}{path}#L{line}",
     sitename="ParallelKMeans.jl",
-    authors="Andrey Oskin",
+    authors="Bernard Brenyah & Andrey Oskin",
     assets=String[],
 )
 
 deploydocs(;
-    repo="github.com/Arkoniak/ParallelKMeans.jl",
+    repo="github.com/PyDataBlog/ParallelKMeans.jl",
 )
diff --git a/src/ParallelKMeans.jl b/src/ParallelKMeans.jl
@@ -1,19 +1,23 @@
 module ParallelKMeans
-########################################
-# Based on discourse discussion
-# https://discourse.julialang.org/t/optimization-tips-for-my-julia-code-can-i-make-it-even-faster-and-or-memory-efficient/34614/20
+
 
 using StatsBase
 import Base.Threads: @spawn, @threads
 
 export kmeans
 
+"""
+TODO: Document function
+"""
 function divider(n, k)
     # Nominal chunk length; any remainder of n / k is absorbed by the last chunk.
     chunk = div(n, k)
     # Boundaries 0, chunk, 2chunk, ..., (k-1)chunk, closed off by n itself.
     bounds = vcat((0:k-1) * chunk, n)
     # Each chunk runs from one past a boundary up to the next boundary.
     return [(bounds[i] + 1):bounds[i + 1] for i in 1:(length(bounds) - 1)]
 end
 
+"""
+TODO: Document function
+"""
 function pl_pairwise!(target, x, y, nth = Threads.nthreads())
     ncol = size(x, 2)
     nrow = size(x, 1)
@@ -30,6 +34,9 @@ function pl_pairwise!(target, x, y, nth = Threads.nthreads())
     target
 end
 
+"""
+TODO: Document function
+"""
 function inner_pairwise!(target, x, y, r)
     ncol = size(x, 2)
     @inbounds for k in axes(y, 1)
@@ -46,6 +53,9 @@ function inner_pairwise!(target, x, y, r)
     target
 end
 
+"""
+TODO: Document function
+"""
 function pairwise!(target, x, y)
     ncol = size(x, 2)
     @inbounds for k in axes(y, 1)
@@ -62,11 +72,18 @@ function pairwise!(target, x, y)
     target
 end
 
+
 """
     smart_init(X, k; init="k-means++")
 
     This function handles the random initialisation of the centroids from the
     design matrix (X) and desired groups (k) that a user supplies.
+
+    The `k-means++` algorithm is used by default; plain random selection of
+    centroids from X is used if any other string is passed.
+
+    A tuple representing the centroids, number of rows, & columns respectively
+    is returned.
 """
 function smart_init(X::Array{Float64, 2}, k::Int; init::String="k-means++")
     n_row, n_col = size(X)
@@ -121,10 +138,14 @@ function smart_init(X::Array{Float64, 2}, k::Int; init::String="k-means++")
 end
 
 
+
 """
     sum_of_squares(x, labels, centre)
 
-    This function computes the total sum of squares
+    This function computes the total sum of squares based on the assigned labels
+    (labels), the design matrix (x), and the centroids (centre).
+
+    A Float64 value representing the computed metric is returned.
 """
 function sum_of_squares(x::Array{Float64,2}, labels::Array{Int64,1}, centre::Array)
     s = 0.0
@@ -138,42 +159,26 @@ function sum_of_squares(x::Array{Float64,2}, labels::Array{Int64,1}, centre::Arr
     return s
 end
 
-# function sum_of_squares(x::Array{Float64,2}, labels::Array{Int64,1}, centre::Array, nth = Base.Threads.nthreads())
-#     s = 0.0
-#
-#     @inbounds for j in axes(x, 2)
-#         for i in axes(x, 1)
-#             s += (x[i, j] - centre[labels[i], j])^2
-#         end
-#     end
-#
-#     return s
-# end
-#
-#
-# function inner_sum_of_squares(x::Array{Float64,2}, labels::Array{Int64,1}, centre::Array, r)
-#     s = 0.0
-#
-#     @inbounds for j in axes(x, 2)
-#         for i in r
-#             s += (x[i, j] - centre[labels[i], j])^2
-#         end
-#     end
-#
-#     return s
-# end
 
 """
     kmeans(design_matrix, k; k_init="k-means++", max_iters=300, tol=1e-4, verbose=true)
 
-This main function employs the K-means algorithm to cluster all examples
-in the training data (design_matrix) into k groups using either the
-`k-means++` or random initialisation.
+    This main function employs the K-means algorithm to cluster all examples
+    in the training data (design_matrix) into k groups using either the
+    `k-means++` or random initialisation technique for selecting the initial
+    centroids.
+
+    At the end of the number of iterations specified (max_iters), convergence is
+    achieved if the difference between the current and last cost objective is
+    less than the tolerance level (tol). An error is thrown if convergence fails.
+
+    Details of operations can be either printed or not by setting verbose accordingly.
+
+    A tuple representing labels, centroids, and sum_squares respectively is returned.
 
-design_matrix should have the form (number of points x point dimensionality).
 """
 function kmeans(design_matrix::Array{Float64, 2}, k::Int; k_init::String = "k-means++",
-    max_iters::Int = 300, tol = 1e-4, verbose::Bool = true)
+    max_iters::Int = 300, tol::Float64 = 1e-4, verbose::Bool = true)
 
     centroids, n_row, n_col = smart_init(design_matrix, k, init=k_init)
 

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-Copyright (c) 2020 Andrey Oskin`
	`1`	`+Copyright (c) 2020 Bernard Brenyah & Andrey Oskin`
`2`	`2`
`3`	`3`	`Permission is hereby granted, free of charge, to any person obtaining a copy`
`4`	`4`	`of this software and associated documentation files (the "Software"), to deal`