Skip to content

Commit 87a49b5

Browse files
authored
Issue-17: Spearman's rank correlation & Kolmogorov–Smirnov test (#27)
* spearman-rank-coefficient: Implement coefficient calculations. * spearman-rank-coefficient: Extend unit test cases. * distribution: Implement empirical distribution. This change implements the Empirical distribution that is used by the KS-Test. It only implements the CDF, also known as ECDF. * statistical-test: Implement the two-sample Kolmogorov-Smirnov test. This change implements the KS-test for two groups of samples, using the empirical distribution, and calculates the D statistic, which is accessible on the response of the class method. * ks-test: Try to implement a function that finds critical values for the test. * version: Codename Random 2.1.0.
1 parent 5fcdee0 commit 87a49b5

8 files changed

Lines changed: 410 additions & 2 deletions

File tree

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
module Statistics
  module Distribution
    # Empirical distribution built from a collection of observed samples.
    # Only the cumulative distribution function (ECDF) is provided.
    class Empirical
      attr_accessor :samples

      # samples: - the observations the distribution is evaluated against.
      def initialize(samples:)
        self.samples = samples
      end

      # Evaluates the empirical CDF at +x+: the fraction of stored samples
      # that are less than or equal to +x+, returned as a Float.
      #
      # Formula grabbed from here: https://statlect.com/asymptotic-theory/empirical-distribution
      def cumulative_function(x:)
        at_or_below = samples.count { |sample| sample <= x }

        at_or_below / samples.size.to_f
      end
    end
  end
end

lib/statistics/distribution/weibull.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def variance
4545
# Using the inverse CDF function, also called quantile, we can calculate
4646
# a random sample that follows a weibull distribution.
4747
#
48-
# Formula extracted from http://www.stat.yale.edu/Courses/1997-98/101/chigf.htm
48+
# Formula extracted from https://www.taygeta.com/random/weibull.html
4949
def random(elements: 1, seed: Random.new_seed)
5050
results = []
5151

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
module Statistics
  # Helpers to rank observations and to compute Spearman's rank correlation
  # coefficient (rho) from two equally sized sets of ranks.
  class SpearmanRankCoefficient
    # Ranks +data+ in descending order: the largest value receives rank 1.
    #
    # Tied values share the average of the positions they occupy, returned as
    # a Float (e.g. two values on positions 6 and 7 both get 6.5). Untied
    # values keep their Integer position. Values are matched as Hash keys, so
    # 1 and 1.0 are treated as different values.
    #
    # data:              - list of mutually comparable values.
    # return_ranks_only: - when true (default) returns the ranks in the same
    #                      order as +data+; otherwise returns a Hash keyed by
    #                      value (in order of first appearance) holding
    #                      :counter (occurrences), :rank (sum of the occupied
    #                      positions) and :tie_rank (the rank to use).
    def self.rank(data:, return_ranks_only: true)
      # 1-based position of the first occurrence of every distinct value in
      # the descending ordering. Built once so each element is resolved with a
      # hash lookup instead of a linear scan of the sorted list.
      first_positions = {}
      data.sort { |a, b| b <=> a }.each_with_index do |value, index|
        first_positions[value] ||= index + 1
      end

      rankings = {}

      data.each do |value|
        entry = rankings[value]

        if entry
          # A repeated value occupies the next position after the ones already
          # counted: first position + number of occurrences seen so far.
          entry[:rank] += first_positions[value] + entry[:counter]
          entry[:counter] += 1
          entry[:tie_rank] = entry[:rank] / entry[:counter].to_f
        else
          position = first_positions[value]
          rankings[value] = { counter: 1, rank: position, tie_rank: position }
        end
      end

      return rankings unless return_ranks_only

      data.map { |value| rankings[value][:tie_rank] }
    end

    # Computes Spearman's rho for two sets of ranks.
    #
    # When no ties are present the shortcut 1 - 6 * sum(d^2) / (n^3 - n) is
    # used. When ties are present (any Float rank, or any rank repeated inside
    # a set) that shortcut is not valid, so the product-moment formula is
    # applied to the ranks instead. The tie branch relies on the rank sets
    # responding to +mean+, which is not defined in this file.
    #
    # Formulas extracted from: https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide.php
    #
    # set_one, set_two - lists of ranks, e.g. the output of .rank.
    #
    # Returns rho as a Float, or nil when both sets are empty.
    # Raises RuntimeError when the sets differ in size.
    def self.coefficient(set_one, set_two)
      raise 'Both group sets must have the same number of cases.' if set_one.size != set_two.size
      return if set_one.empty? # Sizes are equal at this point, so both are empty.

      have_tie_ranks = [set_one, set_two].any? do |ranks|
        ranks.uniq.size != ranks.size || ranks.any? { |rank| rank.is_a?(Float) }
      end

      if have_tie_ranks
        set_one_mean = set_one.mean
        set_two_mean = set_two.mean

        numerator = 0
        squared_differences_set_one = 0
        squared_differences_set_two = 0

        set_one.zip(set_two) do |rank_one, rank_two|
          local_diff_one = rank_one - set_one_mean
          local_diff_two = rank_two - set_two_mean

          squared_differences_set_one += local_diff_one**2
          squared_differences_set_two += local_diff_two**2

          numerator += local_diff_one * local_diff_two
        end

        denominator = Math.sqrt(squared_differences_set_one * squared_differences_set_two)

        numerator / denominator # This is rho or spearman's coefficient.
      else
        sum_squared_differences = set_one.zip(set_two).reduce(0) do |memo, (rank_one, rank_two)|
          memo + (rank_one - rank_two)**2
        end

        numerator = 6 * sum_squared_differences
        denominator = (set_one.size**3) - set_one.size

        1.0 - (numerator / denominator.to_f) # This is rho or spearman's coefficient.
      end
    end
  end
end
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
module Statistics
  module StatisticalTest
    # Two-sample Kolmogorov-Smirnov test based on the empirical CDF of each group.
    class KolmogorovSmirnovTest
      # Runs the two-sided, two-sample KS test.
      #
      # The statistic D is the largest absolute difference between the two
      # empirical CDFs, evaluated at every observed sample. The critical value
      # is c(alpha) * sqrt((n + m) / (n * m)) with
      # c(alpha) = sqrt(-0.5 * ln(alpha / 2)), following:
      # https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test#Two-sample_Kolmogorov%E2%80%93Smirnov_test
      #
      # group_one:, group_two: - lists of samples; sizes may differ.
      # alpha:                 - significance level (default 0.05).
      #
      # Returns a Hash with :d_max, :d_critical, :total_samples, :alpha,
      # :null (true when the null hypothesis is kept), :alternative (true when
      # it is rejected) and :confidence_level.
      # Raises ArgumentError when either group is empty.
      def self.two_samples(group_one:, group_two:, alpha: 0.05)
        if group_one.empty? || group_two.empty?
          raise ArgumentError, 'Both groups must contain at least one sample.'
        end

        samples = group_one + group_two # We can use unbalanced group samples

        ecdf_one = Distribution::Empirical.new(samples: group_one)
        ecdf_two = Distribution::Empirical.new(samples: group_two)

        # The maximum does not depend on evaluation order, so no sorting is needed.
        d_max = samples.map do |sample|
          d1 = ecdf_one.cumulative_function(x: sample)
          d2 = ecdf_two.cumulative_function(x: sample)

          (d1 - d2).abs
        end.max

        # D is two-sided (absolute differences), so alpha is halved:
        # c(0.05) ~ 1.358, c(0.10) ~ 1.224.
        common_alpha = Math.sqrt(-0.5 * Math.log(alpha / 2.0))
        radicand = (group_one.size + group_two.size) / (group_one.size * group_two.size).to_f

        critical_d = common_alpha * Math.sqrt(radicand)
        # critical_d = self.critical_d(alpha: alpha, n: samples.size)

        # We are unable to calculate the p_value, because we don't have the Kolmogorov distribution
        # defined. We reject the null hypothesis if Dmax is > than Dcritical.
        { d_max: d_max,
          d_critical: critical_d,
          total_samples: samples.size,
          alpha: alpha,
          null: d_max <= critical_d,
          alternative: d_max > critical_d,
          confidence_level: 1.0 - alpha }
      end

      # This is an implementation of the formula presented by Paul Molin and Hervé Abdi in a paper,
      # called "New Table and numerical approximations for Kolmogorov-Smirnov / Lilliefors / Van Soest
      # normality test".
      # In this paper, the authors define a couple of 6th-degree polynomial functions that allow us
      # to find an approximation of the real critical value. This is based on the conclusions made by
      # Dagnelie (1968), who indicates that critical values given by Lilliefors can be approximated
      # numerically.
      #
      # In general, the formula found is:
      # C(N, alpha) ^ -2 = A(alpha) * N + B(alpha).
      #
      # Where A(alpha), B(alpha) are two 6th degree polynomial functions computed using the principle
      # of Monte Carlo simulations.
      #
      # paper can be found here: https://utdallas.edu/~herve/MolinAbdi1998-LillieforsTechReport.pdf
      # def self.critical_d(alpha:, n:)
      #   confidence = 1.0 - alpha

      #   a_alpha = 6.32207539843126 -17.1398870006148 * confidence +
      #     38.42812675101057 * (confidence ** 2) - 45.93241384693391 * (confidence ** 3) +
      #     7.88697700041829 * (confidence ** 4) + 29.79317711037858 * (confidence ** 5) -
      #     18.48090137098585 * (confidence ** 6)

      #   b_alpha = 12.940399038404 - 53.458334259532 * confidence +
      #     186.923866119699 * (confidence ** 2) - 410.582178349305 * (confidence ** 3) +
      #     517.377862566267 * (confidence ** 4) - 343.581476222384 * (confidence ** 5) +
      #     92.123451358715 * (confidence ** 6)

      #   Math.sqrt(1.0 / (a_alpha * n + b_alpha))
      # end
    end

    KSTest = KolmogorovSmirnovTest # Alias
  end
end

lib/statistics/version.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
module Statistics
2-
VERSION = "2.0.5"
2+
VERSION = "2.1.0"
33
end
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
require 'spec_helper'

describe Statistics::Distribution::Empirical do
  describe '#cumulative_function' do
    # Reference value computed in R:
    #   > cdf <- ecdf(c(1,2,3,4,5,6,7,8,9,0))
    #   > cdf(7)
    #   [1] 0.8
    let(:samples) { [1, 2, 3, 4, 5, 6, 7, 8, 9, 0] }

    it 'calculates the CDF for the specified value using a group of samples' do
      distribution = described_class.new(samples: samples)

      expect(distribution.cumulative_function(x: 7)).to eq 0.8
    end
  end
end
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
require 'spec_helper'

# Specs for the ranking helper and the rho calculation. Expected values come
# from the references quoted next to each example.
describe Statistics::SpearmanRankCoefficient do
  describe '.rank' do
    context 'when only ranks are needed' do
      it 'returns an array of elements corresponding to the expected ranks wihout altering order' do
        expected_ranks = [4, 1, 3, 2, 5]

        result = described_class.rank(data: [10, 30, 12, 15, 3], return_ranks_only: true)

        expect(result).to eq(expected_ranks)
      end
    end

    context 'when ranks and passed elements are needed' do
      it 'returns a hash composed by the elements and ranking information' do
        expected_ranks = {
          30 => { counter: 1, rank: 1, tie_rank: 1 },
          15 => { counter: 1, rank: 2, tie_rank: 2 },
          12 => { counter: 1, rank: 3, tie_rank: 3 },
          10 => { counter: 1, rank: 4, tie_rank: 4 },
          3 => { counter: 1, rank: 5, tie_rank: 5 }
        }

        result = described_class.rank(data: [10, 30, 12, 15, 3], return_ranks_only: false)

        expect(result).to eq(expected_ranks)
      end
    end

    context 'when there are ties' do
      # The value 61 appears twice and occupies positions 6 and 7.
      let(:data) { [56, 75, 45, 71, 61, 64, 58, 80, 76, 61] }

      it 'returns a ranking list with solved ties when ranks only are needed' do
        expected_ranking = [9, 3, 10, 4, 6.5, 5, 8, 1, 2, 6.5]

        result = described_class.rank(data: data, return_ranks_only: true)

        expect(result).to eq(expected_ranking)
      end

      it 'returns a hash composed by the elements and some ranking information' do
        expected_ranks = {
          80 => { counter: 1, rank: 1, tie_rank: 1 },
          76 => { counter: 1, rank: 2, tie_rank: 2 },
          75 => { counter: 1, rank: 3, tie_rank: 3 },
          71 => { counter: 1, rank: 4, tie_rank: 4 },
          64 => { counter: 1, rank: 5, tie_rank: 5 },
          61 => { counter: 2, rank: 13, tie_rank: 6.5 },
          58 => { counter: 1, rank: 8, tie_rank: 8 },
          56 => { counter: 1, rank: 9, tie_rank: 9 },
          45 => { counter: 1, rank: 10, tie_rank: 10 }
        }

        result = described_class.rank(data: data, return_ranks_only: false)

        expect(result).to include(expected_ranks)
      end

      it 'returns a hash containing information about the existing ties' do
        tie_rank = { 61 => { counter: 2, tie_rank: 6.5, rank: 13 } }

        result = described_class.rank(data: data, return_ranks_only: false)

        expect(result).to include(tie_rank)
      end
    end
  end

  describe '.coefficient' do
    it 'raises an error when the groups have different number of cases' do
      expect do
        described_class.coefficient([1, 2, 3], [1, 2, 3, 4])
      end.to raise_error(StandardError, 'Both group sets must have the same number of cases.')
    end

    it 'returns nothing when both groups have a size of zero cases' do
      expect(described_class.coefficient([], [])).to be_nil
    end

    # Negative expectations are parenthesised so Ruby does not warn about an
    # ambiguous first argument (`eq -0.763`).
    context 'when there are ties in the data' do
      it 'calculates the spearman rank coefficient for example one' do
        # Example taken from http://www.biostathandbook.com/spearman.html
        volume = [1760, 2040, 2440, 2550, 2730, 2740, 3010, 3080, 3370, 3740, 4910, 5090, 5090, 5380, 5850, 6730, 6990, 7960]
        frequency = [529, 566, 473, 461, 465, 532, 484, 527, 488, 485, 478, 434, 468, 449, 425, 389, 421, 416]

        volume_rank = described_class.rank(data: volume)
        frequency_rank = described_class.rank(data: frequency)

        rho = described_class.coefficient(volume_rank, frequency_rank)

        expect(rho.round(3)).to eq(-0.763)
      end

      it 'calcultes the spearman rank coefficient for example two' do
        # Example taken from https://geographyfieldwork.com/SpearmansRank.htm
        # Results from R:
        # cor(c(50, 175, 270, 375, 425, 580, 710, 790, 890, 980), c(1.80, 1.20, 2.0, 1.0, 1.0, 1.20, 0.80, 0.60, 1.0, 0.85), method = 'spearman')
        # [1] -0.7570127
        distance = [50, 175, 270, 375, 425, 580, 710, 790, 890, 980]
        price = [1.80, 1.20, 2.0, 1.0, 1.0, 1.20, 0.80, 0.60, 1.0, 0.85]

        distance_rank = described_class.rank(data: distance)
        price_rank = described_class.rank(data: price)

        rho = described_class.coefficient(distance_rank, price_rank)

        expect(rho.round(7)).to eq(-0.7570127)
      end

      it 'calculates the spearman rank coefficient for example three' do
        # Example taken from http://www.real-statistics.com/correlation/spearmans-rank-correlation/spearmans-rank-correlation-detailed/
        life_exp = [80, 78, 60, 53, 85, 84, 73, 79, 81, 75, 68, 72, 58, 92, 65]
        cigarettes = [5, 23, 25, 48, 17, 8, 4, 26, 11, 19, 14, 35, 29, 4, 23]

        life_rank = described_class.rank(data: life_exp)
        cigarettes_rank = described_class.rank(data: cigarettes)

        rho = described_class.coefficient(life_rank, cigarettes_rank)

        expect(rho.round(5)).to eq(-0.67442)
      end
    end

    context 'when there are no ties in the data' do
      it 'calculates the spearman rank coefficient for example one' do
        # Example taken from here: https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide-2.php
        english_data = [56, 75, 45, 71, 62, 64, 58, 80, 76, 61]
        math_data = [66, 70, 40, 60, 65, 56, 59, 77, 67, 63]

        english_rank = described_class.rank(data: english_data)
        math_rank = described_class.rank(data: math_data)

        rho = described_class.coefficient(english_rank, math_rank)

        expect(rho.round(2)).to eq(0.67)
      end

      it 'calculates the spearman rank coefficient for example two' do
        # Example taken from here: https://www.statisticshowto.datasciencecentral.com/spearman-rank-correlation-definition-calculate/
        physics = [35, 23, 47, 17, 10, 43, 9, 6, 28]
        math = [30, 33, 45, 23, 8, 49, 12, 4, 31]

        physics_rank = described_class.rank(data: physics)
        math_rank = described_class.rank(data: math)

        rho = described_class.coefficient(physics_rank, math_rank)

        # Computed floats are compared with a tolerance rather than exactly.
        expect(rho).to be_within(1e-9).of(0.9)
      end
    end
  end
end

0 commit comments

Comments
 (0)