Issue-17: Spearman's rank correlation & Kolmogorov–Smirnov test (#27)

estebanz01 · web-flow · commit 87a49b56381b · 2018-12-31T18:22:02.000-05:00
* spearman-rank-coefficient: Implement coefficient calcluations.

* spearman-rank-coefficient: Extend unit test cases.

* distribution: Implement empirical distribution.

This change implements the Empirical distribution that is used
by the KS-Test. It only implements the CDF, also known as ECDF.

* statistical-test: Implement the two-samples kolmogorov-smirnof test.

This change implements the ks-test for two groups of samples, using the
empirical distribution and calculates the D statistic, which is
accessible on the response of the class method.

* ks-test: Try to implementfunction that tries to find critical values for test.

* version: Codename Random 2.1.0.
diff --git a/lib/statistics/distribution/empirical.rb b/lib/statistics/distribution/empirical.rb
@@ -0,0 +1,26 @@
+module Statistics
+  module Distribution
+    class Empirical
+      attr_accessor :samples
+
+      def initialize(samples:)
+        self.samples = samples
+      end
+
+      # Formula grabbed from here: https://statlect.com/asymptotic-theory/empirical-distribution
+      def cumulative_function(x:)
+        cumulative_sum = samples.reduce(0) do |summation, sample|
+          summation += if sample <= x
+                         1
+                       else
+                         0
+                       end
+
+          summation
+        end
+
+        cumulative_sum / samples.size.to_f
+      end
+    end
+  end
+end
diff --git a/lib/statistics/distribution/weibull.rb b/lib/statistics/distribution/weibull.rb
@@ -45,7 +45,7 @@ def variance
       # Using the inverse CDF function, also called quantile, we can calculate
       # a random sample that follows a weibull distribution.
       #
-      # Formula extracted from http://www.stat.yale.edu/Courses/1997-98/101/chigf.htm
+      # Formula extracted from https://www.taygeta.com/random/weibull.html
       def random(elements: 1, seed: Random.new_seed)
         results = []
 
diff --git a/lib/statistics/spearman_rank_coefficient.rb b/lib/statistics/spearman_rank_coefficient.rb
@@ -0,0 +1,71 @@
+module Statistics
+  class SpearmanRankCoefficient
+    def self.rank(data:, return_ranks_only: true)
+      descending_order_data = data.sort { |a, b| b <=> a }
+      rankings = {}
+
+      data.each do |value|
+        # If we have ties, the find_index method will only retrieve the index of the
+        # first element in the list (i.e, the most close to the left of the array),
+        # so when a tie is detected, we increase the temporal ranking by the number of
+        # counted elements at that particular time and then we increase the counter.
+        temporal_ranking = descending_order_data.find_index(value) + 1 # 0-index
+
+        if rankings.fetch(value, false)
+          rankings[value][:rank] += (temporal_ranking + rankings[value][:counter])
+          rankings[value][:counter] += 1
+          rankings[value][:tie_rank] = rankings[value][:rank] / rankings[value][:counter].to_f
+        else
+          rankings[value] = { counter: 1, rank: temporal_ranking, tie_rank: temporal_ranking }
+        end
+      end
+
+      if return_ranks_only
+        data.map do |value|
+          rankings[value][:tie_rank]
+        end
+      else
+        rankings
+      end
+    end
+
+    # Formulas extracted from: https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide.php
+    def self.coefficient(set_one, set_two)
+      raise 'Both group sets must have the same number of cases.' if set_one.size != set_two.size
+      return if set_one.size == 0 && set_two.size == 0
+
+      set_one_mean, set_two_mean = set_one.mean, set_two.mean
+      have_tie_ranks = (set_one + set_two).any? { |rank| rank.is_a?(Float) }
+
+      if have_tie_ranks
+        numerator = 0
+        squared_differences_set_one = 0
+        squared_differences_set_two = 0
+
+        set_one.size.times do |idx|
+          local_diff_one = (set_one[idx] - set_one_mean)
+          local_diff_two = (set_two[idx] - set_two_mean)
+
+          squared_differences_set_one += local_diff_one ** 2
+          squared_differences_set_two += local_diff_two ** 2
+
+          numerator += local_diff_one * local_diff_two
+        end
+
+        denominator = Math.sqrt(squared_differences_set_one * squared_differences_set_two)
+
+        numerator / denominator.to_f # This is rho or spearman's coefficient.
+      else
+        sum_squared_differences = set_one.each_with_index.reduce(0) do |memo, (rank_one, index)|
+          memo += ((rank_one - set_two[index]) ** 2)
+          memo
+        end
+
+        numerator = 6 * sum_squared_differences
+        denominator = ((set_one.size ** 3) - set_one.size)
+
+        1.0 - (numerator / denominator.to_f) # This is rho or spearman's coefficient.
+      end
+    end
+  end
+end
diff --git a/lib/statistics/statistical_test/kolmogorov_smirnov_test.rb b/lib/statistics/statistical_test/kolmogorov_smirnov_test.rb
@@ -0,0 +1,70 @@
+module Statistics
+  module StatisticalTest
+    class KolmogorovSmirnovTest
+      # Common alpha, and critical D are calculated following formulas from: https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test#Two-sample_Kolmogorov%E2%80%93Smirnov_test
+      def self.two_samples(group_one:, group_two:, alpha: 0.05)
+        samples = group_one + group_two # We can use unbalaced group samples
+
+        ecdf_one = Distribution::Empirical.new(samples: group_one)
+        ecdf_two = Distribution::Empirical.new(samples: group_two)
+
+        d_max = samples.sort.map do |sample|
+          d1 = ecdf_one.cumulative_function(x: sample)
+          d2 = ecdf_two.cumulative_function(x: sample)
+
+          (d1 - d2).abs
+        end.max
+
+        # TODO: Validate calculation of Common alpha.
+        common_alpha = Math.sqrt((-0.5 * Math.log(alpha)))
+        radicand = (group_one.size + group_two.size) / (group_one.size * group_two.size).to_f
+
+        critical_d = common_alpha * Math.sqrt(radicand)
+        # critical_d = self.critical_d(alpha: alpha, n: samples.size)
+
+        # We are unable to calculate the p_value, because we don't have the Kolmogorov distribution
+        # defined. We reject the null hypotesis if Dmax is > than Dcritical.
+        { d_max: d_max,
+          d_critical: critical_d,
+          total_samples: samples.size,
+          alpha: alpha,
+          null: d_max <= critical_d,
+          alternative: d_max > critical_d,
+          confidence_level: 1.0 - alpha }
+      end
+
+      # This is an implementation of the formula presented by Paul Molin and Hervé Abdi in a paper,
+      # called "New Table and numerical approximations for Kolmogorov-Smirnov / Lilliefors / Van Soest
+      # normality test".
+      # In this paper, the authors defines a couple of 6th-degree polynomial functions that allow us
+      # to find an aproximation of the real critical value. This is based in the conclusions made by
+      # Dagnelie (1968), where indicates that critical values given by Lilliefors can be approximated
+      # numerically.
+      #
+      # In general, the formula found is:
+      #  C(N, alpha) ^ -2  = A(alpha) * N + B(alpha).
+      #
+      # Where A(alpha), B(alpha) are two 6th degree polynomial functions computed using the principle
+      # of Monte Carlo simulations.
+      #
+      # paper can be found here: https://utdallas.edu/~herve/MolinAbdi1998-LillieforsTechReport.pdf
+      # def self.critical_d(alpha:, n:)
+      #   confidence = 1.0 - alpha
+
+      #   a_alpha = 6.32207539843126 -17.1398870006148 * confidence +
+      #     38.42812675101057 * (confidence ** 2) - 45.93241384693391 * (confidence ** 3) +
+      #     7.88697700041829 * (confidence ** 4) + 29.79317711037858 * (confidence ** 5) -
+      #     18.48090137098585 * (confidence ** 6)
+
+      #   b_alpha = 12.940399038404 - 53.458334259532 * confidence +
+      #     186.923866119699 * (confidence ** 2) - 410.582178349305 * (confidence ** 3) +
+      #     517.377862566267 * (confidence ** 4) - 343.581476222384 * (confidence ** 5) +
+      #     92.123451358715 * (confidence ** 6)
+
+      #   Math.sqrt(1.0 / (a_alpha * n + b_alpha))
+      # end
+    end
+
+    KSTest = KolmogorovSmirnovTest # Alias
+  end
+end
diff --git a/lib/statistics/version.rb b/lib/statistics/version.rb
@@ -1,3 +1,3 @@
 module Statistics
-  VERSION = "2.0.5"
+  VERSION = "2.1.0"
 end
diff --git a/spec/statistics/distribution/empirical_spec.rb b/spec/statistics/distribution/empirical_spec.rb
@@ -0,0 +1,17 @@
+require 'spec_helper'
+
+describe Statistics::Distribution::Empirical do
+  describe '#cumulative_function' do
+    it 'calculates the CDF for the specified value using a group of samples' do
+      # Result in R
+      # > cdf <- ecdf(c(1,2,3,4,5,6,7,8,9,0))
+      # > cdf(7)
+      # [1] 0.8
+      samples = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
+      x = 7
+      x_prob = 0.8
+
+      expect(described_class.new(samples: samples).cumulative_function(x: x)).to eq x_prob
+    end
+  end
+end
diff --git a/spec/statistics/spearman_rank_coefficient_spec.rb b/spec/statistics/spearman_rank_coefficient_spec.rb
@@ -0,0 +1,154 @@
+require 'spec_helper'
+
+describe Statistics::SpearmanRankCoefficient do
+  describe '.rank' do
+    context 'when only ranks are needed' do
+      it 'returns an array of elements corresponding to the expected ranks wihout altering order' do
+        expected_ranks = [4, 1, 3, 2, 5]
+
+        result = described_class.rank(data: [10, 30, 12, 15, 3], return_ranks_only: true)
+
+        expect(result).to eq expected_ranks
+      end
+    end
+
+    context 'when ranks and passed elements are needed' do
+      it 'returns a hash composed by the elements and ranking information' do
+        expected_ranks = {
+          30 => { counter: 1, rank: 1, tie_rank: 1 },
+          15 => { counter: 1, rank: 2, tie_rank: 2 },
+          12 => { counter: 1, rank: 3, tie_rank: 3 },
+          10 => { counter: 1, rank: 4, tie_rank: 4 },
+          3 => { counter: 1, rank: 5, tie_rank: 5 }
+        }
+
+        result = described_class.rank(data: [10, 30, 12, 15, 3], return_ranks_only: false)
+
+        expect(result).to eq expected_ranks
+      end
+    end
+
+    context 'when there are ties' do
+      it 'returns a ranking list with solved ties when ranks only are needed' do
+        expected_ranking = [9, 3, 10, 4, 6.5, 5, 8, 1, 2, 6.5]
+        data = [56, 75, 45, 71, 61, 64, 58, 80, 76, 61]
+
+        result = described_class.rank(data: data, return_ranks_only: true)
+
+        expect(result).to eq expected_ranking
+      end
+
+      it 'returns a hash composed by the elements and some ranking information' do
+        expected_ranks = {
+          80 => { counter: 1, rank: 1, tie_rank: 1 },
+          76 => { counter: 1, rank: 2, tie_rank: 2 },
+          75 => { counter: 1, rank: 3, tie_rank: 3 },
+          71 => { counter: 1, rank: 4, tie_rank: 4 },
+          64 => { counter: 1, rank: 5, tie_rank: 5 },
+          61 => { counter: 2, rank: 13, tie_rank: 6.5 },
+          58 => { counter: 1, rank: 8, tie_rank: 8 },
+          56 => { counter: 1, rank: 9, tie_rank: 9 },
+          45 => { counter: 1, rank: 10, tie_rank: 10 }
+        }
+        data = [56, 75, 45, 71, 61, 64, 58, 80, 76, 61]
+
+        result = described_class.rank(data: data, return_ranks_only: false)
+
+        expect(result).to include(expected_ranks)
+      end
+
+      it 'returns a hash containing information about the existing ties' do
+        tie_rank = { 61 => { counter: 2, tie_rank: 6.5, rank: 13 } }
+        data = [56, 75, 45, 71, 61, 64, 58, 80, 76, 61]
+
+        result = described_class.rank(data: data, return_ranks_only: false)
+
+        expect(result).to include(tie_rank)
+      end
+    end
+  end
+
+  describe '.coefficient' do
+    it 'raises an error when the groups have different number of cases' do
+      expect do
+        described_class.coefficient([1, 2, 3], [1, 2, 3, 4])
+      end.to raise_error(StandardError, 'Both group sets must have the same number of cases.')
+    end
+
+    it 'returns nothing when both groups have a size of zero cases' do
+      expect(described_class.coefficient([], [])).to be_nil
+    end
+
+    context 'when there are ties in the data' do
+      it 'calculates the spearman rank coefficient for example one' do
+        # Example taken from http://www.biostathandbook.com/spearman.html
+        volume = [1760, 2040, 2440, 2550, 2730, 2740, 3010, 3080, 3370, 3740, 4910, 5090, 5090, 5380, 5850, 6730, 6990, 7960]
+        frequency = [529, 566, 473, 461, 465, 532, 484, 527, 488, 485, 478, 434, 468, 449, 425, 389, 421, 416]
+
+        volume_rank = described_class.rank(data: volume)
+        frequency_rank = described_class.rank(data: frequency)
+
+        rho = described_class.coefficient(volume_rank, frequency_rank)
+        expect(rho.round(3)).to eq -0.763
+      end
+
+      it 'calcultes the spearman rank coefficient for example two' do
+        # Example taken from https://geographyfieldwork.com/SpearmansRank.htm
+        # Results from R:
+        # cor(c(50, 175, 270, 375, 425, 580, 710, 790, 890, 980), c(1.80, 1.20, 2.0, 1.0, 1.0, 1.20, 0.80, 0.60, 1.0, 0.85), method = 'spearman')
+        # [1] -0.7570127
+        distance = [50, 175, 270, 375, 425, 580, 710, 790, 890, 980]
+        price = [1.80, 1.20, 2.0, 1.0, 1.0, 1.20, 0.80, 0.60, 1.0, 0.85]
+
+        distance_rank = described_class.rank(data: distance)
+        price_rank = described_class.rank(data: price)
+
+        rho = described_class.coefficient(distance_rank, price_rank)
+
+        expect(rho.round(7)).to eq -0.7570127
+      end
+
+      it 'calculates the spearman rank coefficient for example three' do
+        # Example taken from http://www.real-statistics.com/correlation/spearmans-rank-correlation/spearmans-rank-correlation-detailed/
+
+        life_exp = [80, 78, 60, 53, 85, 84, 73, 79, 81, 75, 68, 72, 58, 92, 65]
+        cigarretes = [5, 23, 25, 48, 17, 8, 4, 26, 11, 19, 14, 35, 29, 4, 23]
+
+        life_rank = described_class.rank(data: life_exp)
+        cigarretes_rank = described_class.rank(data: cigarretes)
+
+        rho = described_class.coefficient(life_rank, cigarretes_rank)
+
+        expect(rho.round(5)).to eq -0.67442
+      end
+    end
+
+    context 'when there are no ties in the data' do
+      it 'calculates the spearman rank coefficient for example one' do
+        # Example taken from here: https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide-2.php
+        english_data = [56, 75, 45, 71, 62, 64, 58, 80, 76, 61]
+        math_data = [66, 70, 40, 60, 65, 56, 59, 77, 67, 63]
+
+        english_rank = described_class.rank(data: english_data)
+        math_rank = described_class.rank(data: math_data)
+
+        rho = described_class.coefficient(english_rank, math_rank)
+
+        expect(rho.round(2)).to eq 0.67
+      end
+
+      it 'calculates the spearman rank coefficient for example two' do
+        # Example taken from here: https://www.statisticshowto.datasciencecentral.com/spearman-rank-correlation-definition-calculate/
+        physics = [35, 23, 47, 17, 10, 43, 9, 6, 28]
+        math = [30, 33, 45, 23, 8, 49, 12, 4, 31]
+
+        physics_rank = described_class.rank(data: physics)
+        math_rank = described_class.rank(data: math)
+
+        rho = described_class.coefficient(physics_rank, math_rank)
+
+        expect(rho).to eq 0.9
+      end
+    end
+  end
+end
diff --git a/spec/statistics/statistical_test/kolmogorov_smirnov_test_spec.rb b/spec/statistics/statistical_test/kolmogorov_smirnov_test_spec.rb

Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,7 @@ def variance`
`45`	`45`	`# Using the inverse CDF function, also called quantile, we can calculate`
`46`	`46`	`# a random sample that follows a weibull distribution.`
`47`	`47`	`#`
`48`		`- # Formula extracted from http://www.stat.yale.edu/Courses/1997-98/101/chigf.htm`
	`48`	`+ # Formula extracted from https://www.taygeta.com/random/weibull.html`
`49`	`49`	`def random(elements: 1, seed: Random.new_seed)`
`50`	`50`	`results = []`
`51`	`51`