Fix Hodges-Lehmann distribution ratio calculation

mgrzaslewicz · mgrzaslewicz · commit fee23ec06988 · 2024-07-22T16:31:55.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -26,6 +26,16 @@ Dropping a requirement of a major version of a dependency is a new contract.
 ## [Unreleased]
 [Unreleased]: https://github.com/atlassian/report/compare/release-4.5.0...master
 
+### Added
+- Add `DistributionComparator` which uses a proper Hodges-Lehmann implementation using pseudo-median instead of median
+
+### Fixed
+- Fix distribution ratio calculation in `RelativeNonparametricPerformanceJudge` by using `DistributionComparator`
+
+### Deprecated
+- Deprecate `ShiftedDistributionRegressionTest` in favor of `DistributionComparator`
+
+
 ## [4.5.0] - 2024-07-01
 [4.5.0]: https://github.com/atlassian/report/compare/release-4.4.0...release-4.5.0
 
diff --git a/src/main/kotlin/com/atlassian/performance/tools/report/api/ShiftedDistributionRegressionTest.kt b/src/main/kotlin/com/atlassian/performance/tools/report/api/ShiftedDistributionRegressionTest.kt
@@ -13,7 +13,9 @@ import org.apache.commons.math3.stat.descriptive.rank.Median
  * @experiment experiment durations
  * @param mwAlpha Mann-Whitney significance level
  * @param ksAlpha Kolmogorov-Smirnov significance level
+ * @deprecated Use [DistributionComparator] instead
  */
+@Deprecated("Use DistributionComparator instead")
 class ShiftedDistributionRegressionTest(
     private val baseline: DoubleArray,
     private val experiment: DoubleArray,
@@ -57,7 +59,12 @@ class ShiftedDistributionRegressionTest(
     }
 
     internal fun overcomesTolerance(tolerance: Double): Boolean {
-        return isExperimentRegressed(tolerance) || isExperimentImproved(tolerance)
+        val isExperimentRegressed = isExperimentRegressed(tolerance)
+        val isExperimentImproved  = isExperimentImproved(tolerance)
+        if (isExperimentImproved && isExperimentRegressed) {
+            throw IllegalArgumentException("Experiment can't be both regressed and improved at the same time")
+        }
+        return isExperimentRegressed || isExperimentImproved
     }
 
     /**
diff --git a/src/main/kotlin/com/atlassian/performance/tools/report/api/distribution/DistributionComparator.kt b/src/main/kotlin/com/atlassian/performance/tools/report/api/distribution/DistributionComparator.kt
@@ -0,0 +1,113 @@
+package com.atlassian.performance.tools.report.api.distribution
+
+import com.numericalmethod.suanshu.stats.test.rank.wilcoxon.WilcoxonRankSum
+import org.apache.commons.math3.stat.descriptive.rank.Median
+import org.apache.commons.math3.stat.ranking.NaNStrategy
+
+class DistributionComparator private constructor(
+    private val baseline: DoubleArray,
+    private val experiment: DoubleArray,
+    /**
+     * A ratio of the baseline pseudo-median (e.g. 0.01 for 1%) by which experiment can be slower/faster than baseline without being considered a regression/improvement
+     */
+    private val tolerance: Double,
+    private val significance: Double
+) {
+
+
+
+    /**
+     * Performs a one-tailed Mann–Whitney U test to check whether experiment is not slower than the baseline
+     *
+     * @return true if the experiment is slower than the baseline by more than tolerance, false otherwise
+     */
+    private fun isExperimentRegressed(baselineMedian: Double): Boolean {
+        val mu = -tolerance * baselineMedian
+        return WilcoxonRankSum(baseline, experiment, mu).pValue1SidedLess < significance
+    }
+
+    private fun isExperimentImproved(baselineMedian: Double): Boolean {
+        val mu = -tolerance * baselineMedian
+        val wilcoxon = WilcoxonRankSum(experiment, baseline, mu)
+        return wilcoxon.pValue1SidedLess < significance
+    }
+
+    /**
+     * Pseudo-median: the median of the pairwise averages of distinct elements (i < j; self-pairs are excluded)
+     */
+    private fun pseudoMedian(array: DoubleArray): Double {
+        val n = array.size
+        val size = n * (n + 1) / 2 - n
+        val values = DoubleArray(size)
+        var k = 0
+        for (i in 0 until n) {
+            for (j in i + 1 until n) {
+                values[k++] = (array[i] + array[j]) / 2
+            }
+        }
+        return Median().evaluate(values)
+    }
+
+    private fun median(func: (xi: Double, yj: Double) -> Double): Double {
+        val values = DoubleArray(baseline.size * experiment.size)
+        var k = 0
+        for (i in baseline.indices) {
+            for (j in experiment.indices) {
+                values[k++] = func(baseline[i], experiment[j])
+            }
+        }
+        return Median().withNaNStrategy(NaNStrategy.MINIMAL).evaluate(values)
+    }
+
+    private fun shift(): Double {
+        return median { xi, yj -> yj - xi }
+    }
+
+    private fun ratio(): Double {
+        return median { xi, yj -> yj / xi }
+    }
+
+    /**
+     * Calculates the distance between two data sets based on the [Hodges-Lehmann estimator][].
+     * [Hodges-Lehmann estimator]: https://en.wikipedia.org/wiki/Hodges%E2%80%93Lehmann_estimator
+     * https://aakinshin.net/hodges-lehmann-estimator/
+     * https://github.com/AndreyAkinshin/perfolizer/blob/master/src/Perfolizer/Perfolizer/Mathematics/GenericEstimators/HodgesLehmannEstimator.cs
+     *
+     * Takes into account tolerance, which answers the question "is the change big enough to matter?"
+     */
+    fun compare(): DistributionComparison {
+        val experimentShift = shift()
+        val baselineMedian = pseudoMedian(baseline)
+        val experimentRatio = ratio()
+        val isExperimentImproved = isExperimentImproved(baselineMedian)
+        val isExperimentRegressed = isExperimentRegressed(baselineMedian)
+        val experimentRelativeChange = experimentRatio - 1
+        return DistributionComparison(
+            experimentRelativeChange = experimentRelativeChange,
+            experimentAbsoluteChange = experimentShift,
+            isExperimentRegressed = isExperimentRegressed,
+            isExperimentImproved = isExperimentImproved
+        )
+    }
+
+    class Builder(
+        private var baseline: DoubleArray,
+        private var experiment: DoubleArray
+    ) {
+        private var significance: Double = 0.05
+        private var tolerance: Double = 0.01
+
+        fun significance(significance: Double) = apply { this.significance = significance }
+        fun tolerance(tolerance: Double) = apply { this.tolerance = tolerance }
+        fun baseline(baseline: DoubleArray) = apply { this.baseline = baseline }
+        fun experiment(experiment: DoubleArray) = apply { this.experiment = experiment }
+
+        fun build() = DistributionComparator(
+            baseline = baseline,
+            experiment = experiment,
+            tolerance = tolerance,
+            significance = significance
+        )
+
+    }
+}
diff --git a/src/main/kotlin/com/atlassian/performance/tools/report/api/distribution/DistributionComparison.kt b/src/main/kotlin/com/atlassian/performance/tools/report/api/distribution/DistributionComparison.kt
@@ -0,0 +1,18 @@
+package com.atlassian.performance.tools.report.api.distribution
+
+class DistributionComparison(
+    val experimentRelativeChange: Double,
+    val experimentAbsoluteChange: Double,
+    val isExperimentRegressed: Boolean,
+    val isExperimentImproved: Boolean
+) {
+
+    init {
+        if (isExperimentImproved && isExperimentRegressed) {
+            throw IllegalArgumentException("Experiment can't be both regressed and improved at the same time")
+        }
+    }
+
+    fun hasImpact() = isExperimentRegressed || isExperimentImproved
+
+}
diff --git a/src/main/kotlin/com/atlassian/performance/tools/report/api/judge/RelativeNonparametricPerformanceJudge.kt b/src/main/kotlin/com/atlassian/performance/tools/report/api/judge/RelativeNonparametricPerformanceJudge.kt
@@ -2,7 +2,7 @@ package com.atlassian.performance.tools.report.api.judge
 
 import com.atlassian.performance.tools.jiraactions.api.ActionType
 import com.atlassian.performance.tools.report.ActionMetricsReader
-import com.atlassian.performance.tools.report.api.ShiftedDistributionRegressionTest
+import com.atlassian.performance.tools.report.api.distribution.DistributionComparator
 import com.atlassian.performance.tools.report.api.junit.FailedAssertionJUnitReport
 import com.atlassian.performance.tools.report.api.junit.JUnitReport
 import com.atlassian.performance.tools.report.api.junit.SuccessfulJUnitReport
@@ -70,14 +70,16 @@ class RelativeNonparametricPerformanceJudge private constructor(
                 report = FailedAssertionJUnitReport(reportName, "No action $label results for $experimentCohort"),
                 action = action
             )
-        val test = ShiftedDistributionRegressionTest(baseline, experiment, mwAlpha = significance, ksAlpha = 0.0)
-        // shifts are negated, because ShiftedDistributionRegressionTest is relative to experiment, instead of baseline
+        val comparison = DistributionComparator.Builder(baseline, experiment)
+            .tolerance(toleranceRatio.toDouble())
+            .build()
+            .compare()
         val impact = LatencyImpact.Builder(
             action,
-            -test.percentageShift,
-            reader.convertToDuration(-test.locationShift)
+            comparison.experimentRelativeChange,
+            reader.convertToDuration(comparison.experimentAbsoluteChange)
         )
-            .relevant(test.overcomesTolerance(toleranceRatio.toDouble()))
+            .relevant(comparison.hasImpact())
             .build()
         impactHandlers.forEach { it.accept(impact) }
         return if (impact.regression) {
diff --git a/src/test/kotlin/com/atlassian/performance/tools/report/ShiftedDistributionRegressionTestTest.kt b/src/test/kotlin/com/atlassian/performance/tools/report/ShiftedDistributionRegressionTestTest.kt
@@ -1,6 +1,5 @@
 package com.atlassian.performance.tools.report
 
-import com.atlassian.performance.tools.jiraactions.api.*
 import com.atlassian.performance.tools.report.api.ShiftedDistributionRegressionTest
 import com.atlassian.performance.tools.report.api.result.FakeResults
 import com.atlassian.performance.tools.report.chart.Chart
@@ -12,7 +11,7 @@ import org.apache.commons.math3.distribution.NormalDistribution
 import org.apache.commons.math3.distribution.NormalDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY
 import org.apache.commons.math3.random.MersenneTwister
 import org.assertj.core.api.Assertions.assertThat
-import org.assertj.core.api.SoftAssertions.*
+import org.assertj.core.api.SoftAssertions.assertSoftly
 import org.assertj.core.data.Offset
 import org.junit.Ignore
 import org.junit.Test
@@ -112,7 +111,7 @@ class ShiftedDistributionRegressionTestTest {
     }
 
     /**
-     * In a 51% vs 49% case, small diffs should not dominate the big diffs.
+     * In a 51% slightly faster vs 49% much slower case, it should be a regression
      */
     @Ignore("https://ecosystem.atlassian.net/browse/JPERF-1297")
     @Test
@@ -152,16 +151,39 @@ class ShiftedDistributionRegressionTestTest {
         }
     }
 
+    @Test
+//    @Ignore("percentageShift calculation is fixed in DistributionComparator where Hodges-Lehmann estimator with pseudo-median is used")
+    fun shouldDetectImprovementWhenEveryPercentileBetter() {
+        // given
+        val baseline =
+            this.javaClass.getResource("/real-results/view issue 9.17.0 vs 10.0.0/baseline.csv").readText().lines()
+                .map { it.toDouble() }.toDoubleArray()
+        val experiment =
+            this.javaClass.getResource("/real-results/view issue 9.17.0 vs 10.0.0/experiment.csv").readText().lines()
+                .map { it.toDouble() }.toDoubleArray()
+        // when
+        val test = ShiftedDistributionRegressionTest(baseline, experiment)
+        // then
+        plotQuantiles(baseline, experiment)
+        assertSoftly {
+            it.assertThat(test.isExperimentRegressed(0.01)).`as`("isExperimentRegressed").isFalse()
+            it.assertThat(test.percentageShift).`as`("").`as`("percentageShift").isEqualTo(0.03941908713692943)
+            it.assertThat(test.locationShift).`as`("").`as`("locationShift").isEqualTo(20.0)
+            it.assertThat(test.overcomesTolerance(0.01)).`as`("overcomesTolerance").isTrue()
+        }
+    }
 
 
     private fun plotQuantiles(
         baseline: DoubleArray,
         experiment: DoubleArray
     ) {
-        val chart = Chart(listOf(
-            chartLine(baseline, "baseline"),
-            chartLine(experiment, "experiment")
-        ))
+        val chart = Chart(
+            listOf(
+                chartLine(baseline, "baseline"),
+                chartLine(experiment, "experiment")
+            )
+        )
         val htmlFile = Files.createTempFile("kebab", ".html")
             .also { println("Distribution comparison at $it") }
         DistributionComparison(GitRepo.findFromCurrentDirectory()).render(chart, htmlFile)
@@ -174,7 +196,6 @@ class ShiftedDistributionRegressionTestTest {
         yAxisId = "latency-axis"
     )
 
-    @Ignore("Known bug: https://ecosystem.atlassian.net/browse/JPERF-1188")
     @Test
     fun shouldSeeNoShiftAcrossTheSameResult() {
         val result = FakeResults.fastResult
diff --git a/src/test/kotlin/com/atlassian/performance/tools/report/distribution/DistributionComparatorTest.kt b/src/test/kotlin/com/atlassian/performance/tools/report/distribution/DistributionComparatorTest.kt
diff --git a/src/test/resources/real-results/view issue 9.17.0 vs 10.0.0/baseline.csv b/src/test/resources/real-results/view issue 9.17.0 vs 10.0.0/baseline.csv
diff --git a/src/test/resources/real-results/view issue 9.17.0 vs 10.0.0/experiment.csv b/src/test/resources/real-results/view issue 9.17.0 vs 10.0.0/experiment.csv