Skip to content
This repository was archived by the owner on Jan 22, 2026. It is now read-only.

Commit 55c1646

Browse files
author
Rob Hudson
authored
Add Mann-Whitney U test for comparing histograms (bug 1395571) (#174)
1 parent 26e5deb commit 55c1646

File tree

6 files changed

+257
-18
lines changed

6 files changed

+257
-18
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,7 @@
99
*.o
1010
*~
1111
.DS_Store
12+
.coverage
13+
.eggs/
14+
python_moztelemetry.egg-info/
1215
docs/_build/

Dockerfile

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,32 @@
1-
FROM java:8
1+
FROM openjdk:8
22

33
# Versions of spark + hbase to use for our testing environment
4-
ENV SPARK_VERSION=2.0.2
5-
ENV HBASE_VERSION=1.2.3
4+
ENV SPARK_VERSION=2.0.2 \
5+
HBASE_VERSION=1.2.3
66

77
# install gcc
8-
RUN apt-get update && apt-get install -y g++ libpython-dev libsnappy-dev
8+
RUN apt-get update --fix-missing && \
9+
apt-get install -y \
10+
g++ libpython-dev libsnappy-dev
911

1012
# setup conda environment
11-
1213
# temporary workaround, pin miniconda version until it's fixed.
13-
RUN wget https://repo.continuum.io/miniconda/Miniconda2-4.3.21-Linux-x86_64.sh -O miniconda.sh
14-
RUN bash miniconda.sh -b -p /miniconda
15-
ENV PATH="/miniconda/bin:${PATH}"
16-
RUN hash -r
17-
RUN conda config --set always_yes yes --set changeps1 no
18-
# TODO: uncomment
19-
# RUN conda update -q conda
20-
RUN conda info -a # Useful for debugging any issues with conda
21-
22-
# install spark/hbase
23-
RUN wget -nv https://archive.apache.org/dist/hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz
24-
RUN tar -zxf hbase-$HBASE_VERSION-bin.tar.gz
14+
RUN echo "export PATH=/miniconda/bin:${PATH}" > /etc/profile.d/conda.sh && \
15+
wget --quiet https://repo.continuum.io/miniconda/Miniconda2-4.3.21-Linux-x86_64.sh -O miniconda.sh && \
16+
/bin/bash miniconda.sh -b -p /miniconda && \
17+
rm miniconda.sh
18+
19+
ENV PATH /miniconda/bin:${PATH}
20+
21+
RUN hash -r && \
22+
conda config --set always_yes yes --set changeps1 no && \
23+
# TODO: uncomment \
24+
# RUN conda update -q conda && \
25+
conda info -a # Useful for debugging any issues with conda
26+
27+
# install hbase
28+
RUN wget -nv https://archive.apache.org/dist/hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz && \
29+
tar -zxf hbase-$HBASE_VERSION-bin.tar.gz
2530

2631
# build + activate conda environment
2732
COPY ./environment.yml /python_moztelemetry/

environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ dependencies:
55
- pyspark
66
- python-snappy
77
- snappy
8+
- scipy

moztelemetry/stats.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
4+
# This Source Code Form is subject to the terms of the Mozilla Public
5+
# License, v. 2.0. If a copy of the MPL was not distributed with this
6+
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
7+
8+
from __future__ import division
9+
10+
import math
11+
from collections import namedtuple
12+
13+
14+
def _rank(sample):
15+
"""
16+
Assign numeric ranks to all values in the sample.
17+
18+
The ranks begin with 1 for the smallest value. When there are groups of
19+
tied values, assign a rank equal to the midpoint of unadjusted rankings.
20+
21+
E.g.::
22+
23+
>>> rank({3: 1, 5: 4, 9: 1})
24+
{3: 1.0, 5: 3.5, 9: 6.0}
25+
26+
"""
27+
rank = 1
28+
ranks = {}
29+
30+
for k in sorted(sample.keys()):
31+
n = sample[k]
32+
ranks[k] = rank + (n - 1) / 2
33+
rank += n
34+
35+
return ranks
36+
37+
38+
def _tie_correct(sample):
39+
"""
40+
Returns the tie correction value for U.
41+
42+
See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.tiecorrect.html
43+
44+
"""
45+
tc = 0
46+
n = sum(sample.values())
47+
48+
if n < 2:
49+
return 1.0 # Avoid a ``ZeroDivisionError``.
50+
51+
for k in sorted(sample.keys()):
52+
tc += math.pow(sample[k], 3) - sample[k]
53+
tc = 1 - tc / (math.pow(n, 3) - n)
54+
55+
return tc
56+
57+
58+
def ndtr(a):
    """
    Return the area under the Gaussian probability density function,
    integrated from minus infinity to x (the standard normal CDF).

    Pure-Python port of the cephes routine used by scipy. See:
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.ndtr.html#scipy.special.ndtr
    """
    sqrth = math.sqrt(2) / 2
    x = float(a) * sqrth
    z = abs(x)

    # Near zero, erf is the numerically stable choice.
    if z < sqrth:
        return 0.5 + 0.5 * math.erf(x)

    # In the tails, erfc avoids cancellation; it gives the tail mass
    # directly, so flip it for positive x.
    tail = 0.5 * math.erfc(z)
    return 1 - tail if x > 0 else tail
76+
77+
78+
mwu_result = namedtuple('Mann_Whitney_U', ('u', 'p'))


def mann_whitney_u(sample1, sample2, use_continuity=True):
    """
    Compute the Mann-Whitney rank test on both samples.

    Each sample is a histogram mapping value -> count, e.g.::

        {1: 5, 2: 20, 3: 12, ...}

    :param sample1: histogram of the first sample.
    :param sample2: histogram of the second sample.
    :param use_continuity: apply the 0.5 continuity correction
        (default ``True``).
    :returns: a named tuple with ``u`` equal to
        min(U for sample1, U for sample2) and ``p`` equal to the
        (one-sided) p-value from the normal approximation.
    """
    # Merge the histograms, adding counts where values match, so that the
    # ranks are computed over the combined sample.
    combined = sample1.copy()
    for value, count in sample2.items():
        combined[value] = combined.get(value, 0) + count

    # Ranking dictionary keyed by the same values for direct lookups.
    ranks = _rank(combined)

    # Sum of ranks for sample1 (a count of ``n`` ties contributes ``n``
    # copies of the shared midpoint rank).
    sum_of_ranks = sum(count * ranks[value]
                       for value, count in sample1.items())
    n1 = sum(sample1.values())
    n2 = sum(sample2.values())

    # Mann-Whitney U for both samples; the two always add up to n1 * n2.
    u1 = sum_of_ranks - (n1 * (n1 + 1)) / 2
    u2 = n1 * n2 - u1

    # Normal approximation with tie correction for the p-value.
    # NOTE(review): sd_u is 0 when a sample is empty or every observation
    # is identical; that still raises ZeroDivisionError, as before.
    tie_correction = _tie_correct(combined)
    sd_u = math.sqrt(tie_correction * n1 * n2 * (n1 + n2 + 1) / 12.0)
    mean_rank = n1 * n2 / 2.0 + 0.5 * use_continuity
    z = abs((max(u1, u2) - mean_rank) / sd_u)

    return mwu_result(min(u1, u2), ndtr(-z))

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,5 @@
2222
setup_requires=['pytest-runner', 'setuptools_scm'],
2323
# put pytest last to workaround this bug
2424
# https://bitbucket.org/pypa/setuptools/issues/196/tests_require-pytest-pytest-cov-breaks
25-
tests_require=['mock', 'pytest-timeout', 'moto', 'responses', 'pytest'],
25+
tests_require=['mock', 'pytest-timeout', 'moto', 'responses', 'scipy', 'pytest'],
2626
)

tests/test_stats.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
4+
# This Source Code Form is subject to the terms of the Mozilla Public
5+
# License, v. 2.0. If a copy of the MPL was not distributed with this
6+
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
7+
8+
"""
9+
This module implements test coverage for the stats functions in stats.py.
10+
"""
11+
import itertools
12+
13+
import numpy.random
14+
import scipy.stats
15+
16+
import pytest
17+
from moztelemetry import stats
18+
19+
20+
def l2d(values):
    """Convert a list of values to its histogram (value -> count) dict."""
    histogram = {}
    for value in values:
        histogram[value] = histogram.get(value, 0) + 1
    return histogram
26+
27+
28+
# Seed the shared global RNG (used by both numpy and scipy.stats.rvs) so
# test failures are reproducible instead of flaky across runs.
numpy.random.seed(1395571)

# A normally distributed sample set.
norm1 = list(numpy.random.normal(5, 3.25, 1000))
norm2 = list(numpy.random.normal(6, 2.5, 1000))

# A uniformly distributed sample set.
uni1 = numpy.random.randint(1, 100, 1000)
uni2 = numpy.random.randint(10, 120, 900)

# A skewed normal distribution.
skew1 = list(scipy.stats.skewnorm.rvs(10, size=1000))
skew2 = list(scipy.stats.skewnorm.rvs(5, size=900))


samples = {
    'normalized': (norm1, norm2),
    'uniform': (uni1, uni2),
    'skewed': (skew1, skew2),
}
46+
47+
48+
def test_rank():
    # A single value gets the smallest rank, 1.0.
    assert stats._rank({1: 1}) == {1: 1.0}

    # Tied groups receive the midpoint of their unadjusted rankings.
    expected = {
        1: 3.0,
        2: 7.5,
        3: 11.0,
        4: 13.5,
        5: 15.0,
    }
    assert stats._rank({1: 5, 2: 4, 3: 3, 4: 2, 5: 1}) == expected
57+
58+
59+
def test_tie_correct():
    # Degenerate samples (fewer than two observations) need no correction.
    for sample in ({}, {1: 1}):
        assert stats._tie_correct(sample) == 1.0
62+
63+
64+
def test_ndtr():
    # This module only imports ``scipy.stats`` at the top; make the
    # dependency on ``scipy.special`` explicit instead of relying on
    # scipy.stats importing it transitively.
    import scipy.special

    # Invalid values raise an error (``ndtr`` coerces with ``float()``).
    with pytest.raises(TypeError):
        stats.ndtr(None)
    with pytest.raises(ValueError):
        stats.ndtr('a')

    # Known values of the standard normal CDF.
    assert round(stats.ndtr(0), 6) == 0.5
    assert round(stats.ndtr(1), 6) == 0.841345
    assert round(stats.ndtr(2), 6) == 0.977250
    assert round(stats.ndtr(3), 6) == 0.998650

    # Agreement with the scipy reference implementation.
    for x in (0, 1, 1.5, 2, 3):
        assert round(stats.ndtr(x), 6) == round(scipy.special.ndtr(x), 6)
81+
82+
83+
def test_mann_whitney_u():
    distribution_types = ('normalized', 'uniform', 'skewed')

    def check(res, sci):
        # Our U statistic and (rounded) p-value must match scipy's.
        assert res.u == sci.statistic
        assert round(res.p, 6) == round(sci.pvalue, 6)

    # Test different distributions against each other, including
    # like distributions against themselves.
    for name1, name2 in itertools.product(distribution_types, repeat=2):
        arr1, arr2 = samples[name1][0], samples[name2][1]
        hist1, hist2 = l2d(arr1), l2d(arr2)

        # Basic test, with defaults.
        check(stats.mann_whitney_u(hist1, hist2),
              scipy.stats.mannwhitneyu(arr1, arr2))

        # Test that order of samples doesn't matter.
        check(stats.mann_whitney_u(hist2, hist1),
              scipy.stats.mannwhitneyu(arr1, arr2))

        # Test exact same samples.
        check(stats.mann_whitney_u(hist1, hist1),
              scipy.stats.mannwhitneyu(arr1, arr1))

        # Test with use_continuity = False.
        check(stats.mann_whitney_u(hist1, hist2, False),
              scipy.stats.mannwhitneyu(arr1, arr2, False))

0 commit comments

Comments
 (0)