Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,9 @@ Built-in standard errors include:
`'percentile'` (because standard error is not well defined for percentile
bootstrapping).

+ `PoissonBootstrap(unit, n_replicates, confidence)` : Computes a Poisson
bootstrap estimate of the standard error. It's identical to `Bootstrap`
+ `PoissonBootstrap(unit, n_replicates, confidence, ci_method)` : Computes a
Poisson bootstrap estimate of the standard error or percentiles. It's
identical to `Bootstrap`
except that we use `Poisson(1)` instead of multinomial distribution in
sampling. It's faster than `Bootstrap` on large data when computing in SQL.
See the [post](https://www.unofficialgoogledatascience.com/2015/08/an-introduction-to-poisson-bootstrap26.html)
Expand Down
24 changes: 23 additions & 1 deletion operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3274,6 +3274,27 @@ class PoissonBootstrap(Bootstrap):
instead of multinomial distribution in resampling. See
https://www.unofficialgoogledatascience.com/2015/08/an-introduction-to-poisson-bootstrap26.html
for an introduction.

Attributes:
unit: The column representing the blocks to be resampled in block Poisson
bootstrap. If specified we sample the unique blocks in the `unit` column,
otherwise we sample rows.
n_replicates: The number of bootstrap replicates.
confidence: The level of the confidence interval, must be in (0, 1). If
specified, we return confidence interval range instead of standard error.
Additionally, a display() function will be bound to the result so you can
visualize the confidence interval nicely in Colab and Jupyter notebook.
Required if ci_method is 'percentile'.
children: A tuple of a Metric whose result we bootstrap on.
enable_optimization: If all leaf Metrics are Sum and/or Count, or can be
expressed equivalently by Sum and/or Count, then we can preaggregate the
data for faster computation. See compute_slices() for more details.
has_been_preaggregated: If the Metric and data has already been
preaggregated, this will be set to True. And all other attributes
inherited from Operation.
ci_method: The method to compute confidence intervals. Can be 'std'
(default, uses standard error and normal approximation) or 'percentile'
(uses empirical percentiles from the bootstrap distribution).
"""

def __init__(
Expand All @@ -3284,6 +3305,7 @@ def __init__(
confidence: Optional[float] = None,
enable_optimization=True,
name_tmpl: Text = '{} Poisson Bootstrap',
ci_method: Literal['std', 'percentile'] = 'std',
**kwargs,
):
super(PoissonBootstrap, self).__init__(
Expand All @@ -3293,7 +3315,7 @@ def __init__(
confidence,
enable_optimization,
name_tmpl,
ci_method='std',
ci_method=ci_method,
**kwargs,
)

Expand Down
114 changes: 101 additions & 13 deletions operations_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2053,8 +2053,11 @@ def test_integration(self):

class PoissonBootstrapTests(parameterized.TestCase):

@parameterized.parameters([([],), (['grp2'],), (('grp2', 'grp3'),)])
def test_runs(self, split_by):
@parameterized.product(
split_by=[[], ['grp2'], ('grp2', 'grp3')],
ci_method=['std', 'percentile'],
)
def test_runs(self, split_by, ci_method):
df = pd.DataFrame({
'x': range(6),
'grp': range(6),
Expand All @@ -2067,9 +2070,11 @@ def test_runs(self, split_by):
metrics.Max('x'),
metrics.Min('x'),
])
m1 = operations.PoissonBootstrap('grp', m, 5, 0.9)
m2 = operations.PoissonBootstrap('grp', m, 5, 0.9, False)
m3 = operations.PoissonBootstrap(None, m, 5, 0.9)
m1 = operations.PoissonBootstrap('grp', m, 5, 0.9, ci_method=ci_method)
m2 = operations.PoissonBootstrap(
'grp', m, 5, 0.9, False, ci_method=ci_method
)
m3 = operations.PoissonBootstrap(None, m, 5, 0.9, ci_method=ci_method)
np.random.seed(0)
res1 = m1.compute_on(df, split_by).display(return_formatted_df=True)
np.random.seed(0)
Expand All @@ -2079,8 +2084,11 @@ def test_runs(self, split_by):
testing.assert_frame_equal(res1, res2)
testing.assert_frame_equal(res2, res3)

@parameterized.parameters([([],), (['grp2'],), (('grp2', 'grp3'),)])
def test_each_unit_has_one_row(self, split_by):
@parameterized.product(
split_by=[[], ['grp2'], ('grp2', 'grp3')],
ci_method=['std', 'percentile'],
)
def test_each_unit_has_one_row(self, split_by, ci_method):
df = pd.DataFrame({
'x': range(6),
'grp': range(6),
Expand All @@ -2095,16 +2103,19 @@ def test_each_unit_has_one_row(self, split_by):
metrics.Min('x'),
])
m = operations.AbsoluteChange('grp4', 1, m)
m1 = operations.PoissonBootstrap('grp', m, 5, 0.9)
m2 = operations.PoissonBootstrap(None, m, 5, 0.9)
m1 = operations.PoissonBootstrap('grp', m, 5, 0.9, ci_method=ci_method)
m2 = operations.PoissonBootstrap(None, m, 5, 0.9, ci_method=ci_method)
np.random.seed(0)
res1 = m1.compute_on(df, split_by).display(return_formatted_df=True)
np.random.seed(0)
res2 = m2.compute_on(df, split_by).display(return_formatted_df=True)
testing.assert_frame_equal(res1, res2)

@parameterized.parameters([([],), (['grp2'],), (('grp2', 'grp3'),)])
def test_each_unit_has_multiple_rows(self, split_by):
@parameterized.product(
split_by=[[], ['grp2'], ('grp2', 'grp3')],
ci_method=['std', 'percentile'],
)
def test_each_unit_has_multiple_rows(self, split_by, ci_method):
df = pd.DataFrame({
'x': range(6),
'grp': [0] * 3 + [1] * 3,
Expand All @@ -2119,14 +2130,91 @@ def test_each_unit_has_multiple_rows(self, split_by):
metrics.Min('x'),
])
m = operations.AbsoluteChange('grp4', 0, m)
m1 = operations.PoissonBootstrap('grp', m, 5, 0.9)
m2 = operations.PoissonBootstrap('grp', m, 5, 0.9, False)
m1 = operations.PoissonBootstrap('grp', m, 5, 0.9, ci_method=ci_method)
m2 = operations.PoissonBootstrap(
'grp', m, 5, 0.9, False, ci_method=ci_method
)
np.random.seed(0)
res1 = m1.compute_on(df, split_by).display(return_formatted_df=True)
np.random.seed(0)
res2 = m2.compute_on(df, split_by).display(return_formatted_df=True)
testing.assert_frame_equal(res1, res2)

@parameterized.parameters(['std', 'percentile'])
def test_confidence(self, ci_method):
np.random.seed(42)
n = 100
x = np.arange(0, 3, 0.5)
df = pd.DataFrame({'x': x, 'grp': ['A'] * 3 + ['B'] * 3})
metric = metrics.MetricList((metrics.Sum('x'), metrics.Count('x')))
bootstrap_no_unit = operations.PoissonBootstrap(None, metric, n)

melted = operations.PoissonBootstrap(
None, metric, n, 0.9, ci_method=ci_method
).compute_on(df, melted=True)
np.random.seed(42)
expected = bootstrap_no_unit.compute_on(df, melted=True)

if ci_method == 'std':
multiplier = stats.t.ppf((1 + 0.9) / 2, n - 1)
lower_ci = (
expected['Value'] - multiplier * expected['Poisson Bootstrap SE']
)
upper_ci = (
expected['Value'] + multiplier * expected['Poisson Bootstrap SE']
)
else:
np.random.seed(42)
estimates = []
for _ in range(n):
weights = np.random.poisson(size=len(x))
sample = df.iloc[np.arange(len(x)).repeat(weights)]
res = metric.compute_on(sample, return_dataframe=False)
estimates.append(res)

# calculate percentile directly here
q_lower = (1 - 0.9) / 2 * 100
q_upper = (1 + 0.9) / 2 * 100
lower_ci = [
np.percentile([e[0] for e in estimates], q_lower),
np.percentile([e[1] for e in estimates], q_lower),
]
upper_ci = [
np.percentile([e[0] for e in estimates], q_upper),
np.percentile([e[1] for e in estimates], q_upper),
]

expected['Poisson Bootstrap CI-lower'] = lower_ci
expected['Poisson Bootstrap CI-upper'] = upper_ci

expected.drop('Poisson Bootstrap SE', axis=1, inplace=True)
testing.assert_frame_equal(melted, expected, rtol=0.1)
melted.display() # Check display() runs.

np.random.seed(42)
unmelted = operations.PoissonBootstrap(
None, metric, n, 0.9, ci_method=ci_method
).compute_on(df)
expected = pd.DataFrame(
[
list(melted.loc['sum(x)'].values)
+ list(melted.loc['count(x)'].values)
],
columns=pd.MultiIndex.from_product(
[
['sum(x)', 'count(x)'],
[
'Value',
'Poisson Bootstrap CI-lower',
'Poisson Bootstrap CI-upper',
],
],
names=['Metric', None],
),
)
testing.assert_frame_equal(unmelted, expected, rtol=0.1)
unmelted.display() # Check display() runs.

def test_get_samples_with_unit(self):
x = range(6)
df = pd.DataFrame({'x': x, 'grp': ['A'] * 3 + ['B'] * 3})
Expand Down
Loading