diff --git a/README.md b/README.md index 6070b60..4ca7d77 100644 --- a/README.md +++ b/README.md @@ -187,8 +187,9 @@ Built-in standard errors include: `'percentile'` (because standard error is not well defined for percentile bootstrapping). -+ `PoissonBootstrap(unit, n_replicates, confidence)` : Computes a Poisson - bootstrap estimate of the standard error. It's identical to `Bootstrap` ++ `PoissonBootstrap(unit, n_replicates, confidence, ci_method)` : Computes a + Poisson bootstrap estimate of the standard error or percentiles. It's + identical to `Bootstrap` except that we use `Poisson(1)` instead of multinomial distribution in sampling. It's faster than `Bootstrap` on large data when computing in SQL. See the [post](https://www.unofficialgoogledatascience.com/2015/08/an-introduction-to-poisson-bootstrap26.html) diff --git a/operations.py b/operations.py index a4c6d0e..8a71af3 100644 --- a/operations.py +++ b/operations.py @@ -3274,6 +3274,27 @@ class PoissonBootstrap(Bootstrap): instead of multinomial distribution in resampling. See https://www.unofficialgoogledatascience.com/2015/08/an-introduction-to-poisson-bootstrap26.html for an introduction. + + Attributes: + unit: The column representing the blocks to be resampled in block Poisson + bootstrap. If specified we sample the unique blocks in the `unit` column, + otherwise we sample rows. + n_replicates: The number of bootstrap replicates. + confidence: The level of the confidence interval, must be in (0, 1). If + specified, we return confidence interval range instead of standard error. + Additionally, a display() function will be bound to the result so you can + visualize the confidence interval nicely in Colab and Jupyter notebook. + Required if ci_method is 'percentile'. + children: A tuple of a Metric whose result we bootstrap on. + enable_optimization: If all leaf Metrics are Sum and/or Count, or can be + expressed equivalently by Sum and/or Count, then we can preaggregate the + data for faster computation. See compute_slices() for more details. + has_been_preaggregated: If the Metric and data has already been + preaggregated, this will be set to True. And all other attributes + inherited from Operation. + ci_method: The method to compute confidence intervals. Can be 'std' + (default, uses standard error and normal approximation) or 'percentile' + (uses empirical percentiles from the bootstrap distribution). """ def __init__( @@ -3284,6 +3305,7 @@ def __init__( confidence: Optional[float] = None, enable_optimization=True, name_tmpl: Text = '{} Poisson Bootstrap', + ci_method: Literal['std', 'percentile'] = 'std', **kwargs, ): super(PoissonBootstrap, self).__init__( @@ -3293,7 +3315,7 @@ def __init__( confidence, enable_optimization, name_tmpl, - ci_method='std', + ci_method=ci_method, **kwargs, ) diff --git a/operations_test.py b/operations_test.py index d117635..4d3fc12 100644 --- a/operations_test.py +++ b/operations_test.py @@ -2053,8 +2053,11 @@ def test_integration(self): class PoissonBootstrapTests(parameterized.TestCase): - @parameterized.parameters([([],), (['grp2'],), (('grp2', 'grp3'),)]) - def test_runs(self, split_by): + @parameterized.product( + split_by=[[], ['grp2'], ('grp2', 'grp3')], + ci_method=['std', 'percentile'], + ) + def test_runs(self, split_by, ci_method): df = pd.DataFrame({ 'x': range(6), 'grp': range(6), @@ -2067,9 +2070,11 @@ def test_runs(self, split_by): metrics.Max('x'), metrics.Min('x'), ]) - m1 = operations.PoissonBootstrap('grp', m, 5, 0.9) - m2 = operations.PoissonBootstrap('grp', m, 5, 0.9, False) - m3 = operations.PoissonBootstrap(None, m, 5, 0.9) + m1 = operations.PoissonBootstrap('grp', m, 5, 0.9, ci_method=ci_method) + m2 = operations.PoissonBootstrap( + 'grp', m, 5, 0.9, False, ci_method=ci_method + ) + m3 = operations.PoissonBootstrap(None, m, 5, 0.9, ci_method=ci_method) np.random.seed(0) res1 = m1.compute_on(df, split_by).display(return_formatted_df=True) np.random.seed(0) @@ -2079,8 +2084,11 @@ def test_runs(self, split_by): testing.assert_frame_equal(res1, res2) testing.assert_frame_equal(res2, res3) - @parameterized.parameters([([],), (['grp2'],), (('grp2', 'grp3'),)]) - def test_each_unit_has_one_row(self, split_by): + @parameterized.product( + split_by=[[], ['grp2'], ('grp2', 'grp3')], + ci_method=['std', 'percentile'], + ) + def test_each_unit_has_one_row(self, split_by, ci_method): df = pd.DataFrame({ 'x': range(6), 'grp': range(6), @@ -2095,16 +2103,19 @@ def test_each_unit_has_one_row(self, split_by): metrics.Min('x'), ]) m = operations.AbsoluteChange('grp4', 1, m) - m1 = operations.PoissonBootstrap('grp', m, 5, 0.9) - m2 = operations.PoissonBootstrap(None, m, 5, 0.9) + m1 = operations.PoissonBootstrap('grp', m, 5, 0.9, ci_method=ci_method) + m2 = operations.PoissonBootstrap(None, m, 5, 0.9, ci_method=ci_method) np.random.seed(0) res1 = m1.compute_on(df, split_by).display(return_formatted_df=True) np.random.seed(0) res2 = m2.compute_on(df, split_by).display(return_formatted_df=True) testing.assert_frame_equal(res1, res2) - @parameterized.parameters([([],), (['grp2'],), (('grp2', 'grp3'),)]) - def test_each_unit_has_multiple_rows(self, split_by): + @parameterized.product( + split_by=[[], ['grp2'], ('grp2', 'grp3')], + ci_method=['std', 'percentile'], + ) + def test_each_unit_has_multiple_rows(self, split_by, ci_method): df = pd.DataFrame({ 'x': range(6), 'grp': [0] * 3 + [1] * 3, @@ -2119,14 +2130,91 @@ def test_each_unit_has_multiple_rows(self, split_by): metrics.Min('x'), ]) m = operations.AbsoluteChange('grp4', 0, m) - m1 = operations.PoissonBootstrap('grp', m, 5, 0.9) - m2 = operations.PoissonBootstrap('grp', m, 5, 0.9, False) + m1 = operations.PoissonBootstrap('grp', m, 5, 0.9, ci_method=ci_method) + m2 = operations.PoissonBootstrap( + 'grp', m, 5, 0.9, False, ci_method=ci_method + ) np.random.seed(0) res1 = m1.compute_on(df, split_by).display(return_formatted_df=True) np.random.seed(0) res2 = m2.compute_on(df, split_by).display(return_formatted_df=True) testing.assert_frame_equal(res1, res2) + @parameterized.parameters(['std', 'percentile']) + def test_confidence(self, ci_method): + np.random.seed(42) + n = 100 + x = np.arange(0, 3, 0.5) + df = pd.DataFrame({'x': x, 'grp': ['A'] * 3 + ['B'] * 3}) + metric = metrics.MetricList((metrics.Sum('x'), metrics.Count('x'))) + bootstrap_no_unit = operations.PoissonBootstrap(None, metric, n) + + melted = operations.PoissonBootstrap( + None, metric, n, 0.9, ci_method=ci_method + ).compute_on(df, melted=True) + np.random.seed(42) + expected = bootstrap_no_unit.compute_on(df, melted=True) + + if ci_method == 'std': + multiplier = stats.t.ppf((1 + 0.9) / 2, n - 1) + lower_ci = ( + expected['Value'] - multiplier * expected['Poisson Bootstrap SE'] + ) + upper_ci = ( + expected['Value'] + multiplier * expected['Poisson Bootstrap SE'] + ) + else: + np.random.seed(42) + estimates = [] + for _ in range(n): + weights = np.random.poisson(size=len(x)) + sample = df.iloc[np.arange(len(x)).repeat(weights)] + res = metric.compute_on(sample, return_dataframe=False) + estimates.append(res) + + # calculate percentile directly here + q_lower = (1 - 0.9) / 2 * 100 + q_upper = (1 + 0.9) / 2 * 100 + lower_ci = [ + np.percentile([e[0] for e in estimates], q_lower), + np.percentile([e[1] for e in estimates], q_lower), + ] + upper_ci = [ + np.percentile([e[0] for e in estimates], q_upper), + np.percentile([e[1] for e in estimates], q_upper), + ] + + expected['Poisson Bootstrap CI-lower'] = lower_ci + expected['Poisson Bootstrap CI-upper'] = upper_ci + + expected.drop('Poisson Bootstrap SE', axis=1, inplace=True) + testing.assert_frame_equal(melted, expected, rtol=0.1) + melted.display() # Check display() runs. + + np.random.seed(42) + unmelted = operations.PoissonBootstrap( + None, metric, n, 0.9, ci_method=ci_method + ).compute_on(df) + expected = pd.DataFrame( + [ + list(melted.loc['sum(x)'].values) + + list(melted.loc['count(x)'].values) + ], + columns=pd.MultiIndex.from_product( + [ + ['sum(x)', 'count(x)'], + [ + 'Value', + 'Poisson Bootstrap CI-lower', + 'Poisson Bootstrap CI-upper', + ], + ], + names=['Metric', None], + ), + ) + testing.assert_frame_equal(unmelted, expected, rtol=0.1) + unmelted.display() # Check display() runs. + def test_get_samples_with_unit(self): x = range(6) df = pd.DataFrame({'x': x, 'grp': ['A'] * 3 + ['B'] * 3})