Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,15 @@ def to_sql(table, split_by=None):
'CumulativeDistribution',
'PercentChange',
'AbsoluteChange',
'LogPercentChange',
'PrePostChange',
'CUPED',
'MH',
'Jackknife',
'Bootstrap',
'PoissonBootstrap',
'LogTransform',
'ReverseLogPercentTransform',
# Diversity Operations
'HHI',
'Entropy',
Expand Down
3 changes: 2 additions & 1 deletion metrics_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1016,7 +1016,8 @@ def test_built_ins(self):
'Comparison',
'MetricWithCI',
'Model',
'DiversityBase'
'DiversityBase',
'MetricFunction',
)),
)
self.assertEmpty(set(metrics.BUILT_INS).difference(all_classes))
Expand Down
151 changes: 151 additions & 0 deletions operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -672,6 +672,89 @@ def get_sql_template_for_comparison(self, raw_table_alias, base_table_alias):
return f'{raw_table_alias}.%(r)s - {base_table_alias}.%(b)s'


class LogPercentChange(PercentChange):
  """Log percent change estimator on a Metric.

  Computes the logarithm of the ratio of expected values between two
  distributions.

  If the primary parameter of interest is the relative percentage difference
  between the expectations of two random variables, X1 and X0, defined as
  100 * (E[X1] / E[X0] - 1), we can define an intermediate parameter for
  statistical inference: the difference of their logarithms. Let this
  transformed parameter be g(θ) = log(E[X1]) - log(E[X0]) = log(E[X1] / E[X0]).

  After computing a confidence interval [L, U] for g(θ), we can apply the
  transformation f(x) = 100(e^x - 1) to the interval endpoints. This yields
  [f(L), f(U)], the confidence interval for the original parameter of interest.

  Attributes:
    extra_split_by: The column(s) that contains the conditions.
    baseline_key: The value of the condition that represents the baseline
      (e.g., "Control"). All conditions will be compared to this baseline. If
      condition_column contains multiple columns, then baseline_key should be
      a tuple.
    children: A tuple of a Metric whose result we compute the log ratio on.
    include_base: A boolean for whether the baseline condition should be
      included in the output.
    base: The log base to use. Can be 'ln' (natural log) or 'log10'. Defaults
      to 'ln'.
    reverse_log: Always set to True by __init__. Flags to CI-producing
      operations (e.g. Jackknife via MetricWithCI.compute_slices) that the
      point estimate and CI bounds should be mapped back to the percent scale
      with get_post_processing_fn().
    And all other attributes inherited from Operation.
  """

  def __init__(
      self,
      condition_column: Text,
      baseline_key,
      child: Optional[metrics.Metric] = None,
      include_base: bool = False,
      name_tmpl: Text = '{}',
      base: str = 'ln',
      **kwargs,
  ):
    """Initializes LogPercentChange.

    Args:
      condition_column: The column(s) that contains the conditions.
      baseline_key: The value of the condition that represents the baseline.
      child: The Metric whose result the log ratio is computed on.
      include_base: Whether the baseline condition is included in the output.
      name_tmpl: The template to generate the name from the child's name.
      base: The log base, 'ln' or 'log10'.
      **kwargs: Other kwargs passed to PercentChange.

    Raises:
      ValueError: If base is neither 'ln' nor 'log10'.
    """
    super(LogPercentChange, self).__init__(
        condition_column, baseline_key, child, include_base, name_tmpl, **kwargs
    )
    # Validate after super().__init__ so the instance is otherwise fully set
    # up; base is only consulted in compute/SQL paths below.
    if base not in ('ln', 'log10'):
      raise ValueError("base must be 'ln' or 'log10'")
    self.base = base
    # Consumed by MetricWithCI subclasses to reverse the log transform on the
    # final point estimate and CI bounds.
    self.reverse_log = True

  def compute_on_children(self, children, split_by):
    """Computes the log percent change on the children."""
    children_float = children.astype(float)
    if (children_float <= 0).any().any():
      # np.log/np.log10 yield NaN for negative inputs and -inf for zero, so
      # downstream values will not be finite; warn up front.
      warnings.warn(
          'LogPercentChange found zero or negative values. Returning NaN.'
      )

    # log (treatment / control) = log(treatment) - log(control)
    log_children = LogTransform(base=self.base).compute_on_children(
        children_float, split_by
    )
    # Deliberate unbound call: reuse AbsoluteChange's differencing against the
    # baseline while keeping this instance's configuration (self).
    log_ratio = AbsoluteChange.compute_on_children(
        self, log_children, split_by
    )

    return log_ratio

  def get_post_processing_fn(self):
    """Returns the inverse transformation function."""
    # Maps a log-scale value x back to percent change: 100 * (base**x - 1).
    return ReverseLogPercentTransform(base=self.base).func

  def get_sql_template_for_comparison(self, raw_table_alias, base_table_alias):
    """Returns the SQL template computing the log of the ratio.

    NOTE(review): the SQL path emits the raw log ratio (LN/LOG10 of the safe
    division); the reverse transform is presumably applied later by the
    CI machinery, mirroring the in-memory path — confirm against callers.
    """
    divide_sql = sql.SAFE_DIVIDE_FN(
        numer=f'{raw_table_alias}.%(r)s',
        denom=f'{base_table_alias}.%(b)s',
    )
    if self.base == 'ln':
      return f'LN({divide_sql})'
    else:
      return f'LOG10({divide_sql})'


def _check_covariates_match_base(base, cov):
len_base = len(base) if isinstance(base, metrics.MetricList) else 1
len_cov = len(cov) if isinstance(cov, metrics.MetricList) else 1
Expand Down Expand Up @@ -1834,6 +1917,60 @@ def display(
return display


class MetricFunction(Operation):
  """Base class for applying element-wise functions to Metric results.

  Wraps an arbitrary callable and applies it to whatever the child Metric
  produces, delegating everything else (naming, splitting, fingerprinting)
  to Operation.
  """

  def __init__(self, child, func, name_tmpl, **kwargs):
    super().__init__(child, name_tmpl, **kwargs)
    # The element-wise transformation applied to the child's result.
    self.func = func

  def compute_on_children(self, children, split_by):
    """Applies the wrapped function to the child's computed result."""
    del split_by  # Unused: the function is applied to the result as a whole.
    return self.func(children)


class LogTransform(MetricFunction):
  """Applies a log function (ln or log10) to Metric results.

  Attributes:
    base: Which logarithm to apply, 'ln' (natural log) or 'log10'.
    And all other attributes inherited from MetricFunction.
  """

  def __init__(self, child=None, base: str = 'ln', **kwargs):
    if base not in ('ln', 'log10'):
      raise ValueError("base must be 'ln' or 'log10'")
    self.base = base
    # Select the numpy log function and the display-name template together.
    if base == 'ln':
      log_fn, tmpl = np.log, 'Log({})'
    else:
      log_fn, tmpl = np.log10, 'Log10({})'
    super().__init__(
        child,
        log_fn,
        tmpl,
        additional_fingerprint_attrs=['base'],
        **kwargs
    )


class ReverseLogPercentTransform(MetricFunction):
  """Applies the inverse log transform to recover a percent change.

  Converts a log-scale value back to the percent-change scale: an input x
  (typically x = log(treatment / control)) is mapped to 100 * (base**x - 1),
  which equals 100 * (ratio - 1), the percent change of the original ratio.
  Equivalently, log(1 + p/100) is mapped back to p percent.

  Attributes:
    base: The log base the input was computed with, 'ln' or 'log10'.
    And all other attributes inherited from MetricFunction.
  """

  def __init__(self, child=None, base: str = 'ln', **kwargs):
    """Initializes ReverseLogPercentTransform.

    Args:
      child: The Metric whose result is transformed.
      base: The log base the input was computed with, 'ln' or 'log10'.
      **kwargs: Other kwargs passed to MetricFunction.

    Raises:
      ValueError: If base is neither 'ln' nor 'log10'.
    """
    if base not in ('ln', 'log10'):
      raise ValueError("base must be 'ln' or 'log10'")
    self.base = base
    # Named inner functions instead of assigned lambdas (PEP 8 E731); the
    # math is unchanged: 100 * (base**x - 1).
    if base == 'ln':
      def func(x):
        return 100 * (np.exp(x) - 1)

      name_tmpl = '100 * Exp({}) - 1'
    else:
      def func(x):
        return 100 * (10**x - 1)

      name_tmpl = '100 * 10^({}) - 1'
    super().__init__(
        child,
        func,
        name_tmpl,
        additional_fingerprint_attrs=['base'],
        **kwargs
    )


class MetricWithCI(Operation):
"""Base class for Metrics that have confidence interval info in the return.

Expand Down Expand Up @@ -1919,6 +2056,20 @@ def compute_slices(self, df, split_by=None):
res = point_est.join(utils.melt(std))
if self.confidence:
res = self.compute_ci(res)

if len(self.children) == 1 and getattr(
self.children[0], 'reverse_log', False
):
transform_fn = self.children[0].get_post_processing_fn()
res.iloc[:, 0] = transform_fn(res.iloc[:, 0])
if self.confidence:
res[self.prefix + ' CI-lower'] = transform_fn(
res[self.prefix + ' CI-lower']
)
res[self.prefix + ' CI-upper'] = transform_fn(
res[self.prefix + ' CI-upper']
)

res = utils.unmelt(res)
if not self.confidence:
return res
Expand Down
113 changes: 113 additions & 0 deletions operations_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,47 @@ def test_percent_change_object_dtype_divide_by_zero(self):
expected.index.name = 'grp'
testing.assert_frame_equal(output, expected)

def test_log_percent_change(self):
  change = operations.LogPercentChange('grp', 'B', metrics.Sum('x'))
  actual = change.compute_on(self.df)
  # sum(x) in 'A' is 3x that in the baseline 'B', so the value is ln(3).
  expected = pd.DataFrame([[np.log(3.0)]], columns=['sum(x)'], index=['A'])
  expected.index.name = 'grp'
  testing.assert_frame_equal(actual, expected)

def test_log_percent_change_base_log10(self):
  change = operations.LogPercentChange(
      'grp', 'B', metrics.Sum('x'), base='log10'
  )
  actual = change.compute_on(self.df)
  # Same 3x ratio as the natural-log case, but measured in base 10.
  expected = pd.DataFrame([[np.log10(3.0)]], columns=['sum(x)'], index=['A'])
  expected.index.name = 'grp'
  testing.assert_frame_equal(actual, expected)

def test_log_percent_change_warns_on_zero_negative_returns_nan(self):
  # A negative baseline value makes the log undefined.
  df = pd.DataFrame({'x': [-1, 1], 'grp': ['B', 'A']})
  change = operations.LogPercentChange('grp', 'B', metrics.Sum('x'))
  with self.assertWarns(UserWarning) as cm:
    res = change.compute_on(df)
  self.assertIn(
      'LogPercentChange found zero or negative values. Returning NaN.',
      str(cm.warning),
  )
  self.assertTrue(np.isnan(res.values[0][0]))

def test_absolute_change(self):
metric = operations.AbsoluteChange('grp', 'B', metrics.Sum('x'))
output = metric.compute_on(self.df)
Expand Down Expand Up @@ -1199,6 +1240,74 @@ def test_jackknife_with_operation_with_multiple_columns_display(self):
output = jk_change.compute_on(df)
output.display()

def test_log_percent_change_and_jackknife(self):
  df = pd.DataFrame({
      'x': [10.0, 20.0, 30.0, 40.0],
      'grp': ['Control', 'Control', 'Treatment', 'Treatment'],
      'cookie': [1, 2, 3, 4],
  })
  change = operations.LogPercentChange('grp', 'Control', metrics.Mean('x'))
  res = operations.Jackknife('cookie', change).compute_on(df)

  # mean(x) is 15 in Control and 35 in Treatment, so the ratio is 7/3 and
  # the reverse-transformed value is 100 * (7/3 - 1) = 400/3 ≈ 133.33.
  self.assertAlmostEqual(
      res.loc['Treatment', ('mean(x)', 'Value')], 100.0 * 4.0 / 3.0
  )

def test_log_percent_change_jackknife_with_confidence_reverses_log(self):
  df = pd.DataFrame({
      'x': [10.0, 20.0, 30.0, 40.0],
      'grp': ['Control', 'Control', 'Treatment', 'Treatment'],
      'cookie': [1, 2, 3, 4],
  })

  # With the default reverse_log=True, the point estimate and both CI
  # bounds should come back already mapped to the percent-change scale.
  change = operations.LogPercentChange('grp', 'Control', metrics.Mean('x'))
  transformed = operations.Jackknife(
      'cookie', change, confidence=0.95
  ).compute_on(df)

  # Disable the reversal manually to obtain the raw log-domain estimates.
  raw_change = operations.LogPercentChange(
      'grp', 'Control', metrics.Mean('x')
  )
  raw_change.reverse_log = False
  raw = operations.Jackknife(
      'cookie', raw_change, confidence=0.95
  ).compute_on(df)

  # Verify the inverse transform was applied to the Value and to both CI
  # bounds by comparing against a by-hand application to the raw results.
  inverse = change.get_post_processing_fn()
  for col in ('Value', 'Jackknife CI-lower', 'Jackknife CI-upper'):
    self.assertAlmostEqual(
        transformed.loc['Treatment', ('mean(x)', col)],
        inverse(raw.loc['Treatment', ('mean(x)', col)]),
    )

def test_operation_with_jackknife(self):
df = pd.DataFrame({
'x': range(1, 11),
Expand Down Expand Up @@ -2521,6 +2630,10 @@ def test_different_metrics_have_different_fingerprints(self):
operations.Bootstrap('x', n_replicates=10),
operations.Bootstrap('x', confidence=0.9),
operations.Bootstrap('x', confidence=0.95),
operations.LogTransform('x'),
operations.LogTransform('x', base='log10'),
operations.ReverseLogPercentTransform('x'),
operations.ReverseLogPercentTransform('x', base='log10'),
diversity.HHI('x'),
diversity.HHI('y'),
diversity.Entropy('x'),
Expand Down
Loading