From 298870b7ddc8908423aac717e3e7e396a8c5a92e Mon Sep 17 00:00:00 2001 From: Goran Jelic-Cizmek Date: Mon, 15 Jul 2024 20:51:44 +0200 Subject: [PATCH 01/13] Refactor to use more numpy functions internally --- src/data_morph/data/dataset.py | 3 + src/data_morph/data/stats.py | 23 +++--- src/data_morph/morpher.py | 66 +++++++++------ src/data_morph/plotting/static.py | 4 +- .../shapes/bases/line_collection.py | 80 ++++++------------- tests/data/test_stats.py | 6 +- tests/shapes/bases/test_line_collection.py | 6 -- tests/test_morpher.py | 4 +- 8 files changed, 94 insertions(+), 98 deletions(-) diff --git a/src/data_morph/data/dataset.py b/src/data_morph/data/dataset.py index b3a2c705..cd9ad1bb 100644 --- a/src/data_morph/data/dataset.py +++ b/src/data_morph/data/dataset.py @@ -50,6 +50,9 @@ def __init__( self.df: pd.DataFrame = self._validate_data(df).pipe(self._scale_data, scale) """pandas.DataFrame: DataFrame containing columns x and y.""" + self._x = self.df['x'].to_numpy() + self._y = self.df['y'].to_numpy() + self.name: str = name """str: The name to use for the dataset.""" diff --git a/src/data_morph/data/stats.py b/src/data_morph/data/stats.py index d3c52669..a8a3070c 100644 --- a/src/data_morph/data/stats.py +++ b/src/data_morph/data/stats.py @@ -1,8 +1,10 @@ """Utility functions for calculating summary statistics.""" from collections import namedtuple +from numbers import Number +from typing import Iterable -import pandas as pd +import numpy as np SummaryStatistics = namedtuple( 'SummaryStatistics', ['x_mean', 'y_mean', 'x_stdev', 'y_stdev', 'correlation'] @@ -12,14 +14,17 @@ ) -def get_values(df: pd.DataFrame) -> SummaryStatistics: +def get_values(x: Iterable[Number], y: Iterable[Number]) -> SummaryStatistics: """ Calculate the summary statistics for the given set of points. Parameters ---------- - df : pandas.DataFrame - A dataset with columns x and y. + x : Iterable[Number] + The ``x`` value of the dataset. 
+ + y : Iterable[Number] + The ``y`` value of the dataset. Returns ------- @@ -28,9 +33,9 @@ def get_values(df: pd.DataFrame) -> SummaryStatistics: along with the Pearson correlation coefficient between the two. """ return SummaryStatistics( - df.x.mean(), - df.y.mean(), - df.x.std(), - df.y.std(), - df.corr().x.y, + np.mean(x), + np.mean(y), + np.std(x, ddof=1), + np.std(y, ddof=1), + np.corrcoef(x, y)[0, 1], ) diff --git a/src/data_morph/morpher.py b/src/data_morph/morpher.py index e0d7da76..c108b400 100644 --- a/src/data_morph/morpher.py +++ b/src/data_morph/morpher.py @@ -3,7 +3,7 @@ from functools import partial from numbers import Number from pathlib import Path -from typing import Optional, Union +from typing import Iterable, MutableSequence, Optional, Union import numpy as np import pandas as pd @@ -239,16 +239,26 @@ def _record_frames( frame_number += 1 return frame_number - def _is_close_enough(self, df1: pd.DataFrame, df2: pd.DataFrame) -> bool: + def _is_close_enough( + self, + x1: Iterable[Number], + y1: Iterable[Number], + x2: Iterable[Number], + y2: Iterable[Number], + ) -> bool: """ Check whether the statistics are within the acceptable bounds. Parameters ---------- - df1 : pandas.DataFrame - The original DataFrame. - df2 : pandas.DataFrame - The DataFrame after the latest perturbation. + x1 : Iterable[Number] + The original value of ``x``. + y1 : Iterable[Number] + The original value of ``y``. + x2 : Iterable[Number] + The perturbed value of ``x``. + y2 : Iterable[Number] + The perturbed value of ``y``. 
Returns ------- @@ -258,10 +268,8 @@ def _is_close_enough(self, df1: pd.DataFrame, df2: pd.DataFrame) -> bool: return np.all( np.abs( np.subtract( - *( - np.floor(np.array(get_values(data)) * 10**self.decimals) - for data in [df1, df2] - ) + np.floor(np.array(get_values(x1, y1)) * 10**self.decimals), + np.floor(np.array(get_values(x2, y2)) * 10**self.decimals), ) ) == 0 @@ -269,21 +277,24 @@ def _is_close_enough(self, df1: pd.DataFrame, df2: pd.DataFrame) -> bool: def _perturb( self, - df: pd.DataFrame, + x: MutableSequence[Number], + y: MutableSequence[Number], target_shape: Shape, *, shake: Number, allowed_dist: Number, temp: Number, bounds: BoundingBox, - ) -> pd.DataFrame: + ) -> tuple[MutableSequence[Number], MutableSequence[Number]]: """ Perform one round of perturbation. Parameters ---------- - df : pandas.DataFrame - The data to perturb. + x : MutableSequence[Number] + The ``x`` part of the dataset. + y : MutableSequence[Number] + The ``y`` part of the dataset. target_shape : Shape The shape to morph the data into. shake : numbers.Number @@ -300,12 +311,12 @@ def _perturb( Returns ------- - pandas.DataFrame + tuple[MutableSequence[Number], MutableSequence[Number]] The input dataset with one point perturbed. 
""" - row = self._rng.integers(0, len(df)) - initial_x = df.at[row, 'x'] - initial_y = df.at[row, 'y'] + row = self._rng.integers(0, len(x)) + initial_x = x[row] + initial_y = y[row] # this is the simulated annealing step, if "do_bad", then we are willing to # accept a new state which is worse than the current one @@ -324,10 +335,10 @@ def _perturb( within_bounds = [new_x, new_y] in bounds done = close_enough and within_bounds - df.loc[row, 'x'] = new_x - df.loc[row, 'y'] = new_y + x[row] = new_x + y[row] = new_y - return df + return x, y def morph( self, @@ -468,11 +479,17 @@ def _tweening(frame, *, min_value, max_value): # numpydoc ignore=PR01,RT01 max_value=max_shake, ) + x, y = ( + start_shape.df['x'].to_numpy(copy=True), + start_shape.df['y'].to_numpy(copy=True), + ) + for i in self._looper( iterations, leave=True, ascii=True, desc=f'{target_shape} pattern' ): perturbed_data = self._perturb( - morphed_data.copy(), + np.copy(x), + np.copy(y), target_shape=target_shape, shake=get_current_shake(i), allowed_dist=allowed_dist, @@ -480,8 +497,9 @@ def _tweening(frame, *, min_value, max_value): # numpydoc ignore=PR01,RT01 bounds=start_shape.morph_bounds, ) - if self._is_close_enough(start_shape.df, perturbed_data): - morphed_data = perturbed_data + if self._is_close_enough(x, y, *perturbed_data): + x, y = perturbed_data + morphed_data = pd.DataFrame({'x': x, 'y': y}) frame_number = record_frames( data=morphed_data, diff --git a/src/data_morph/plotting/static.py b/src/data_morph/plotting/static.py index e91047ec..33d807c7 100644 --- a/src/data_morph/plotting/static.py +++ b/src/data_morph/plotting/static.py @@ -58,11 +58,11 @@ def plot( ax.xaxis.set_major_formatter(tick_formatter) ax.yaxis.set_major_formatter(tick_formatter) - res = get_values(df) + res = get_values(df['x'].to_numpy(), df['y'].to_numpy()) labels = ('X Mean', 'Y Mean', 'X SD', 'Y SD', 'Corr.') locs = np.linspace(0.8, 0.2, num=len(labels)) - max_label_length = max([len(label) for label in labels]) + 
max_label_length = max(len(label) for label in labels) max_stat = int(np.log10(np.max(np.abs(res)))) + 1 mean_x_digits, mean_y_digits = ( int(x) + 1 for x in np.log10(np.abs([res.x_mean, res.y_mean])) diff --git a/src/data_morph/shapes/bases/line_collection.py b/src/data_morph/shapes/bases/line_collection.py index ee155387..f1001572 100644 --- a/src/data_morph/shapes/bases/line_collection.py +++ b/src/data_morph/shapes/bases/line_collection.py @@ -4,6 +4,7 @@ from typing import Iterable import matplotlib.pyplot as plt +import numpy as np from matplotlib.axes import Axes from ...plotting.style import plot_with_custom_style @@ -44,65 +45,36 @@ def distance(self, x: Number, y: Number) -> float: float The minimum distance from the lines of this shape to the point (x, y). - """ - return min( - self._distance_point_to_line(point=(x, y), line=line) for line in self.lines - ) - - def _distance_point_to_line( - self, - point: Iterable[Number], - line: Iterable[Iterable[Number]], - ) -> float: - """ - Calculate the minimum distance between a point and a line. - - Parameters - ---------- - point : Iterable[numbers.Number] - Coordinates of a point in 2D space. - line : Iterable[Iterable[numbers.Number]] - Coordinates of the endpoints of a line in 2D space. - - Returns - ------- - float - The minimum distance between the point and the line. Notes ----- - Implementation based on `this VBA code`_. + Implementation based on `this SO answer`_. - .. _this VBA code: http://local.wasp.uwa.edu.au/~pbourke/geometry/pointline/source.vba + .. 
_this SO answer: https://stackoverflow.com/a/58781995 """ - start, end = line - line_mag = self._euclidean_distance(start, end) - - if line_mag < 0.00000001: - # Arbitrarily large value - return 9999 - - px, py = point - x1, y1 = start - x2, y2 = end - - u1 = ((px - x1) * (x2 - x1)) + ((py - y1) * (y2 - y1)) - u = u1 / (line_mag * line_mag) - - if (u < 0.00001) or (u > 1): - # closest point does not fall within the line segment, take the shorter - # distance to an endpoint - distance = min( - self._euclidean_distance(point, start), - self._euclidean_distance(point, end), - ) - else: - # Intersecting point is on the line, use the formula - ix = x1 + u * (x2 - x1) - iy = y1 + u * (y2 - y1) - distance = self._euclidean_distance(point, (ix, iy)) - - return distance + p = np.array([x, y]) + lines = np.array(self.lines) + + a = lines[:, 0, :] + b = lines[:, 1, :] + + d_ba = b - a + d = np.divide(d_ba, (np.hypot(d_ba[:, 0], d_ba[:, 1]).reshape(-1, 1))) + + # signed parallel distance components + # rowwise dot products of 2D vectors + s = np.multiply(a - p, d).sum(axis=1) + t = np.multiply(p - b, d).sum(axis=1) + + # clamped parallel distance + h = np.maximum.reduce([s, t, np.zeros(len(s))]) + + # perpendicular distance component + # rowwise cross products of 2D vectors + d_pa = p - a + c = d_pa[:, 0] * d[:, 1] - d_pa[:, 1] * d[:, 0] + + return np.min(np.hypot(h, c)) @plot_with_custom_style def plot(self, ax: Axes = None) -> Axes: diff --git a/tests/data/test_stats.py b/tests/data/test_stats.py index c99134ed..e2d3d901 100644 --- a/tests/data/test_stats.py +++ b/tests/data/test_stats.py @@ -1,5 +1,7 @@ """Test the stats module.""" +import numpy as np + from data_morph.data.loader import DataLoader from data_morph.data.stats import get_values @@ -9,10 +11,10 @@ def test_stats(): data = DataLoader.load_dataset('dino').df - stats = get_values(data) + stats = get_values(data['x'], data['y']) assert stats.x_mean == data.x.mean() assert stats.y_mean == data.y.mean() assert 
stats.x_stdev == data.x.std() assert stats.y_stdev == data.y.std() - assert stats.correlation == data.corr().x.y + np.allclose(stats.correlation, data.corr().x.y) diff --git a/tests/shapes/bases/test_line_collection.py b/tests/shapes/bases/test_line_collection.py index 6b356552..1105e58d 100644 --- a/tests/shapes/bases/test_line_collection.py +++ b/tests/shapes/bases/test_line_collection.py @@ -34,12 +34,6 @@ def test_distance_nonzero(self, line_collection, point, expected_distance): """Test the distance() method on points not on lines in the collection.""" assert pytest.approx(line_collection.distance(*point)) == expected_distance - @pytest.mark.parametrize('line', [[(0, 0), (0, 0)], [(-1, -1), (-1, -1)]], ids=str) - def test_distance_to_small_line_magnitude(self, line_collection, line): - """Test _distance_point_to_line() for small line magnitudes.""" - distance = line_collection._distance_point_to_line((30, 50), line) - assert distance == 9999 - def test_repr(self, line_collection): """Test that the __repr__() method is working.""" lines = r'\n '.join( diff --git a/tests/test_morpher.py b/tests/test_morpher.py index 471f9a1f..e5f3142a 100644 --- a/tests/test_morpher.py +++ b/tests/test_morpher.py @@ -172,7 +172,9 @@ def test_no_writing(self, capsys): with pytest.raises(AssertionError): assert_frame_equal(morphed_data, dataset.df) - assert morpher._is_close_enough(dataset.df, morphed_data) + assert morpher._is_close_enough( + dataset.df['x'], dataset.df['y'], morphed_data['x'], morphed_data['y'] + ) _, err = capsys.readouterr() assert f'{target_shape} pattern: 100%' in err From a52b25d918c27a7ddd6c4ef863f90d3abff0ca8c Mon Sep 17 00:00:00 2001 From: Goran Jelic-Cizmek Date: Sat, 20 Jul 2024 15:50:37 +0200 Subject: [PATCH 02/13] Add `shifted_*` functions and `Statistics` class --- src/data_morph/data/stats.py | 215 +++++++++++++++++++++++++++++++++++ tests/data/test_stats.py | 166 ++++++++++++++++++++++++++- 2 files changed, 380 insertions(+), 1 deletion(-) diff 
--git a/src/data_morph/data/stats.py b/src/data_morph/data/stats.py index a8a3070c..e37b4fcd 100644 --- a/src/data_morph/data/stats.py +++ b/src/data_morph/data/stats.py @@ -14,6 +14,221 @@ ) +def shifted_mean( + mean_old: float, + value_old: float, + value_new: float, + size: int, +) -> float: + """ + Return the shifted mean from the perturbed data. + """ + return (mean_old - value_old / size) + value_new / size + + +def shifted_var( + mean_old: float, + var_old: float, + value_old: float, + delta: float, + size: int, + *, + ddof: int = 0, +) -> float: + """ + Return the shifted covariance from the perturbed data. + """ + return ( + var_old + + 2 * delta * (value_old - mean_old) / (size - ddof) + + delta**2 * (1 / (size - ddof) - 1 / (size - ddof) / size) + ) + + +def shifted_stdev(*args, **kwargs): + return np.sqrt(shifted_var(*args, **kwargs)) + + +def shifted_corrcoef( + x_old: float, + y_old: float, + x_new: float, + y_new: float, + meanx_old: float, # the mean + meany_old: float, # the mean + xy_old: float, # the mean + varx_old: float, # the variance - ^2 + vary_old: float, # the variance - ^2 + size: int, +) -> float: + """ + Return the shifted correlation coefficient of the perturbed data. 
+ """ + deltax = x_new - x_old + deltay = y_new - y_old + + numerator = ( + xy_old + + (deltax * y_old + deltay * x_old + deltax * deltay) / size + - shifted_mean(mean_old=meanx_old, value_old=x_old, value_new=x_new, size=size) + * shifted_mean(mean_old=meany_old, value_old=y_old, value_new=y_new, size=size) + ) + + denominator = np.sqrt( + shifted_var( + mean_old=meanx_old, + var_old=varx_old, + value_old=x_old, + delta=deltax, + size=size, + ) + * shifted_var( + mean_old=meany_old, + var_old=vary_old, + value_old=y_old, + delta=deltay, + size=size, + ) + ) + + return numerator / denominator + + +class Statistics: + def __init__(self, x: Iterable[Number], y: Iterable[Number]): + if len(x) != len(y): + raise ValueError('The two datasets should have the same size') + + self._x = np.copy(x) + self._y = np.copy(y) + self._size = len(self._x) + self._x_mean = np.mean(self._x) + self._y_mean = np.mean(self._y) + self._x_var = np.var(self._x, ddof=0) + self._x_stdev = np.sqrt(self._x_var) + self._y_var = np.var(self._y, ddof=0) + self._y_stdev = np.sqrt(self._y_var) + self._corrcoef = np.corrcoef(self._x, self._y)[0, 1] + self._xy_mean = np.mean(self._x * self._y) + + @property + def x_mean(self): + """ + Return the mean of the ``x`` data. + """ + return self._x_mean + + @property + def y_mean(self): + """ + Return the mean of the ``y`` data. + """ + return self._y_mean + + @property + def x_stdev(self): + """ + Return the std of the ``x`` data. + """ + return self._x_stdev + + @property + def y_stdev(self): + """ + Return the std of the ``y`` data. + """ + return self._y_stdev + + @property + def corrcoef(self): + """ + Return the correlation coefficient of the ``x`` and ``y`` data. + """ + return self._corrcoef + + def __len__(self): + """ + Return the size of the dataset. 
+ """ + return len(self._x) + + def perturb( + self, + index: int, + deltax: float, + deltay: float, + *, + update: bool = False, + ) -> SummaryStatistics: + """ + Perturb a single point and return the new ``SummaryStatistics``. + """ + x_mean = shifted_mean( + mean_old=self.x_mean, + value_old=self._x[index], + value_new=self._x[index] + deltax, + size=len(self), + ) + y_mean = shifted_mean( + mean_old=self.y_mean, + value_old=self._y[index], + value_new=self._y[index] + deltay, + size=len(self), + ) + + x_var = shifted_var( + mean_old=self.x_mean, + var_old=self._x_var, + value_old=self._x[index], + delta=deltax, + size=len(self), + ) + + y_var = shifted_var( + mean_old=self.y_mean, + var_old=self._y_var, + value_old=self._y[index], + delta=deltay, + size=len(self), + ) + + corrcoef = shifted_corrcoef( + x_old=self._x[index], + y_old=self._y[index], + x_new=self._x[index] + deltax, + y_new=self._y[index] + deltay, + meanx_old=self.x_mean, + meany_old=self.y_mean, + xy_old=self._xy_mean, + varx_old=self._x_var, + vary_old=self._y_var, + size=len(self), + ) + + if update: + self._x_mean = x_mean + self._y_mean = y_mean + self._x_var = x_var + self._y_var = y_var + self._x_stdev = np.sqrt(x_var) + self._y_stdev = np.sqrt(y_var) + self._corrcoef = corrcoef + self._xy_mean += ( + deltax * self._y[index] + deltay * self._x[index] + deltax * deltay + ) / len(self) + + self._x[index] += deltax + self._y[index] += deltay + + return SummaryStatistics( + x_mean, + y_mean, + np.sqrt(x_var), + np.sqrt(y_var), + corrcoef, + ) + + def get_values(x: Iterable[Number], y: Iterable[Number]) -> SummaryStatistics: """ Calculate the summary statistics for the given set of points. 
diff --git a/tests/data/test_stats.py b/tests/data/test_stats.py index e2d3d901..74eefac1 100644 --- a/tests/data/test_stats.py +++ b/tests/data/test_stats.py @@ -1,9 +1,15 @@ """Test the stats module.""" import numpy as np +from numpy.testing import assert_allclose, assert_equal from data_morph.data.loader import DataLoader -from data_morph.data.stats import get_values +from data_morph.data.stats import ( + get_values, + shifted_mean, + shifted_var, + shifted_corrcoef, +) def test_stats(): @@ -18,3 +24,161 @@ def test_stats(): assert stats.x_stdev == data.x.std() assert stats.y_stdev == data.y.std() np.allclose(stats.correlation, data.corr().x.y) + + +def test_new_mean(): + data = DataLoader.load_dataset('dino').df + + # make sure if we don't do anything to the data that we retrieve the same results + x = data['x'].to_numpy() + + assert_equal(np.mean(x), shifted_mean(np.mean(x), x[0], x[0], len(x))) + + # we want to test both very large and very small displacements + for scale in [0.1, 10]: + x_old = data['x'].to_numpy() + y_old = data['y'].to_numpy() + + # scaling the data + x_old /= np.max(np.abs(x_old)) + y_old /= np.max(np.abs(y_old)) + + x_new = np.copy(x_old) + y_new = np.copy(y_old) + + rng = np.random.default_rng(42) + + for _ in range(100_000): + row = rng.integers(0, len(x_old)) + jitter_x, jitter_y = rng.normal(loc=0, scale=scale, size=2) + + x_new[row] += jitter_x + y_new[row] += jitter_y + + meanx = np.mean(x_new) + new_meanx = shifted_mean(np.mean(x_old), x_old[row], x_new[row], len(x_old)) + + meany = np.mean(y_new) + new_meany = shifted_mean(np.mean(y_old), y_old[row], y_new[row], len(y_old)) + + assert_allclose(meanx, new_meanx) + assert_allclose(meany, new_meany) + + x_old = np.copy(x_new) + y_old = np.copy(y_new) + + +def test_new_var(): + data = DataLoader.load_dataset('dino').df + + # make sure if we don't do anything to the data that we retrieve the same results + x = data['x'].to_numpy() + + assert_equal(np.var(x, ddof=0), 
shifted_var(np.mean(x), np.var(x), x[0], 0, len(x))) + + # we want to test both very large and very small displacements + for scale in [0.1, 10]: + x_old = data['x'].to_numpy() + y_old = data['y'].to_numpy() + + # scaling the data + x_old /= np.max(np.abs(x_old)) + y_old /= np.max(np.abs(y_old)) + + x_new = np.copy(x_old) + y_new = np.copy(y_old) + + rng = np.random.default_rng(42) + + for _ in range(100_000): + row = rng.integers(0, len(x_old)) + jitter_x, jitter_y = rng.normal(loc=0, scale=scale, size=2) + + x_new[row] += jitter_x + y_new[row] += jitter_y + + varx = np.var(x_new) + new_varx = shifted_var( + np.mean(x_old), np.var(x_old), x_old[row], jitter_x, len(x_old) + ) + + vary = np.var(y_new) + new_vary = shifted_var( + np.mean(y_old), np.var(y_old), y_old[row], jitter_y, len(y_old) + ) + + assert_allclose(varx, new_varx) + assert_allclose(vary, new_vary) + + x_old = np.copy(x_new) + y_old = np.copy(y_new) + + +def test_new_corrcoef(): + data = DataLoader.load_dataset('dino').df + + # make sure if we don't do anything to the data that we retrieve the same results + x = data['x'].to_numpy() + y = data['y'].to_numpy() + corrcoef = np.corrcoef(x, y)[0, 1] + row = 0 + new_corrcoef = shifted_corrcoef( + x_old=x[row], + y_old=y[row], + x_new=x[row], + y_new=y[row], + meanx_old=np.mean(x), + meany_old=np.mean(y), + xy_old=np.mean(x * y), + varx_old=np.var(x), + vary_old=np.var(y), + size=len(x), + ) + + corrcoef_by_hand = np.cov(x, y, ddof=0) / np.sqrt(np.var(x) * np.var(y)) + + assert_allclose(corrcoef, corrcoef_by_hand[0, 1]) + assert_allclose(corrcoef_by_hand[0, 1], new_corrcoef) + + # we want to test both very large and very small displacements + for scale in [0.1, 10]: + x_old = data['x'].to_numpy() + y_old = data['y'].to_numpy() + + # scaling the data + x_old /= np.max(np.abs(x_old)) + y_old /= np.max(np.abs(y_old)) + + x_new = np.copy(x_old) + y_new = np.copy(y_old) + + rng = np.random.default_rng(42) + + for _ in range(100_000): + row = rng.integers(0, 
len(x_old)) + jitter_x, jitter_y = rng.normal(loc=0, scale=scale, size=2) + + x_new[row] += jitter_x + y_new[row] += jitter_y + + # corrcoef = np.corrcoef(x_new, y_new)[0, 1] + corrcoef = ( + np.cov(x_new, y_new, ddof=0) / np.sqrt(np.var(x_new) * np.var(y_new)) + )[0, 1] + new_corrcoef = shifted_corrcoef( + x_old=x_old[row], + y_old=y_old[row], + x_new=x_new[row], + y_new=y_new[row], + meanx_old=np.mean(x_old), + meany_old=np.mean(y_old), + xy_old=np.mean(x_old * y_old), + varx_old=np.var(x_old), + vary_old=np.var(y_old), + size=len(x_old), + ) + + assert_allclose(corrcoef, new_corrcoef) + + x_old = np.copy(x_new) + y_old = np.copy(y_new) From 2a805976eafd4fe0cf7410b364b27f5aa6767c8f Mon Sep 17 00:00:00 2001 From: Goran Jelic-Cizmek Date: Sat, 20 Jul 2024 15:52:06 +0200 Subject: [PATCH 03/13] Actually use the cached statistics --- src/data_morph/morpher.py | 56 ++++++++++++++++++++++++--------------- tests/data/test_stats.py | 2 +- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/src/data_morph/morpher.py b/src/data_morph/morpher.py index c108b400..58f1dbaa 100644 --- a/src/data_morph/morpher.py +++ b/src/data_morph/morpher.py @@ -11,7 +11,7 @@ from .bounds.bounding_box import BoundingBox from .data.dataset import Dataset -from .data.stats import get_values +from .data.stats import Statistics, SummaryStatistics from .plotting.animation import ( ease_in_out_quadratic, ease_in_out_sine, @@ -241,24 +241,20 @@ def _record_frames( def _is_close_enough( self, - x1: Iterable[Number], - y1: Iterable[Number], - x2: Iterable[Number], - y2: Iterable[Number], + item1: SummaryStatistics, + item2: SummaryStatistics, + /, ) -> bool: """ Check whether the statistics are within the acceptable bounds. Parameters ---------- - x1 : Iterable[Number] - The original value of ``x``. - y1 : Iterable[Number] - The original value of ``y``. - x2 : Iterable[Number] - The perturbed value of ``x``. - y2 : Iterable[Number] - The perturbed value of ``y``. 
+ item1: SummaryStatistics + the first summary statistic + + item2: SummaryStatistics + the second summary statistic Returns ------- @@ -268,8 +264,8 @@ def _is_close_enough( return np.all( np.abs( np.subtract( - np.floor(np.array(get_values(x1, y1)) * 10**self.decimals), - np.floor(np.array(get_values(x2, y2)) * 10**self.decimals), + np.floor(np.array(item1) * 10**self.decimals), + np.floor(np.array(item2) * 10**self.decimals), ) ) == 0 @@ -285,7 +281,7 @@ def _perturb( allowed_dist: Number, temp: Number, bounds: BoundingBox, - ) -> tuple[MutableSequence[Number], MutableSequence[Number]]: + ) -> tuple[int, MutableSequence[Number], MutableSequence[Number]]: """ Perform one round of perturbation. @@ -311,8 +307,8 @@ def _perturb( Returns ------- - tuple[MutableSequence[Number], MutableSequence[Number]] - The input dataset with one point perturbed. + tuple[int, MutableSequence[Number], MutableSequence[Number]] + The index and input dataset with one point perturbed. """ row = self._rng.integers(0, len(x)) initial_x = x[row] @@ -338,7 +334,7 @@ def _perturb( x[row] = new_x y[row] = new_y - return x, y + return row, x, y def morph( self, @@ -484,10 +480,16 @@ def _tweening(frame, *, min_value, max_value): # numpydoc ignore=PR01,RT01 start_shape.df['y'].to_numpy(copy=True), ) + # the starting dataset statistics + stats = Statistics(x, y) + + # the summary statistics of the above + summary_stats = stats.perturb(0, 0, 0) + for i in self._looper( iterations, leave=True, ascii=True, desc=f'{target_shape} pattern' ): - perturbed_data = self._perturb( + index, *perturbed_data = self._perturb( np.copy(x), np.copy(y), target_shape=target_shape, @@ -497,8 +499,20 @@ def _tweening(frame, *, min_value, max_value): # numpydoc ignore=PR01,RT01 bounds=start_shape.morph_bounds, ) - if self._is_close_enough(x, y, *perturbed_data): + new_summary_stats = stats.perturb( + index, + perturbed_data[0][index] - x[index], + perturbed_data[1][index] - y[index], + ) + + if 
self._is_close_enough(summary_stats, new_summary_stats): x, y = perturbed_data + summary_stats = stats.perturb( + index, + perturbed_data[0][index] - x[index], + perturbed_data[1][index] - y[index], + update=True, + ) morphed_data = pd.DataFrame({'x': x, 'y': y}) frame_number = record_frames( diff --git a/tests/data/test_stats.py b/tests/data/test_stats.py index 74eefac1..b1a12121 100644 --- a/tests/data/test_stats.py +++ b/tests/data/test_stats.py @@ -6,9 +6,9 @@ from data_morph.data.loader import DataLoader from data_morph.data.stats import ( get_values, + shifted_corrcoef, shifted_mean, shifted_var, - shifted_corrcoef, ) From 58f6d7c28b416767260c91feec5738892783ca50 Mon Sep 17 00:00:00 2001 From: Goran Jelic-Cizmek Date: Sat, 20 Jul 2024 16:13:22 +0200 Subject: [PATCH 04/13] Remove unused comment --- tests/data/test_stats.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/data/test_stats.py b/tests/data/test_stats.py index b1a12121..45f16d84 100644 --- a/tests/data/test_stats.py +++ b/tests/data/test_stats.py @@ -161,7 +161,6 @@ def test_new_corrcoef(): x_new[row] += jitter_x y_new[row] += jitter_y - # corrcoef = np.corrcoef(x_new, y_new)[0, 1] corrcoef = ( np.cov(x_new, y_new, ddof=0) / np.sqrt(np.var(x_new) * np.var(y_new)) )[0, 1] From 9635da5f14fe43d71ebef5ee1c487cbf05732db3 Mon Sep 17 00:00:00 2001 From: Goran Jelic-Cizmek Date: Sat, 20 Jul 2024 16:27:07 +0200 Subject: [PATCH 05/13] Use `value_new` instead of `delta` for `shifted_var` --- src/data_morph/data/stats.py | 14 +++++++------- tests/data/test_stats.py | 8 +++++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/data_morph/data/stats.py b/src/data_morph/data/stats.py index e37b4fcd..2b442493 100644 --- a/src/data_morph/data/stats.py +++ b/src/data_morph/data/stats.py @@ -30,7 +30,7 @@ def shifted_var( mean_old: float, var_old: float, value_old: float, - delta: float, + value_new: float, size: int, *, ddof: int = 0, @@ -40,8 +40,8 @@ def shifted_var( """ return ( var_old - 
+ 2 * delta * (value_old - mean_old) / (size - ddof) - + delta**2 * (1 / (size - ddof) - 1 / (size - ddof) / size) + + 2 * (value_new - value_old) * (value_old - mean_old) / (size - ddof) + + (value_new - value_old) ** 2 * (1 / (size - ddof) - 1 / (size - ddof) / size) ) @@ -79,14 +79,14 @@ def shifted_corrcoef( mean_old=meanx_old, var_old=varx_old, value_old=x_old, - delta=deltax, + value_new=x_new, size=size, ) * shifted_var( mean_old=meany_old, var_old=vary_old, value_old=y_old, - delta=deltay, + value_new=y_new, size=size, ) ) @@ -180,7 +180,7 @@ def perturb( mean_old=self.x_mean, var_old=self._x_var, value_old=self._x[index], - delta=deltax, + value_new=self._x[index] + deltax, size=len(self), ) @@ -188,7 +188,7 @@ def perturb( mean_old=self.y_mean, var_old=self._y_var, value_old=self._y[index], - delta=deltay, + value_new=self._y[index] + deltay, size=len(self), ) diff --git a/tests/data/test_stats.py b/tests/data/test_stats.py index 45f16d84..0caf7446 100644 --- a/tests/data/test_stats.py +++ b/tests/data/test_stats.py @@ -74,7 +74,9 @@ def test_new_var(): # make sure if we don't do anything to the data that we retrieve the same results x = data['x'].to_numpy() - assert_equal(np.var(x, ddof=0), shifted_var(np.mean(x), np.var(x), x[0], 0, len(x))) + assert_equal( + np.var(x, ddof=0), shifted_var(np.mean(x), np.var(x), x[0], x[0], len(x)) + ) # we want to test both very large and very small displacements for scale in [0.1, 10]: @@ -99,12 +101,12 @@ def test_new_var(): varx = np.var(x_new) new_varx = shifted_var( - np.mean(x_old), np.var(x_old), x_old[row], jitter_x, len(x_old) + np.mean(x_old), np.var(x_old), x_old[row], x_new[row], len(x_old) ) vary = np.var(y_new) new_vary = shifted_var( - np.mean(y_old), np.var(y_old), y_old[row], jitter_y, len(y_old) + np.mean(y_old), np.var(y_old), y_old[row], y_new[row], len(y_old) ) assert_allclose(varx, new_varx) From c683af719a1cb1478d512e915112d7668997d0a8 Mon Sep 17 00:00:00 2001 From: Goran Jelic-Cizmek Date: Sun, 
21 Jul 2024 13:34:53 +0200 Subject: [PATCH 06/13] Fix bug in implementation of `morph` Order of operations was wrong before, causing incorrect outputs. --- src/data_morph/morpher.py | 4 ++-- tests/test_morpher.py | 9 ++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/data_morph/morpher.py b/src/data_morph/morpher.py index 58f1dbaa..2601422e 100644 --- a/src/data_morph/morpher.py +++ b/src/data_morph/morpher.py @@ -3,7 +3,7 @@ from functools import partial from numbers import Number from pathlib import Path -from typing import Iterable, MutableSequence, Optional, Union +from typing import MutableSequence, Optional, Union import numpy as np import pandas as pd @@ -506,13 +506,13 @@ def _tweening(frame, *, min_value, max_value): # numpydoc ignore=PR01,RT01 ) if self._is_close_enough(summary_stats, new_summary_stats): - x, y = perturbed_data summary_stats = stats.perturb( index, perturbed_data[0][index] - x[index], perturbed_data[1][index] - y[index], update=True, ) + x, y = perturbed_data morphed_data = pd.DataFrame({'x': x, 'y': y}) frame_number = record_frames( diff --git a/tests/test_morpher.py b/tests/test_morpher.py index e5f3142a..4ce710a2 100644 --- a/tests/test_morpher.py +++ b/tests/test_morpher.py @@ -11,6 +11,7 @@ from pandas.testing import assert_frame_equal from data_morph.data.loader import DataLoader +from data_morph.data.stats import Statistics from data_morph.morpher import DataMorpher from data_morph.shapes.factory import ShapeFactory @@ -170,10 +171,16 @@ def test_no_writing(self, capsys): freeze_for=0, ) + stats = Statistics(dataset.df['x'], dataset.df['y']).perturb(0, 0, 0) + perturbed_stats = Statistics(morphed_data['x'], morphed_data['y']).perturb( + 0, 0, 0 + ) + with pytest.raises(AssertionError): assert_frame_equal(morphed_data, dataset.df) assert morpher._is_close_enough( - dataset.df['x'], dataset.df['y'], morphed_data['x'], morphed_data['y'] + stats, + perturbed_stats, ) _, err = capsys.readouterr() From 
448c8453f32469403e01d5b98d6d2c9b697444d2 Mon Sep 17 00:00:00 2001 From: Goran Jelic-Cizmek Date: Sun, 21 Jul 2024 13:58:24 +0200 Subject: [PATCH 07/13] Updated all docstrings accordingly --- src/data_morph/data/stats.py | 169 ++++++++++++++++++++++++++++++++--- 1 file changed, 155 insertions(+), 14 deletions(-) diff --git a/src/data_morph/data/stats.py b/src/data_morph/data/stats.py index 2b442493..4ea5c500 100644 --- a/src/data_morph/data/stats.py +++ b/src/data_morph/data/stats.py @@ -21,7 +21,23 @@ def shifted_mean( size: int, ) -> float: """ - Return the shifted mean from the perturbed data. + Return the shifted mean by perturbing one point. + + Parameters + ---------- + mean_old : float + The old value of the mean of the data. + value_old : float + The old value of the point (before perturbation). + value_new : float + The new value of the point (after perturbation). + size : int + The size of the dataset. + + Returns + ------- + float + The new value of the mean of the data. """ return (mean_old - value_old / size) + value_new / size @@ -33,10 +49,32 @@ def shifted_var( value_new: float, size: int, *, - ddof: int = 0, + ddof: float = 0, ) -> float: """ - Return the shifted covariance from the perturbed data. + Compute the shifted variance by perturbing one point. + + Parameters + ---------- + mean_old : float + The old value of the mean of the data. + var_old : float + The old value of the variance of the data. + value_old : float + The old value of the point (before perturbation). + value_new : float + The new value of the point (after perturbation). + size : int + The size of the dataset. + ddof : float, optional + “Delta Degrees of Freedom”: the divisor used in the calculation is ``N + - ddof``, where ``N`` represents the number of elements. By default + ``ddof`` is zero. + + Returns + ------- + float + The new value of the covariance of the data. 
def shifted_stdev(*args, **kwargs):
    """
    Compute the shifted standard deviation by perturbing one point.

    Parameters
    ----------
    *args
        The positional arguments passed to :func:`shifted_var`.
    **kwargs
        The keyword arguments passed to :func:`shifted_var`.

    Returns
    -------
    float
        The new value of the standard deviation of the data.
    """
    # The standard deviation is simply the square root of the incrementally
    # updated variance.
    return np.sqrt(shifted_var(*args, **kwargs))
+ + Parameters + ---------- + x : iterable of float + The ``x`` value of the data as an iterable. + y : iterable of float + The ``y`` value of the data as an iterable. + """ + def __init__(self, x: Iterable[Number], y: Iterable[Number]): if len(x) != len(y): raise ValueError('The two datasets should have the same size') @@ -112,43 +204,73 @@ def __init__(self, x: Iterable[Number], y: Iterable[Number]): self._xy_mean = np.mean(self._x * self._y) @property - def x_mean(self): + def x_mean(self) -> float: """ Return the mean of the ``x`` data. + + Returns + ------- + float + The mean of the ``x`` data. """ return self._x_mean @property - def y_mean(self): + def y_mean(self) -> float: """ Return the mean of the ``y`` data. + + Returns + ------- + float + The mean of the ``y`` data. """ return self._y_mean @property - def x_stdev(self): + def x_stdev(self) -> float: """ Return the std of the ``x`` data. + + Returns + ------- + float + The standard deviation of the ``x`` data. """ return self._x_stdev @property - def y_stdev(self): + def y_stdev(self) -> float: """ Return the std of the ``y`` data. + + Returns + ------- + float + The standard deviation of the ``y`` data. """ return self._y_stdev @property - def corrcoef(self): + def corrcoef(self) -> float: """ Return the correlation coefficient of the ``x`` and ``y`` data. + + Returns + ------- + float + The correlation coefficient between ``x`` and ``y`` data. """ return self._corrcoef def __len__(self): """ Return the size of the dataset. + + Returns + ------- + int + The size of the dataset. """ return len(self._x) @@ -162,6 +284,25 @@ def perturb( ) -> SummaryStatistics: """ Perturb a single point and return the new ``SummaryStatistics``. + + Parameters + ---------- + index : int + The index of the point we wish to perturb. + + deltax : float + The amount by which to perturb the ``x`` point. + + deltay : float + The amount by which to perturb the ``y`` point. 
def _create_median_avltree(data, /) -> tuple[AvlTree, AvlTree]:
    """
    Build the low/high AVL-tree pair used for median tracking.

    Parameters
    ----------
    data
        The input data as an iterable.

    Returns
    -------
    tuple[AvlTree, AvlTree]
        The low and high AVL trees, holding the first ``len(data) // 2``
        elements and the remaining elements, respectively.

    Notes
    -----
    NOTE(review): the positional split is only meaningful if ``data`` is
    already sorted, and AVL keys collapse duplicate values -- confirm both
    assumptions against the callers.
    """
    half = len(data) // 2
    low_tree = AvlTree()
    high_tree = AvlTree()
    for position in range(half):
        low_tree[data[position]] = None
    for position in range(half, len(data)):
        high_tree[data[position]] = None
    return low_tree, high_tree


def shifted_median(
    xlow: AvlTree,
    xhigh: AvlTree,
    value_old: float,
    value_new: float,
) -> float:
    """
    Compute the shifted median using AVL trees.

    Parameters
    ----------
    xlow : AvlTree
        The lower part of the AVL tree (below the median).
    xhigh : AvlTree
        The higher part of the AVL tree (above the median).
    value_old : float
        The old value of the point (before perturbation).
    value_new : float
        The new value of the point (after perturbation).

    Returns
    -------
    float
        The new value of the median of the data.

    Notes
    -----
    Modifies ``xlow`` and ``xhigh`` in place.  Every branch removes exactly
    one key and inserts exactly one key per tree, so the tree sizes are
    preserved and the median can be read off the boundary afterwards.
    """
    low_max = xlow.maximum()
    high_min = xhigh.minimum()

    if value_old <= low_max:
        # The perturbed point currently lives in the low tree.
        del xlow[value_old]
        if value_new <= low_max:
            # It stays on the low side.
            xlow[value_new] = None
        else:
            # It crosses to the high side; send the smallest high key back
            # down so the tree sizes stay balanced.
            xhigh[value_new] = None
            smallest_high = xhigh.minimum()
            del xhigh[smallest_high]
            xlow[smallest_high] = None
    elif value_old >= high_min:
        # The perturbed point currently lives in the high tree.
        if value_new <= high_min:
            # It crosses to the low side; push the largest low key up to
            # keep the tree sizes balanced.
            del xhigh[value_old]
            xlow[value_new] = None
            largest_low = xlow.maximum()
            del xlow[largest_low]
            xhigh[largest_low] = None
        else:
            # It stays on the high side.
            del xhigh[value_old]
            xhigh[value_new] = None

    # The median is now readable from the boundary between the two trees.
    if len(xlow) == len(xhigh):
        return (xlow.maximum() + xhigh.minimum()) / 2

    return xhigh.minimum()
+ """ + data = np.sort(np.array(data, dtype=float)) + size = len(data) + xlow, xhigh = _create_median_avltree(data) + assert_equal(data, np.concatenate([list(xlow), list(xhigh)])) + # make sure it works if we don't do anything + ref = np.median(data) + actual = shifted_median(xlow, xhigh, data[0], data[0]) + assert_equal(len(data), len(xlow) + len(xhigh)) + assert_equal(ref, actual) + + # move lower part of data into itself + x = np.copy(data) + xlow, xhigh = _create_median_avltree(data) + index = size // 2 - 1 + x[index] -= 0.3 + ref = np.median(x) + actual = shifted_median(xlow, xhigh, data[index], x[index]) + assert_equal(len(x), len(xlow) + len(xhigh)) + assert_equal(ref, actual) + + # move higher part of data into itself + x = np.copy(data) + xlow, xhigh = _create_median_avltree(data) + index = size // 2 + 1 + x[index] += 0.3 + ref = np.median(x) + actual = shifted_median(xlow, xhigh, data[index], x[index]) + assert_equal(len(x), len(xlow) + len(xhigh)) + assert_equal(ref, actual) + + # move higher part of data into lower part + x = np.copy(data) + xlow, xhigh = _create_median_avltree(data) + index = size // 2 + 1 + x[index] -= 5.2 + ref = np.median(x) + actual = shifted_median(xlow, xhigh, data[index], x[index]) + assert_equal(len(x), len(xlow) + len(xhigh)) + assert_equal(ref, actual) + + # move lower part of data into higher part + x = np.copy(data) + xlow, xhigh = _create_median_avltree(data) + index = size // 2 - 1 + x[index] += 5.2 + ref = np.median(x) + actual = shifted_median(xlow, xhigh, data[index], x[index]) + assert_equal(len(x), len(xlow) + len(xhigh)) + assert_equal(ref, actual) From fce745d61b4753b885a542c6180ba67ef0f34928 Mon Sep 17 00:00:00 2001 From: Goran Jelic-Cizmek Date: Sun, 15 Sep 2024 13:15:50 +0200 Subject: [PATCH 09/13] Add median to cached statistics --- src/data_morph/data/stats.py | 274 +++++++++++++++++++++++++++-------- tests/data/test_stats.py | 14 +- 2 files changed, 223 insertions(+), 65 deletions(-) diff --git 
class SortedCounter:
    """
    A version of ``collections.Counter`` that is ordered.

    Notes
    -----
    The time complexity of lookup, insertion, and deletion is O(log n).
    """

    def __init__(self, iterable: Optional[Iterable] = None, /):
        """
        Create a ``SortedCounter`` from an iterable or a mapping.

        Parameters
        ----------
        iterable : Iterable or Mapping, optional
            The values to insert.  A mapping is interpreted as
            ``{value: multiplicity}``; any other iterable contributes each
            element once per occurrence.
        """
        self._tree = AvlTree()
        # internal counter for how many elements are there (= size of
        # container); kept up to date by add()/remove()
        self._counter = 0

        if iterable is not None:
            # NOTE: every Mapping is also an Iterable, so the Mapping check
            # must come first -- otherwise a dict would be consumed as a
            # plain iterable of keys and the multiplicities would be lost.
            if isinstance(iterable, Mapping):
                for key, value in iterable.items():
                    self.add(key, times=value)
            elif isinstance(iterable, Iterable):
                for item in iterable:
                    self.add(item)
            else:
                raise TypeError(
                    f'Type {type(iterable)} not supported for SortedCounter container.'
                )

    def add(self, key, /, times: int = 1):
        """
        Add a value to the container (possibly multiple times).

        Parameters
        ----------
        key
            The value to add.
        times : int, optional
            How many copies of ``key`` to add (default: 1).

        Raises
        ------
        ValueError
            If ``times`` is not positive.
        """
        if times <= 0:
            raise ValueError(
                f'Cannot add {times} item {key} to container; # of items to add must be > 0'
            )

        try:
            self._tree[key] += times
        except KeyError:
            self._tree[key] = times
        self._counter += times

    def remove(self, key, /, times: int = 1):
        """
        Remove a value from the container (possibly multiple times).

        Parameters
        ----------
        key
            The value to remove.
        times : int, optional
            How many copies of ``key`` to remove (default: 1).

        Raises
        ------
        ValueError
            If ``times`` is not positive.
        KeyError
            If ``key`` is not present in the container.

        Notes
        -----
        If ``times`` is larger than the current multiplicity of ``key``,
        ``key`` is removed from the container entirely.
        """
        if times <= 0:
            raise ValueError(
                f'Cannot remove item {key} {times} times from container; # of items to remove must be > 0'
            )

        # only as many copies as are actually stored can leave the container;
        # without this clamp the size bookkeeping drifts when ``times``
        # exceeds the multiplicity of ``key``
        removed = min(times, self._tree[key])
        if self._tree[key] - times <= 0:
            del self._tree[key]
        else:
            self._tree[key] -= times
        self._counter -= removed

    def __getitem__(self, key):
        """
        Return the multiplicity of ``key``.

        Raises
        ------
        KeyError
            If ``key`` is not present in the container.
        """
        return self._tree[key]

    def __len__(self):
        """
        Return the total number of stored items, counting multiplicities.
        """
        return self._counter

    def maximum(self):
        """
        Return the maximum value stored in the container.
        """
        return self._tree.maximum()

    def minimum(self):
        """
        Return the minimum value stored in the container.
        """
        return self._tree.minimum()

    def __repr__(self):
        """
        Return a representation of the container.
        """
        return f'{self.__class__.__name__}({repr(dict(self._tree))})'


def create_median_tree(data: Sequence, /) -> tuple[SortedCounter, SortedCounter]:
    """
    Return a tuple of low and high ``SortedCounter``s from input data.

    Parameters
    ----------
    data : Sequence
        The input data as an iterable.

    Returns
    -------
    tuple[SortedCounter, SortedCounter]
        The low and high ``SortedCounter``s, holding the smallest
        ``len(data) // 2`` values and the remaining values, respectively.

    Notes
    -----
    The time complexity of the execution of the function is O(n log n) due to
    the sorting operation done beforehand.
    """
    # make sure the data is sorted
    data = sorted(data)
    half = len(data) // 2
    return SortedCounter(data[:half]), SortedCounter(data[half:])


def shifted_median(
    xlow: SortedCounter,
    xhigh: SortedCounter,
    value_old: float,
    value_new: float,
) -> float:
    """
    Compute the shifted median using two ``SortedCounter``s.

    Parameters
    ----------
    xlow : SortedCounter
        The lower part of the data (below the median).
    xhigh : SortedCounter
        The higher part of the data (above the median).
    value_old : float
        The old value of the point (before perturbation).
    value_new : float
        The new value of the point (after perturbation).

    Returns
    -------
    float
        The new value of the median of the data.

    Notes
    -----
    Modifies ``xlow`` and ``xhigh`` in-place.
    """
    # Degenerate case: every value lives in ``xhigh`` (e.g. a dataset of
    # size 1 is split as 0 + 1 by ``create_median_tree``); the two-sided
    # logic below would crash on ``xlow.maximum()``.
    if not len(xlow):
        xhigh.remove(value_old)
        xhigh.add(value_new)
        return xhigh.minimum()

    low_max = xlow.maximum()
    high_min = xhigh.minimum()

    # the old value is guaranteed to be in either the low or the high part
    if value_old <= low_max:
        xlow.remove(value_old)
    elif value_old >= high_min:
        xhigh.remove(value_old)

    # it doesn't really matter where the new value is inserted since the two
    # parts are rebalanced below; guard against ``xlow`` having just been
    # emptied by the removal above
    if len(xlow) and value_new <= xlow.maximum():
        xlow.add(value_new)
    else:
        xhigh.add(value_new)

    # Rebalance the two SortedCounters if their sizes differ by more than 1
    # NOTE: this operation is O(log n) since we are always doing it only a
    # handful (fixed number) of times

    # remove items from xlow and add them in xhigh
    while len(xlow) > len(xhigh):
        low_max = xlow.maximum()
        xlow.remove(low_max)
        xhigh.add(low_max)

    # remove items from xhigh and add them in xlow
    while len(xhigh) > len(xlow):
        high_min = xhigh.minimum()
        xhigh.remove(high_min)
        xlow.add(high_min)

    # Compute the median based on the sizes of xlow and xhigh
    if len(xlow) == len(xhigh):
        return (xlow.maximum() + xhigh.minimum()) / 2
    if len(xlow) > len(xhigh):
        return xlow.maximum()
    return xhigh.minimum()
def get_values(x: Iterable[Number], y: Iterable[Number]) -> SummaryStatistics:
    """
    Calculate the summary statistics for the given set of points.

    Parameters
    ----------
    x : Iterable[Number]
        The ``x`` value of the dataset.
    y : Iterable[Number]
        The ``y`` value of the dataset.

    Returns
    -------
    SummaryStatistics
        Named tuple consisting of mean, median, and standard deviation of
        both ``x`` and ``y``, along with the Pearson correlation coefficient
        between the two.

    Notes
    -----
    The standard deviations are sample statistics (``ddof=1``).
    """
    return SummaryStatistics(
        np.mean(x),
        np.mean(y),
        np.median(x),
        np.median(y),
        np.std(x, ddof=1),
        np.std(y, ddof=1),
        np.corrcoef(x, y)[0, 1],
    )
lower part of data into higher part x = np.copy(data) - xlow, xhigh = _create_median_avltree(data) + xlow, xhigh = create_median_tree(data) index = size // 2 - 1 x[index] += 5.2 ref = np.median(x) From 53078c52a0f6269c84658347da98e0f05d084c0b Mon Sep 17 00:00:00 2001 From: Goran Jelic-Cizmek Date: Sun, 15 Sep 2024 14:15:43 +0200 Subject: [PATCH 10/13] Small fixup for median --- src/data_morph/data/stats.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/data_morph/data/stats.py b/src/data_morph/data/stats.py index e1003387..5b8065e7 100644 --- a/src/data_morph/data/stats.py +++ b/src/data_morph/data/stats.py @@ -365,14 +365,12 @@ def shifted_median( else: xhigh.add(value_new) - remainder = (len(xlow) + len(xhigh)) % 2 - # Rebalance the two SortedCounters if their sizes differ by more than 1 # NOTE: this operation is O(log n) since we are always doing it only a # handful (fixed number) of times # remove items from xlow and add them in xhigh - while len(xlow) > len(xhigh) + remainder: + while len(xlow) > len(xhigh): low_max = xlow.maximum() xlow.remove(low_max) xhigh.add(low_max) From aef33250ab81b4ec3b6c4084125d76678d7976d4 Mon Sep 17 00:00:00 2001 From: Goran Jelic-Cizmek Date: Sun, 10 Nov 2024 16:51:25 +0100 Subject: [PATCH 11/13] Use custom implementation of `SortedCounter` --- pyproject.toml | 2 +- src/data_morph/data/stats.py | 110 ++--------------------------------- 2 files changed, 6 insertions(+), 106 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 019edf25..09121d6d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,10 +44,10 @@ dynamic = [ ] dependencies = [ - "avltree==1.1.2", # exact version as it's not likely to be deprecated "matplotlib>=3.3", "numpy>=1.20", "pandas>=1.2", + "sortedcounter", "tqdm>=4.64.1", ] optional-dependencies.dev = [ diff --git a/src/data_morph/data/stats.py b/src/data_morph/data/stats.py index 5b8065e7..c1ec44ac 100644 --- a/src/data_morph/data/stats.py +++ 
b/src/data_morph/data/stats.py @@ -1,12 +1,12 @@ """Utility functions for calculating summary statistics.""" from collections import namedtuple -from collections.abc import Mapping, Sequence +from collections.abc import Sequence from numbers import Number -from typing import Iterable, Optional +from typing import Iterable import numpy as np -from avltree import AvlTree +from sortedcounter import SortedCounter SummaryStatistics = namedtuple( 'SummaryStatistics', @@ -17,114 +17,14 @@ ) -class SortedCounter: - """ - A version of ``collections.Counter`` that is ordered. - - Notes - ----- - The time complexity of lookup, insertion, and deletion is O(log n). - """ - - def __init__(self, iterable: Optional[Iterable] = None, /): - """ - Create a ``SortedCounter`` from an iterable or a mapping. - """ - self._tree = AvlTree() - # internal counter for how many elements are there (= size of - # container) - self._counter = 0 - - if iterable is not None: - if isinstance(iterable, Iterable): - for item in iterable: - self.add(item) - elif isinstance(iterable, Mapping): - for key, value in iterable.items(): - self.add(key, times=value) - else: - raise TypeError( - f'Type {type(iterable)} not supported for SortedCounter container.' - ) - - self._counter = sum(self._tree[key] for key in self._tree) - - def add(self, key, /, times: int = 1): - """ - Add a value to the container (possibly multiple times). - - Raises - ------ - ValueError - If ``times`` is < 0. - """ - if times <= 0: - raise ValueError( - f'Cannot add {times} item {key} to container; # of items to add must be > 0' - ) - - try: - self._tree[key] += times - except KeyError: - self._tree[key] = times - self._counter += times - - def remove(self, key, /, times: int = 1): - """ - Remove a value from the container (possibly multiple times). - - Raises - ------ - ValueError - If ``times`` is < 0. - - Notes - ----- - If ``key`` is not present in the container, does nothing. 
- If ``times`` is larger than the current number of items in the - container, ``key`` is removed from the container. - """ - if times <= 0: - raise ValueError( - f'Cannot remove item {key} {times} times from container; # of items to remove must be > 0' - ) - - if self._tree[key] - times <= 0: - del self._tree[key] - else: - self._tree[key] -= times - self._counter -= times - - def __getitem__(self, key): - return self._tree[key] - - def __len__(self): - return self._counter - - def maximum(self): - """ - Return the maximum value stored in the container. - """ - return self._tree.maximum() - - def minimum(self): - """ - Return the minimum value stored in the container. - """ - return self._tree.minimum() - - def __repr__(self): - return f'{self.__class__.__name__}({repr(dict(self._tree))})' - - def create_median_tree(data: Sequence, /) -> tuple[SortedCounter, SortedCounter]: """ Return a tuple of low and high ``SortedCounter``s from input data. Parameters ---------- - data - The input data as an iterable + data : Sequence + The input data as an iterable. 
Returns ------- From 186763d27ed880730a23f2362026e36b6197d715 Mon Sep 17 00:00:00 2001 From: JCGoran Date: Mon, 11 Nov 2024 18:05:21 +0100 Subject: [PATCH 12/13] Update package name --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0ec46788..dbb5550b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ dependencies = [ "matplotlib>=3.3", "numpy>=1.20", "pandas>=1.2", - "sortedcounter", + "sortedcountercpp", "tqdm>=4.64.1", ] optional-dependencies.dev = [ From 656c96deb776f45b6cd75aafd98d442fd015fd80 Mon Sep 17 00:00:00 2001 From: Goran Jelic-Cizmek Date: Mon, 11 Nov 2024 18:24:37 +0100 Subject: [PATCH 13/13] Please the lint overlord --- src/data_morph/data/stats.py | 9 ++++----- src/data_morph/morpher.py | 9 +++++---- tests/data/test_stats.py | 1 - 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/data_morph/data/stats.py b/src/data_morph/data/stats.py index c1ec44ac..38e44758 100644 --- a/src/data_morph/data/stats.py +++ b/src/data_morph/data/stats.py @@ -1,9 +1,8 @@ """Utility functions for calculating summary statistics.""" from collections import namedtuple -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from numbers import Number -from typing import Iterable import numpy as np from sortedcounter import SortedCounter @@ -111,7 +110,7 @@ def shifted_var( ) -def shifted_stdev(*args, **kwargs): +def shifted_stdev(*args: float, **kwargs: int) -> float: """ Compute the shifted standard deviation by perturbing one point. @@ -302,7 +301,7 @@ class Statistics: The ``y`` value of the data as an iterable. 
""" - def __init__(self, x: Iterable[Number], y: Iterable[Number]): + def __init__(self, x: Iterable[Number], y: Iterable[Number]) -> None: if len(x) != len(y): raise ValueError('The two datasets should have the same size') @@ -382,7 +381,7 @@ def corrcoef(self) -> float: """ return self._corrcoef - def __len__(self): + def __len__(self) -> int: """ Return the size of the dataset. diff --git a/src/data_morph/morpher.py b/src/data_morph/morpher.py index f83747cb..48c019fb 100644 --- a/src/data_morph/morpher.py +++ b/src/data_morph/morpher.py @@ -2,6 +2,7 @@ from __future__ import annotations +from collections.abc import MutableSequence from functools import partial from numbers import Number from pathlib import Path @@ -251,11 +252,11 @@ def _is_close_enough( Parameters ---------- - item1: SummaryStatistics - the first summary statistic + item1 : SummaryStatistics + The first summary statistic. - item2: SummaryStatistics - the second summary statistic + item2 : SummaryStatistics + The second summary statistic. Returns ------- diff --git a/tests/data/test_stats.py b/tests/data/test_stats.py index b2143737..bbad1c56 100644 --- a/tests/data/test_stats.py +++ b/tests/data/test_stats.py @@ -196,7 +196,6 @@ def test_shifted_median(data): data = np.sort(np.array(data, dtype=float)) size = len(data) xlow, xhigh = create_median_tree(data) - # assert_equal(data, np.concatenate([list(xlow), list(xhigh)])) # make sure it works if we don't do anything ref = np.median(data) actual = shifted_median(xlow, xhigh, data[0], data[0])