From e708f6c55a1e25a3f6a75b0c6913ae0a3dfd5ab7 Mon Sep 17 00:00:00 2001 From: Goran Jelic-Cizmek Date: Mon, 22 Jul 2024 13:16:57 +0200 Subject: [PATCH] Replace some pandas operations with numpy --- src/data_morph/data/dataset.py | 3 ++ src/data_morph/data/stats.py | 23 ++++++----- src/data_morph/morpher.py | 66 ++++++++++++++++++++----------- src/data_morph/plotting/static.py | 2 +- tests/data/test_stats.py | 6 ++- tests/test_morpher.py | 4 +- 6 files changed, 67 insertions(+), 37 deletions(-) diff --git a/src/data_morph/data/dataset.py b/src/data_morph/data/dataset.py index b3a2c705..cd9ad1bb 100644 --- a/src/data_morph/data/dataset.py +++ b/src/data_morph/data/dataset.py @@ -50,6 +50,9 @@ def __init__( self.df: pd.DataFrame = self._validate_data(df).pipe(self._scale_data, scale) """pandas.DataFrame: DataFrame containing columns x and y.""" + self._x = self.df['x'].to_numpy() + self._y = self.df['y'].to_numpy() + self.name: str = name """str: The name to use for the dataset.""" diff --git a/src/data_morph/data/stats.py b/src/data_morph/data/stats.py index d3c52669..a8a3070c 100644 --- a/src/data_morph/data/stats.py +++ b/src/data_morph/data/stats.py @@ -1,8 +1,10 @@ """Utility functions for calculating summary statistics.""" from collections import namedtuple +from numbers import Number +from typing import Iterable -import pandas as pd +import numpy as np SummaryStatistics = namedtuple( 'SummaryStatistics', ['x_mean', 'y_mean', 'x_stdev', 'y_stdev', 'correlation'] @@ -12,14 +14,17 @@ ) -def get_values(df: pd.DataFrame) -> SummaryStatistics: +def get_values(x: Iterable[Number], y: Iterable[Number]) -> SummaryStatistics: """ Calculate the summary statistics for the given set of points. Parameters ---------- - df : pandas.DataFrame - A dataset with columns x and y. + x : Iterable[Number] + The ``x`` value of the dataset. + + y : Iterable[Number] + The ``y`` value of the dataset. 
Returns ------- @@ -28,9 +33,9 @@ def get_values(df: pd.DataFrame) -> SummaryStatistics: along with the Pearson correlation coefficient between the two. """ return SummaryStatistics( - df.x.mean(), - df.y.mean(), - df.x.std(), - df.y.std(), - df.corr().x.y, + np.mean(x), + np.mean(y), + np.std(x, ddof=1), + np.std(y, ddof=1), + np.corrcoef(x, y)[0, 1], ) diff --git a/src/data_morph/morpher.py b/src/data_morph/morpher.py index e0d7da76..c108b400 100644 --- a/src/data_morph/morpher.py +++ b/src/data_morph/morpher.py @@ -3,7 +3,7 @@ from functools import partial from numbers import Number from pathlib import Path -from typing import Optional, Union +from typing import Iterable, MutableSequence, Optional, Union import numpy as np import pandas as pd @@ -239,16 +239,26 @@ def _record_frames( frame_number += 1 return frame_number - def _is_close_enough(self, df1: pd.DataFrame, df2: pd.DataFrame) -> bool: + def _is_close_enough( + self, + x1: Iterable[Number], + y1: Iterable[Number], + x2: Iterable[Number], + y2: Iterable[Number], + ) -> bool: """ Check whether the statistics are within the acceptable bounds. Parameters ---------- - df1 : pandas.DataFrame - The original DataFrame. - df2 : pandas.DataFrame - The DataFrame after the latest perturbation. + x1 : Iterable[Number] + The original value of ``x``. + y1 : Iterable[Number] + The original value of ``y``. + x2 : Iterable[Number] + The perturbed value of ``x``. + y2 : Iterable[Number] + The perturbed value of ``y``. 
Returns ------- @@ -258,10 +268,8 @@ def _is_close_enough(self, df1: pd.DataFrame, df2: pd.DataFrame) -> bool: return np.all( np.abs( np.subtract( - *( - np.floor(np.array(get_values(data)) * 10**self.decimals) - for data in [df1, df2] - ) + np.floor(np.array(get_values(x1, y1)) * 10**self.decimals), + np.floor(np.array(get_values(x2, y2)) * 10**self.decimals), ) ) == 0 @@ -269,21 +277,24 @@ def _is_close_enough(self, df1: pd.DataFrame, df2: pd.DataFrame) -> bool: def _perturb( self, - df: pd.DataFrame, + x: MutableSequence[Number], + y: MutableSequence[Number], target_shape: Shape, *, shake: Number, allowed_dist: Number, temp: Number, bounds: BoundingBox, - ) -> pd.DataFrame: + ) -> tuple[MutableSequence[Number], MutableSequence[Number]]: """ Perform one round of perturbation. Parameters ---------- - df : pandas.DataFrame - The data to perturb. + x : MutableSequence[Number] + The ``x`` part of the dataset. + y : MutableSequence[Number] + The ``y`` part of the dataset. target_shape : Shape The shape to morph the data into. shake : numbers.Number @@ -300,12 +311,12 @@ def _perturb( Returns ------- - pandas.DataFrame + tuple[MutableSequence[Number], MutableSequence[Number]] The input dataset with one point perturbed. 
""" - row = self._rng.integers(0, len(df)) - initial_x = df.at[row, 'x'] - initial_y = df.at[row, 'y'] + row = self._rng.integers(0, len(x)) + initial_x = x[row] + initial_y = y[row] # this is the simulated annealing step, if "do_bad", then we are willing to # accept a new state which is worse than the current one @@ -324,10 +335,10 @@ def _perturb( within_bounds = [new_x, new_y] in bounds done = close_enough and within_bounds - df.loc[row, 'x'] = new_x - df.loc[row, 'y'] = new_y + x[row] = new_x + y[row] = new_y - return df + return x, y def morph( self, @@ -468,11 +479,17 @@ def _tweening(frame, *, min_value, max_value): # numpydoc ignore=PR01,RT01 max_value=max_shake, ) + x, y = ( + start_shape.df['x'].to_numpy(copy=True), + start_shape.df['y'].to_numpy(copy=True), + ) + for i in self._looper( iterations, leave=True, ascii=True, desc=f'{target_shape} pattern' ): perturbed_data = self._perturb( - morphed_data.copy(), + np.copy(x), + np.copy(y), target_shape=target_shape, shake=get_current_shake(i), allowed_dist=allowed_dist, @@ -480,8 +497,9 @@ def _tweening(frame, *, min_value, max_value): # numpydoc ignore=PR01,RT01 bounds=start_shape.morph_bounds, ) - if self._is_close_enough(start_shape.df, perturbed_data): - morphed_data = perturbed_data + if self._is_close_enough(x, y, *perturbed_data): + x, y = perturbed_data + morphed_data = pd.DataFrame({'x': x, 'y': y}) frame_number = record_frames( data=morphed_data, diff --git a/src/data_morph/plotting/static.py b/src/data_morph/plotting/static.py index e91047ec..e88eb5d5 100644 --- a/src/data_morph/plotting/static.py +++ b/src/data_morph/plotting/static.py @@ -58,7 +58,7 @@ def plot( ax.xaxis.set_major_formatter(tick_formatter) ax.yaxis.set_major_formatter(tick_formatter) - res = get_values(df) + res = get_values(df['x'].to_numpy(), df['y'].to_numpy()) labels = ('X Mean', 'Y Mean', 'X SD', 'Y SD', 'Corr.') locs = np.linspace(0.8, 0.2, num=len(labels)) diff --git a/tests/data/test_stats.py b/tests/data/test_stats.py 
index c99134ed..e2d3d901 100644 --- a/tests/data/test_stats.py +++ b/tests/data/test_stats.py @@ -1,5 +1,7 @@ """Test the stats module.""" +import numpy as np + from data_morph.data.loader import DataLoader from data_morph.data.stats import get_values @@ -9,10 +11,10 @@ def test_stats(): data = DataLoader.load_dataset('dino').df - stats = get_values(data) + stats = get_values(data['x'], data['y']) assert stats.x_mean == data.x.mean() assert stats.y_mean == data.y.mean() assert stats.x_stdev == data.x.std() assert stats.y_stdev == data.y.std() - assert stats.correlation == data.corr().x.y + assert np.allclose(stats.correlation, data.corr().x.y) diff --git a/tests/test_morpher.py b/tests/test_morpher.py index 471f9a1f..e5f3142a 100644 --- a/tests/test_morpher.py +++ b/tests/test_morpher.py @@ -172,7 +172,9 @@ def test_no_writing(self, capsys): with pytest.raises(AssertionError): assert_frame_equal(morphed_data, dataset.df) - assert morpher._is_close_enough(dataset.df, morphed_data) + assert morpher._is_close_enough( + dataset.df['x'], dataset.df['y'], morphed_data['x'], morphed_data['y'] + ) _, err = capsys.readouterr() assert f'{target_shape} pattern: 100%' in err