Skip to content

Commit

Permalink
Replace some pandas operations with numpy
Browse files Browse the repository at this point in the history
  • Loading branch information
JCGoran committed Jul 22, 2024
1 parent 4d5319d commit e708f6c
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 37 deletions.
3 changes: 3 additions & 0 deletions src/data_morph/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ def __init__(
self.df: pd.DataFrame = self._validate_data(df).pipe(self._scale_data, scale)
"""pandas.DataFrame: DataFrame containing columns x and y."""

self._x = self.df['x'].to_numpy()
self._y = self.df['y'].to_numpy()

self.name: str = name
"""str: The name to use for the dataset."""

Expand Down
23 changes: 14 additions & 9 deletions src/data_morph/data/stats.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
"""Utility functions for calculating summary statistics."""

from collections import namedtuple
from numbers import Number
from typing import Iterable

import pandas as pd
import numpy as np

SummaryStatistics = namedtuple(
'SummaryStatistics', ['x_mean', 'y_mean', 'x_stdev', 'y_stdev', 'correlation']
Expand All @@ -12,14 +14,17 @@
)


def get_values(df: pd.DataFrame) -> SummaryStatistics:
def get_values(x: Iterable[Number], y: Iterable[Number]) -> SummaryStatistics:
"""
Calculate the summary statistics for the given set of points.
Parameters
----------
df : pandas.DataFrame
A dataset with columns x and y.
x : Iterable[Number]
The ``x`` value of the dataset.
y : Iterable[Number]
The ``y`` value of the dataset.
Returns
-------
Expand All @@ -28,9 +33,9 @@ def get_values(df: pd.DataFrame) -> SummaryStatistics:
along with the Pearson correlation coefficient between the two.
"""
return SummaryStatistics(
df.x.mean(),
df.y.mean(),
df.x.std(),
df.y.std(),
df.corr().x.y,
np.mean(x),
np.mean(y),
np.std(x, ddof=1),
np.std(y, ddof=1),
np.corrcoef(x, y)[0, 1],
)
66 changes: 42 additions & 24 deletions src/data_morph/morpher.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from functools import partial
from numbers import Number
from pathlib import Path
from typing import Optional, Union
from typing import Iterable, MutableSequence, Optional, Union

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -239,16 +239,26 @@ def _record_frames(
frame_number += 1
return frame_number

def _is_close_enough(self, df1: pd.DataFrame, df2: pd.DataFrame) -> bool:
def _is_close_enough(
self,
x1: Iterable[Number],
y1: Iterable[Number],
x2: Iterable[Number],
y2: Iterable[Number],
) -> bool:
"""
Check whether the statistics are within the acceptable bounds.
Parameters
----------
df1 : pandas.DataFrame
The original DataFrame.
df2 : pandas.DataFrame
The DataFrame after the latest perturbation.
x1 : Iterable[Number]
The original value of ``x``.
y1 : Iterable[Number]
The original value of ``y``.
x2 : Iterable[Number]
The perturbed value of ``x``.
y2 : Iterable[Number]
The perturbed value of ``y``.
Returns
-------
Expand All @@ -258,32 +268,33 @@ def _is_close_enough(self, df1: pd.DataFrame, df2: pd.DataFrame) -> bool:
return np.all(
np.abs(
np.subtract(
*(
np.floor(np.array(get_values(data)) * 10**self.decimals)
for data in [df1, df2]
)
np.floor(np.array(get_values(x1, y1)) * 10**self.decimals),
np.floor(np.array(get_values(x2, y2)) * 10**self.decimals),
)
)
== 0
)

def _perturb(
self,
df: pd.DataFrame,
x: MutableSequence[Number],
y: MutableSequence[Number],
target_shape: Shape,
*,
shake: Number,
allowed_dist: Number,
temp: Number,
bounds: BoundingBox,
) -> pd.DataFrame:
) -> tuple[MutableSequence[Number], MutableSequence[Number]]:
"""
Perform one round of perturbation.
Parameters
----------
df : pandas.DataFrame
The data to perturb.
x : MutableSequence[Number]
The ``x`` part of the dataset.
y : MutableSequence[Number]
The ``y`` part of the dataset.
target_shape : Shape
The shape to morph the data into.
shake : numbers.Number
Expand All @@ -300,12 +311,12 @@ def _perturb(
Returns
-------
pandas.DataFrame
tuple[MutableSequence[Number], MutableSequence[Number]]
The input dataset with one point perturbed.
"""
row = self._rng.integers(0, len(df))
initial_x = df.at[row, 'x']
initial_y = df.at[row, 'y']
row = self._rng.integers(0, len(x))
initial_x = x[row]
initial_y = y[row]

# this is the simulated annealing step, if "do_bad", then we are willing to
# accept a new state which is worse than the current one
Expand All @@ -324,10 +335,10 @@ def _perturb(
within_bounds = [new_x, new_y] in bounds
done = close_enough and within_bounds

df.loc[row, 'x'] = new_x
df.loc[row, 'y'] = new_y
x[row] = new_x
y[row] = new_y

return df
return x, y

def morph(
self,
Expand Down Expand Up @@ -468,20 +479,27 @@ def _tweening(frame, *, min_value, max_value): # numpydoc ignore=PR01,RT01
max_value=max_shake,
)

x, y = (
start_shape.df['x'].to_numpy(copy=True),
start_shape.df['y'].to_numpy(copy=True),
)

for i in self._looper(
iterations, leave=True, ascii=True, desc=f'{target_shape} pattern'
):
perturbed_data = self._perturb(
morphed_data.copy(),
np.copy(x),
np.copy(y),
target_shape=target_shape,
shake=get_current_shake(i),
allowed_dist=allowed_dist,
temp=get_current_temp(i),
bounds=start_shape.morph_bounds,
)

if self._is_close_enough(start_shape.df, perturbed_data):
morphed_data = perturbed_data
if self._is_close_enough(x, y, *perturbed_data):
x, y = perturbed_data
morphed_data = pd.DataFrame({'x': x, 'y': y})

frame_number = record_frames(
data=morphed_data,
Expand Down
2 changes: 1 addition & 1 deletion src/data_morph/plotting/static.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def plot(
ax.xaxis.set_major_formatter(tick_formatter)
ax.yaxis.set_major_formatter(tick_formatter)

res = get_values(df)
res = get_values(df['x'].to_numpy(), df['y'].to_numpy())

labels = ('X Mean', 'Y Mean', 'X SD', 'Y SD', 'Corr.')
locs = np.linspace(0.8, 0.2, num=len(labels))
Expand Down
6 changes: 4 additions & 2 deletions tests/data/test_stats.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Test the stats module."""

import numpy as np

from data_morph.data.loader import DataLoader
from data_morph.data.stats import get_values

Expand All @@ -9,10 +11,10 @@ def test_stats():

data = DataLoader.load_dataset('dino').df

stats = get_values(data)
stats = get_values(data['x'], data['y'])

assert stats.x_mean == data.x.mean()
assert stats.y_mean == data.y.mean()
assert stats.x_stdev == data.x.std()
assert stats.y_stdev == data.y.std()
assert stats.correlation == data.corr().x.y
# np.corrcoef and DataFrame.corr can differ in the last few bits, so compare
# with a tolerance -- and actually assert on the result; a bare np.allclose()
# call discards its boolean and the check could never fail.
assert np.allclose(stats.correlation, data.corr().x.y)
4 changes: 3 additions & 1 deletion tests/test_morpher.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,9 @@ def test_no_writing(self, capsys):

with pytest.raises(AssertionError):
assert_frame_equal(morphed_data, dataset.df)
assert morpher._is_close_enough(dataset.df, morphed_data)
assert morpher._is_close_enough(
dataset.df['x'], dataset.df['y'], morphed_data['x'], morphed_data['y']
)

_, err = capsys.readouterr()
assert f'{target_shape} pattern: 100%' in err
Expand Down

0 comments on commit e708f6c

Please sign in to comment.