Skip to content

Commit

Permalink
feat: add generalized metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
younesStrittmatter committed Sep 8, 2024
1 parent 0ed1e1f commit 44f1b33
Show file tree
Hide file tree
Showing 2 changed files with 144 additions and 11 deletions.
124 changes: 124 additions & 0 deletions src/autora/utils/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import numpy as np


def norms(arr: np.ndarray) -> np.ndarray:
    """
    Compute one Euclidean norm per entry along the first axis of ``arr``.

    Every entry ``arr[i]`` is flattened with ``np.ravel`` before its norm is taken,
    so entries may be scalars, vectors or arbitrarily nested arrays.

    Args:
        arr: the array whose first axis enumerates the entries

    Returns:
        a one-dimensional array holding the norm of each entry

    Examples:
        >>> import pandas as pd
        >>> from autora.utils.transform import to_array

        Simple dataframe with one condition

        >>> df = pd.DataFrame({'x_0': [.2, 2, 3]})

        First transform:

        >>> as_array = to_array(df)
        >>> norms(as_array)
        array([0.2, 2. , 3. ])

        >>> df_two_dim = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 0, 4]})
        >>> as_array = to_array(df_two_dim)
        >>> norms(as_array)
        array([1., 1., 5.])

        For nested dataframes

        >>> df_nested = pd.DataFrame({
        ...     'x_0': [[0, 0], [0, 1], [1, 0], [3, 4]]
        ... })
        >>> as_array = to_array(df_nested)
        >>> norms(as_array)
        array([0., 1., 1., 5.])

        The same holds for deeply nested entries

        >>> df_nested_deep = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 1]], [[3, 0], [0, 4]]]
        ... })
        >>> as_array = to_array(df_nested_deep)
        >>> norms(as_array)
        array([1., 5.])

        No matter how many columns there are

        >>> df_nested_deep_multi_column = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
        ... })
        >>> as_array = to_array(df_nested_deep_multi_column)
        >>> norms(as_array)
        array([5., 1.])
    """
    entry_norms = []
    for entry in arr:
        # Flatten first so that nested entries are treated as one long vector
        flattened = np.ravel(entry)
        entry_norms.append(np.linalg.norm(flattened))
    return np.array(entry_norms)


def distances(arr_1: np.ndarray, arr_2: np.ndarray) -> np.ndarray:
    """
    Calculate the Euclidean distance between corresponding entries of two arrays.

    Entries are paired along the first axis. Each pair is flattened with
    ``np.ravel`` before the distance is computed, so entries may be scalars,
    vectors or arbitrarily nested arrays.

    Args:
        arr_1: the first array; its first axis enumerates the entries
        arr_2: the second array; must have the same shape as ``arr_1``

    Returns:
        a one-dimensional array holding the distance of each pair of entries

    Raises:
        AssertionError: if the two arrays do not have the same shape

    Examples:
        >>> import pandas as pd
        >>> from autora.utils.transform import to_array

        Simple dataframe with one condition

        >>> df_1 = pd.DataFrame({'x_0': [0, 1, 2]})
        >>> df_2 = pd.DataFrame({'x_0': [1, 2, 3]})

        First transform:

        >>> as_array_1 = to_array(df_1)
        >>> as_array_2 = to_array(df_2)
        >>> distances(as_array_1, as_array_2)
        array([1., 1., 1.])

        >>> df_two_dim_1 = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 0, 4]})
        >>> df_two_dim_2 = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 1, 4]})
        >>> as_array_1 = to_array(df_two_dim_1)
        >>> as_array_2 = to_array(df_two_dim_2)
        >>> distances(as_array_1, as_array_2)
        array([0., 1., 0.])

        For nested dataframes

        >>> df_nested_1 = pd.DataFrame({
        ...     'x_0': [[0, 0], [0, 2], [0, 2], [0, 10], [4, 0]]
        ... })
        >>> df_nested_2 = pd.DataFrame({
        ...     'x_0': [[1, 0], [0, 0], [0, 5], [0, 6], [0, 3]]
        ... })
        >>> as_array_1 = to_array(df_nested_1)
        >>> as_array_2 = to_array(df_nested_2)
        >>> distances(as_array_1, as_array_2)
        array([1., 2., 3., 4., 5.])

        The same holds for deeply nested entries

        >>> df_nested_deep_1 = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 1]], [[6, 0], [0, 10]]]
        ... })
        >>> df_nested_deep_2 = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 1]], [[3, 0], [0, 6]]]
        ... })
        >>> as_array_1 = to_array(df_nested_deep_1)
        >>> as_array_2 = to_array(df_nested_deep_2)
        >>> distances(as_array_1, as_array_2)
        array([0., 5.])

        No matter how many columns there are

        >>> df_nested_deep_multi_column_1 = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
        ... })
        >>> df_nested_deep_multi_column_2 = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
        ... })
        >>> as_array_1 = to_array(df_nested_deep_multi_column_1)
        >>> as_array_2 = to_array(df_nested_deep_multi_column_2)
        >>> distances(as_array_1, as_array_2)
        array([0., 0.])
    """
    # Raise explicitly instead of using an ``assert`` statement: asserts are
    # removed when Python runs with -O, and ``zip`` below would then silently
    # truncate to the shorter array. The exception type and message are kept
    # so existing callers see no difference.
    if arr_1.shape != arr_2.shape:
        raise AssertionError("Arrays must have the same shape")

    # Sum of squared differences for each pair of (flattened) entries
    squared_distances = [
        np.sum((np.ravel(a) - np.ravel(b)) ** 2) for a, b in zip(arr_1, arr_2)
    ]
    return np.sqrt(np.array(squared_distances))
31 changes: 20 additions & 11 deletions src/autora/utils/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
import pandas as pd


def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
def to_array(arr: Union[pd.DataFrame, pd.Series, np.ndarray]) -> np.ndarray:
"""
Transforms a pandas data frame to a numpy array
Args:
df: the pandas data frame
arr: the pandas data frame
Returns:
a numpy array
Expand All @@ -19,7 +19,7 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
... 'x_0': [1, 2, 3],
... 'x_1': [4, 5, 6],
... 'x_2': [7, 8, 9]})
>>> np.array_equal(np.array(df_one), df_to_array(df_one))
>>> np.array_equal(np.array(df_one), to_array(df_one))
True
If the rows contain lists ...
Expand All @@ -28,7 +28,7 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
... 'x_1': [[0, 1], [1, 1], [2, 1]],
... 'x_2': [[0, 2], [1, 2], [2, 2]]
... })
>>> array_transformed = df_to_array(df_list)
>>> array_transformed = to_array(df_list)
>>> array_cast = np.array(df_list)
the results are not equal:
Expand Down Expand Up @@ -61,7 +61,7 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
... 'x_1': [np.array([0, 1]), np.array([1, 1]), np.array([2, 1])],
... 'x_2': [np.array([0, 2]), np.array([1, 2]), np.array([2, 2])]
... })
>>> array_transformed = df_to_array(df_array)
>>> array_transformed = to_array(df_array)
>>> array_cast = np.array(df_list)
the results are not equal:
Expand Down Expand Up @@ -93,7 +93,7 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
... 'x_0': [[[0,0],[1,1]], [[0,0],[2,2]]],
... 'x_1': [[[1,1],[1,1]], [[1,1],[2,2]]]
... })
>>> df_to_array(df_nested)
>>> to_array(df_nested)
array([[[[0, 0],
[1, 1]],
<BLANKLINE>
Expand All @@ -111,12 +111,14 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
a flattening version of this (ATTENTION: when using the flattening version,
information about which entry belongs to which condition is lost):
"""
if isinstance(arr, np.ndarray):
return arr

_lst = [list(row) for _, row in df.iterrows()]
_lst = [list(row) for _, row in arr.iterrows()]
return np.array(_lst)


def df_to_array_flatten(df: Union[pd.DataFrame, pd.Series]) -> np.array:
def to_array_flatten(arr: Union[pd.DataFrame, pd.Series, np.ndarray]) -> np.ndarray:
"""
Flattens elements in a pandas DataFrame to resolve shape inconsistencies.
Expand All @@ -131,11 +133,18 @@ def df_to_array_flatten(df: Union[pd.DataFrame, pd.Series]) -> np.array:
... 'x_0': [0, 2, 4],
... 'x_1': [[1, 1], [3, 3], [5, 5]]
... })
>>> df_to_array_flatten(df_inconsistent)
>>> to_array_flatten(df_inconsistent)
array([[0, 1, 1],
[2, 3, 3],
[4, 5, 5]])
"""
if isinstance(arr, np.ndarray):
return arr
return np.array(
[np.concatenate([np.ravel(x) if isinstance(x, (list, np.ndarray)) else [x] for x in row])
for _, row in df.iterrows()])
[
np.concatenate(
[np.ravel(x) if isinstance(x, (list, np.ndarray)) else [x] for x in row]
)
for _, row in arr.iterrows()
]
)

0 comments on commit 44f1b33

Please sign in to comment.