feat: add generalized metrics

AutoResearch · Sep 8, 2024 · 44f1b33 · 44f1b33
1 parent 0ed1e1f
commit 44f1b33
Show file tree

Hide file tree

Showing 2 changed files with 144 additions and 11 deletions.
diff --git a/src/autora/utils/metrics.py b/src/autora/utils/metrics.py
@@ -0,0 +1,124 @@
+import numpy as np
+
+
+def norms(arr: np.ndarray) -> np.ndarray:
+    """
+    Compute one Euclidean norm per entry along the first axis of ``arr``.
+
+    Each entry is flattened first, so its nesting depth does not matter.
+
+    Examples:
+        >>> import pandas as pd
+        >>> from autora.utils.transform import to_array
+
+        A single column of scalars:
+        >>> df = pd.DataFrame({'x_0': [.2, 2, 3]})
+        >>> norms(to_array(df))
+        array([0.2, 2. , 3. ])
+
+        Two columns of scalars:
+        >>> df_two_dim = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 0, 4]})
+        >>> norms(to_array(df_two_dim))
+        array([1., 1., 5.])
+
+        Nested entries:
+        >>> df_nested = pd.DataFrame({
+        ...     'x_0': [[0, 0], [0, 1], [1, 0], [3, 4]]
+        ... })
+        >>> norms(to_array(df_nested))
+        array([0., 1., 1., 5.])
+
+        Deeply nested entries:
+        >>> df_nested_deep = pd.DataFrame({
+        ...     'x_0': [[[0, 0], [0, 1]], [[3, 0], [0, 4]]]
+        ... })
+        >>> norms(to_array(df_nested_deep))
+        array([1., 5.])
+
+        Deeply nested entries spread over several columns:
+        >>> df_nested_deep_multi_column = pd.DataFrame({
+        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
+        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
+        ... })
+        >>> norms(to_array(df_nested_deep_multi_column))
+        array([5., 1.])
+    """
+    per_entry = []
+    for entry in arr:
+        per_entry.append(np.linalg.norm(np.ravel(entry)))
+    return np.array(per_entry)
+
+
+def distances(arr_1: np.ndarray, arr_2: np.ndarray) -> np.ndarray:
+    """
+    Calculate the Euclidean distance between matching entries of two arrays along the
+    first axis, no matter how deeply the entries are nested.
+
+    Args:
+        arr_1: the first array
+        arr_2: the second array, which must have the same shape as arr_1
+
+    Returns:
+        a one-dimensional array holding one distance per entry along the first axis
+
+    Raises:
+        AssertionError: if the two arrays do not have the same shape
+
+    Examples:
+        >>> import pandas as pd
+        >>> from autora.utils.transform import to_array
+
+        Simple dataframes with one condition
+        >>> df_1 = pd.DataFrame({'x_0': [0, 1, 2]})
+        >>> df_2 = pd.DataFrame({'x_0': [1, 2, 3]})
+        >>> distances(to_array(df_1), to_array(df_2))
+        array([1., 1., 1.])
+
+        >>> df_two_dim_1 = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 0, 4]})
+        >>> df_two_dim_2 = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 1, 4]})
+        >>> distances(to_array(df_two_dim_1), to_array(df_two_dim_2))
+        array([0., 1., 0.])
+
+        For nested dataframes
+        >>> df_nested_1 = pd.DataFrame({
+        ...     'x_0': [[0, 0], [0, 2], [0, 2], [0, 10], [4, 0]]
+        ... })
+        >>> df_nested_2 = pd.DataFrame({
+        ...     'x_0': [[1, 0], [0, 0], [0, 5], [0, 6], [0, 3]]
+        ... })
+        >>> distances(to_array(df_nested_1), to_array(df_nested_2))
+        array([1., 2., 3., 4., 5.])
+
+        ... and deeply nested
+        >>> df_nested_deep_1 = pd.DataFrame({
+        ...     'x_0': [[[0, 0], [0, 1]], [[6, 0], [0, 10]]]
+        ... })
+        >>> df_nested_deep_2 = pd.DataFrame({
+        ...     'x_0': [[[0, 0], [0, 1]], [[3, 0], [0, 6]]]
+        ... })
+        >>> distances(to_array(df_nested_deep_1), to_array(df_nested_deep_2))
+        array([0., 5.])
+
+        ... no matter how many columns
+        >>> df_nested_deep_multi_column_1 = pd.DataFrame({
+        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
+        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
+        ... })
+        >>> df_nested_deep_multi_column_2 = pd.DataFrame({
+        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
+        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
+        ... })
+        >>> distances(
+        ...     to_array(df_nested_deep_multi_column_1),
+        ...     to_array(df_nested_deep_multi_column_2),
+        ... )
+        array([0., 0.])
+    """
+    # Raise explicitly instead of using an `assert` statement, which is skipped when
+    # Python runs with -O; the exception type and message stay the same
+    if arr_1.shape != arr_2.shape:
+        raise AssertionError("Arrays must have the same shape")
+
+    # Sum the squared differences per flattened entry, then take the square root
+    squared = [np.sum((np.ravel(a) - np.ravel(b)) ** 2) for a, b in zip(arr_1, arr_2)]
+    return np.sqrt(np.array(squared))
diff --git a/src/autora/utils/transform.py b/src/autora/utils/transform.py
@@ -4,11 +4,11 @@
 import pandas as pd
 
 
-def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
+def to_array(arr: Union[pd.DataFrame, pd.Series, np.ndarray]) -> np.ndarray:
     """
     Transforms a pandas data frame to a numpy array
     Args:
-        df: the pandas data frame
+        arr: the pandas data frame or series; a numpy array is returned unchanged
 
     Returns:
         a numpy array
@@ -19,7 +19,7 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
         ...     'x_0': [1, 2, 3],
         ...     'x_1': [4, 5, 6],
         ...     'x_2': [7, 8, 9]})
-        >>> np.array_equal(np.array(df_one), df_to_array(df_one))
+        >>> np.array_equal(np.array(df_one), to_array(df_one))
         True
 
         If the rows contain lists ...
@@ -28,7 +28,7 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
         ...     'x_1': [[0, 1], [1, 1], [2, 1]],
         ...     'x_2': [[0, 2], [1, 2], [2, 2]]
         ... })
-        >>> array_transformed = df_to_array(df_list)
+        >>> array_transformed = to_array(df_list)
         >>> array_cast = np.array(df_list)
 
         the results are not equal:
@@ -61,7 +61,7 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
         ...     'x_1': [np.array([0, 1]), np.array([1, 1]), np.array([2, 1])],
         ...     'x_2': [np.array([0, 2]), np.array([1, 2]), np.array([2, 2])]
         ... })
-        >>> array_transformed = df_to_array(df_array)
+        >>> array_transformed = to_array(df_array)
         >>> array_cast = np.array(df_list)
 
         the results are not equal:
@@ -93,7 +93,7 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
         ...     'x_0': [[[0,0],[1,1]], [[0,0],[2,2]]],
         ...     'x_1': [[[1,1],[1,1]], [[1,1],[2,2]]]
         ... })
-        >>> df_to_array(df_nested)
+        >>> to_array(df_nested)
         array([[[[0, 0],
                  [1, 1]],
         <BLANKLINE>
@@ -111,12 +111,14 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
         a flattening version of this (ATTENTION: when using the flattening version,
         information about which entry belongs to which condition is lost):
     """
+    if isinstance(arr, np.ndarray):
+        return arr
 
-    _lst = [list(row) for _, row in df.iterrows()]
+    _lst = [list(row) for _, row in arr.iterrows()]
     return np.array(_lst)
 
 
-def df_to_array_flatten(df: Union[pd.DataFrame, pd.Series]) -> np.array:
+def to_array_flatten(arr: Union[pd.DataFrame, pd.Series, np.ndarray]) -> np.ndarray:
     """
     Flattens elements in a pandas DataFrame to resolve shape inconsistencies.
 
@@ -131,11 +133,18 @@ def df_to_array_flatten(df: Union[pd.DataFrame, pd.Series]) -> np.array:
         ...     'x_0': [0, 2, 4],
         ...     'x_1': [[1, 1], [3, 3], [5, 5]]
         ... })
-        >>> df_to_array_flatten(df_inconsistent)
+        >>> to_array_flatten(df_inconsistent)
         array([[0, 1, 1],
                [2, 3, 3],
                [4, 5, 5]])
     """
+    if isinstance(arr, np.ndarray):
+        return arr
     return np.array(
-        [np.concatenate([np.ravel(x) if isinstance(x, (list, np.ndarray)) else [x] for x in row])
-         for _, row in df.iterrows()])
+        [
+            np.concatenate(
+                [np.ravel(x) if isinstance(x, (list, np.ndarray)) else [x] for x in row]
+            )
+            for _, row in arr.iterrows()
+        ]
+    )